bench: add scale benchmark suite (94 tests)
Benchmark mempalace at configurable scale (1K–100K drawers) to find real-world performance limits. Tests cover MCP tool OOM thresholds, ChromaDB query degradation, search recall@k, mining throughput, knowledge graph concurrency, memory leak detection, palace boost quantification, and Layer1 unbounded fetch behavior.

- tests/benchmarks/ with 8 test modules + data generator + report system
- Deterministic data factory with planted needles for recall measurement
- JSON report output with regression detection (--bench-report flag)
- CI benchmark job on PRs at small scale
- psutil added as dev dependency for RSS tracking
This commit is contained in:
@@ -0,0 +1,395 @@
|
||||
"""
|
||||
Deterministic data factory for MemPalace scale benchmarks.

Generates realistic project files, conversations, and KG triples at
configurable scale levels. All content randomness comes from a seeded RNG, so
the same seed and scale reproduce the same data (the `filed_at` metadata
timestamps written during direct palace population use the wall clock and are
the one exception).

Planted "needle" drawers enable recall measurement without an LLM judge.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import random
|
||||
import string
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import chromadb
|
||||
import yaml
|
||||
|
||||
|
||||
# ── Scale configurations ─────────────────────────────────────────────────
#
# One entry per benchmark scale. PalaceDataGenerator reads these keys:
#   drawers        - default drawer count for populate_palace_directly()
#   wings          - how many leading WING_NAMES entries are used
#   rooms_per_wing - how many ROOM_NAMES entries are sampled for each wing
#   kg_entities    - default entity count for generate_kg_triples()
#   kg_triples     - default triple count for generate_kg_triples()
#   needles        - number of planted needle drawers
#   search_queries - default query count for generate_search_queries()

SCALE_CONFIGS = {
    "small": {
        "drawers": 1_000, "wings": 3, "rooms_per_wing": 5,
        "kg_entities": 50, "kg_triples": 200,
        "needles": 20, "search_queries": 20,
    },
    "medium": {
        "drawers": 10_000, "wings": 8, "rooms_per_wing": 12,
        "kg_entities": 200, "kg_triples": 2_000,
        "needles": 50, "search_queries": 50,
    },
    "large": {
        "drawers": 50_000, "wings": 15, "rooms_per_wing": 20,
        "kg_entities": 500, "kg_triples": 10_000,
        "needles": 100, "search_queries": 100,
    },
    "stress": {
        "drawers": 100_000, "wings": 25, "rooms_per_wing": 30,
        "kg_entities": 1_000, "kg_triples": 50_000,
        "needles": 200, "search_queries": 200,
    },
}
|
||||
|
||||
# ── Vocabulary banks for realistic content ───────────────────────────────
|
||||
|
||||
# Wing (project) names. PalaceDataGenerator uses the first cfg["wings"] entries,
# so the order is significant; the 25 entries cover the largest ("stress") scale.
WING_NAMES = [
    "webapp", "backend_api", "mobile_app", "data_pipeline", "ml_platform",
    "devops", "auth_service", "payments", "analytics", "docs_site",
    "cli_tool", "dashboard", "notification_service", "search_engine",
    "user_mgmt", "inventory", "reporting", "testing_infra", "monitoring",
    "email_service", "chat_bot", "file_storage", "scheduler", "gateway",
    "marketplace",
]
|
||||
|
||||
# Room names. Each wing gets a seeded sample of these; they are also used as
# filler component / test-file names in generated prose. Order is significant
# because the seeded RNG picks by position.
ROOM_NAMES = [
    "backend", "frontend", "api", "database", "auth", "tests", "docs",
    "config", "deployment", "models", "views", "controllers", "middleware",
    "utils", "schemas", "migrations", "fixtures", "scripts", "styles",
    "components", "hooks", "services", "routes", "templates", "static",
    "media", "logging", "cache", "queue", "workers",
]
|
||||
|
||||
# Technical phrases used to fill prose templates, keyword lines, conversation
# prompts and generic search queries. Order is significant because the seeded
# RNG picks by position.
TECH_TERMS = [
    "authentication", "authorization", "middleware", "endpoint", "REST API",
    "GraphQL", "WebSocket", "database migration", "ORM", "query optimization",
    "caching strategy", "load balancer", "rate limiting", "pagination",
    "serialization", "validation", "error handling", "logging framework",
    "monitoring", "deployment pipeline", "CI/CD", "containerization",
    "microservice", "event sourcing", "message queue", "pub/sub",
    "connection pooling", "session management", "token refresh", "CORS",
    "SSL termination", "health check", "circuit breaker", "retry logic",
    "batch processing", "stream processing", "data pipeline", "ETL",
    "feature flag", "A/B testing", "blue-green deployment", "canary release",
]
|
||||
|
||||
# Short code fragments (Python, JavaScript, SQL) mixed into generated text.
# Each snippet is written as adjacent string literals, one per embedded line;
# the concatenated value is exactly the original single-line literal.
# NOTE(review): continuation lines inside the snippets start with a single
# space, which looks like collapsed indentation — confirm against the
# repository copy before relying on exact snippet text.
CODE_SNIPPETS = [
    (
        "def process_request(data):\n"
        " validated = schema.validate(data)\n"
        " result = handler.execute(validated)\n"
        " return Response(result, status=200)\n"
    ),
    (
        "class UserRepository:\n"
        " def __init__(self, db):\n"
        " self.db = db\n"
        " def find_by_id(self, user_id):\n"
        " return self.db.query(User).filter(User.id == user_id).first()\n"
    ),
    (
        "async def fetch_data(url, timeout=30):\n"
        " async with aiohttp.ClientSession() as session:\n"
        " async with session.get(url, timeout=timeout) as resp:\n"
        " return await resp.json()\n"
    ),
    (
        "const handleSubmit = async (formData) => {\n"
        " try {\n"
        " const response = await api.post('/users', formData);\n"
        " dispatch({ type: 'USER_CREATED', payload: response.data });\n"
        " } catch (error) {\n"
        " setError(error.message);\n"
        " }\n"
        "};\n"
    ),
    (
        "SELECT u.name, COUNT(o.id) as order_count\n"
        "FROM users u\n"
        "LEFT JOIN orders o ON u.id = o.user_id\n"
        "WHERE u.created_at > '2025-01-01'\n"
        "GROUP BY u.name\n"
        "HAVING COUNT(o.id) > 5\n"
        "ORDER BY order_count DESC;\n"
    ),
]
|
||||
|
||||
# str.format() templates for generated prose. Every placeholder used here must
# be supplied as a keyword by PalaceDataGenerator._random_text(), which passes
# the full keyword set to whichever template it picks.
PROSE_TEMPLATES = [
    "The {component} module handles {task}. It was refactored in {month} to improve {quality}. Key design decision: {decision}.",
    "Bug report: {component} fails when {condition}. Root cause: {cause}. Fixed by {fix}. Regression test added in {test_file}.",
    "Architecture decision: switched from {old_tech} to {new_tech} for {reason}. Migration completed {date}. Performance improved by {percent}%.",
    "Meeting notes: discussed {topic} with {person}. Agreed to {action}. Deadline: {deadline}. Follow-up: {followup}.",
    "Feature spec: {feature_name} allows users to {capability}. Dependencies: {deps}. Estimated effort: {effort} days.",
]
|
||||
|
||||
# Person names used for the first knowledge-graph entities (in this order) and
# as the {person} filler in prose templates.
ENTITY_NAMES = [
    "Alice", "Bob", "Carol", "Dave", "Eve", "Frank", "Grace", "Heidi",
    "Ivan", "Judy", "Karl", "Linda", "Mike", "Nina", "Oscar", "Pat",
    "Quinn", "Rita", "Steve", "Tina", "Ursula", "Victor", "Wendy", "Xander",
]
|
||||
|
||||
# Entity type labels; one is picked at random for every generated KG entity.
ENTITY_TYPES = ["person", "project", "tool", "concept", "team", "service"]
|
||||
|
||||
# Relationship names; one is picked at random for every generated KG triple.
PREDICATES = [
    "works_on", "manages", "reports_to", "collaborates_with", "created",
    "maintains", "uses", "depends_on", "replaced", "reviewed", "deployed",
    "tested", "documented", "mentors", "leads", "contributes_to",
]
|
||||
|
||||
|
||||
class PalaceDataGenerator:
    """Generate deterministic, realistic test data at configurable scale.

    All random choices go through one ``random.Random(seed)`` instance, so the
    same seed, scale and call sequence produce the same content. The only
    non-reproducible values are the ``filed_at`` timestamps written by
    ``populate_palace_directly()``, which use the wall clock.

    Attributes:
        rng: The seeded ``random.Random`` shared by every generator method.
        scale: Name of the selected entry in ``SCALE_CONFIGS``.
        cfg: The selected ``SCALE_CONFIGS`` entry.
        wings: The first ``cfg["wings"]`` names from ``WING_NAMES``.
        rooms_by_wing: Maps each wing to its sampled list of room names.
        needles: Planted needle records (dicts with ``id``, ``content``,
            ``wing``, ``room`` and ``query``).
    """

    def __init__(self, seed=42, scale="small"):
        """Pick the scale configuration and pre-compute wings, rooms, needles.

        Args:
            seed: Seed for the internal random number generator.
            scale: Key into ``SCALE_CONFIGS``.

        Raises:
            KeyError: If ``scale`` is not a key of ``SCALE_CONFIGS``.
        """
        self.rng = random.Random(seed)
        self.scale = scale
        self.cfg = SCALE_CONFIGS[scale]
        self.wings = WING_NAMES[: self.cfg["wings"]]
        self.rooms_by_wing = {}
        # Never ask for more rooms than exist; sample() would raise otherwise.
        rooms_per_wing = min(self.cfg["rooms_per_wing"], len(ROOM_NAMES))
        for wing in self.wings:
            self.rooms_by_wing[wing] = self.rng.sample(ROOM_NAMES, rooms_per_wing)
        # Planted needles for recall measurement
        self.needles = []
        self._generate_needles()

    def _generate_needles(self):
        """Create the planted needle records used for recall testing.

        Appends ``cfg["needles"]`` dicts to ``self.needles``. Each needle gets
        a random wing/room, a content string that embeds its ``NEEDLE_nnnn``
        id, and a search query derived from the start of its topic sentence.

        NOTE(review): there are 20 topics and they are reused cyclically, so
        at scales with more than 20 needles several needles share the same
        topic and the same query text (only the embedded id differs). Confirm
        the recall tests account for that.
        """
        topics = [
            "Fibonacci sequence optimization uses memoization with O(n) space complexity",
            "PostgreSQL vacuum autovacuum threshold set to 50 percent for table users",
            "Redis cluster failover timeout configured at 30 seconds with sentinel monitoring",
            "Kubernetes horizontal pod autoscaler targets 70 percent CPU utilization",
            "GraphQL subscription uses WebSocket transport with heartbeat interval 25 seconds",
            "JWT token rotation policy requires refresh every 15 minutes with sliding window",
            "Elasticsearch index sharding strategy uses 5 primary shards with 1 replica each",
            "Docker multi-stage build reduces image size from 1.2GB to 180MB for production",
            "Apache Kafka consumer group rebalance timeout set to 45 seconds",
            "MongoDB change streams resume token persisted every 100 operations",
            "gRPC streaming uses bidirectional flow control with 64KB window size",
            "Prometheus alerting rule fires when p99 latency exceeds 500ms for 5 minutes",
            "Terraform state locking uses DynamoDB with consistent reads enabled",
            "Nginx rate limiting configured at 100 requests per second with burst of 50",
            "SQLAlchemy connection pool size set to 20 with max overflow of 10 connections",
            "React concurrent mode uses startTransition for non-urgent state updates",
            "AWS Lambda cold start mitigation uses provisioned concurrency of 10 instances",
            "Git bisect automated with custom test script for regression hunting",
            "OpenTelemetry trace sampling rate set to 10 percent in production environment",
            "Celery worker prefetch multiplier set to 1 for fair task distribution",
        ]
        for i in range(self.cfg["needles"]):
            topic = topics[i % len(topics)]
            wing = self.rng.choice(self.wings)
            room = self.rng.choice(self.rooms_by_wing[wing])
            needle_id = f"NEEDLE_{i:04d}"
            content = f"{needle_id}: {topic}. This is a unique planted needle for recall benchmarking at scale."

            # The query is the topic's leading phrase: everything before
            # " uses " or " set to ", else the first 60 characters.
            if " uses " in topic:
                query = topic.split(" uses ")[0]
            elif " set to " in topic:
                query = topic.split(" set to ")[0]
            else:
                query = topic[:60]

            self.needles.append({
                "id": needle_id,
                "content": content,
                "wing": wing,
                "room": room,
                "query": query,
            })

    def _random_text(self, min_chars=600, max_chars=900):
        """Generate a random text block of realistic content.

        Fragments are appended until their combined length reaches a random
        target between ``min_chars`` and ``max_chars``; the newline-joined
        result is then truncated to ``max_chars``. Roughly 30% of fragments
        are code snippets, 40% filled prose templates and 30% keyword lines.

        Args:
            min_chars: Lower bound for the random target length.
            max_chars: Upper bound for the target and hard cap on the result.

        Returns:
            The generated text.
        """
        parts = []
        total = 0
        target = self.rng.randint(min_chars, max_chars)
        while total < target:
            choice = self.rng.random()
            if choice < 0.3:
                text = self.rng.choice(CODE_SNIPPETS)
            elif choice < 0.7:
                template = self.rng.choice(PROSE_TEMPLATES)
                # Every placeholder of every template is supplied; format()
                # ignores the unused ones. The keyword order fixes the RNG
                # draw order, so do not reorder these arguments.
                text = template.format(
                    component=self.rng.choice(ROOM_NAMES),
                    task=self.rng.choice(TECH_TERMS),
                    month=self.rng.choice(["January", "February", "March", "April", "May"]),
                    quality=self.rng.choice(["performance", "readability", "test coverage", "latency"]),
                    decision=self.rng.choice(TECH_TERMS),
                    condition=self.rng.choice(TECH_TERMS) + " is null",
                    cause=self.rng.choice(["race condition", "null pointer", "timeout", "OOM"]),
                    fix="adding " + self.rng.choice(TECH_TERMS),
                    test_file=f"test_{self.rng.choice(ROOM_NAMES)}.py",
                    old_tech=self.rng.choice(["MySQL", "Flask", "REST", "Jenkins"]),
                    new_tech=self.rng.choice(["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]),
                    reason=self.rng.choice(TECH_TERMS),
                    date=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
                    percent=self.rng.randint(10, 80),
                    topic=self.rng.choice(TECH_TERMS),
                    person=self.rng.choice(ENTITY_NAMES),
                    action=self.rng.choice(["refactor", "migrate", "optimize", "test"]),
                    deadline=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
                    followup=self.rng.choice(TECH_TERMS),
                    feature_name=self.rng.choice(TECH_TERMS),
                    capability=self.rng.choice(TECH_TERMS),
                    deps=", ".join(self.rng.sample(TECH_TERMS, 2)),
                    effort=self.rng.randint(1, 15),
                )
            else:
                words = self.rng.sample(TECH_TERMS, min(5, len(TECH_TERMS)))
                text = " ".join(words) + ". " + self.rng.choice(TECH_TERMS) + " implementation details follow.\n"
            parts.append(text)
            total += len(text)
        return "\n".join(parts)[:max_chars]

    # ── Project tree generation (for mine() tests) ───────────────────────

    def generate_project_tree(self, base_path, wing=None, rooms=None, n_files=50):
        """Write realistic project files plus ``mempalace.yaml`` to ``base_path``.

        Files are spread round-robin over one sub-directory per room and get a
        random extension and random text content.

        Args:
            base_path: Directory to create and fill.
            wing: Wing name to record; a random configured wing when omitted.
            rooms: Room names to use; the wing's configured rooms when
                omitted (``["general"]`` if the wing is not configured).
            n_files: Number of files to write.

        Returns:
            ``(project_path, wing, rooms, files_written)`` where
            ``project_path`` is ``base_path`` as a string.
        """
        base = Path(base_path)
        base.mkdir(parents=True, exist_ok=True)
        wing = wing or self.rng.choice(self.wings)
        rooms = rooms or self.rooms_by_wing.get(wing, ["general"])

        # Write mempalace.yaml (explicit utf-8, matching the other writes here)
        room_defs = [{"name": r, "description": f"{r} code and docs"} for r in rooms]
        with open(base / "mempalace.yaml", "w", encoding="utf-8") as f:
            yaml.dump({"wing": wing, "rooms": room_defs}, f)

        # Write files distributed across room directories
        files_written = 0
        for i in range(n_files):
            room_dir = base / rooms[i % len(rooms)]
            room_dir.mkdir(parents=True, exist_ok=True)

            ext = self.rng.choice([".py", ".js", ".md", ".ts", ".yaml"])
            filename = f"file_{i:04d}{ext}"
            content = self._random_text(400, 2000)
            (room_dir / filename).write_text(content, encoding="utf-8")
            files_written += 1

        return str(base), wing, rooms, files_written

    # ── Conversation file generation (for mine_convos() tests) ───────────

    def generate_conversation_files(self, base_path, wing=None, n_files=20):
        """Write conversation transcript files for convo_miner tests.

        Each ``convo_nnnn.txt`` holds 5-20 exchanges: a ``> User:`` line, a
        random reply and a blank line.

        Args:
            base_path: Directory to create and fill.
            wing: Wing name to return; a random configured wing when omitted.
                It is not written into the files.
            n_files: Number of transcript files to write.

        Returns:
            ``(base_path_as_str, wing)``.
        """
        base = Path(base_path)
        base.mkdir(parents=True, exist_ok=True)
        wing = wing or self.rng.choice(self.wings)

        for i in range(n_files):
            lines = []
            n_exchanges = self.rng.randint(5, 20)
            for _ in range(n_exchanges):
                user_msg = f"> User: {self.rng.choice(TECH_TERMS)}? How does {self.rng.choice(TECH_TERMS)} work with {self.rng.choice(TECH_TERMS)}?"
                ai_msg = self._random_text(200, 600)
                lines.append(user_msg)
                lines.append(ai_msg)
                lines.append("")

            (base / f"convo_{i:04d}.txt").write_text("\n".join(lines), encoding="utf-8")

        return str(base), wing

    # ── Direct palace population (bypasses mining for speed) ─────────────

    @staticmethod
    def _drawer_id(wing, room, key):
        """Build a drawer id from wing, room and the first 16 hex chars of md5(key)."""
        digest = hashlib.md5(key.encode()).hexdigest()[:16]
        return f"drawer_{wing}_{room}_{digest}"

    def populate_palace_directly(self, palace_path, n_drawers=None, include_needles=True):
        """Insert drawers directly into ChromaDB, bypassing the mining pipeline.

        Much faster than mining for benchmarks that only care about
        search/MCP behavior on a pre-populated palace.

        Needles are queued first and count towards ``n_drawers``; the rest is
        filler text assigned round-robin to wings and rooms. If ``n_drawers``
        is smaller than the number of needles, all needles are still inserted
        and no filler is added.

        Args:
            palace_path: Directory for the persistent ChromaDB store (created
                if missing).
            n_drawers: Total drawers to insert; ``None`` means
                ``cfg["drawers"]``. An explicit ``0`` is honoured.
            include_needles: Whether to insert the planted needles.

        Returns:
            ``(client, collection, needle_info)`` where ``needle_info`` lists
            ``{"id", "query", "wing", "room"}`` for every inserted needle.
        """
        if n_drawers is None:
            n_drawers = self.cfg["drawers"]
        os.makedirs(palace_path, exist_ok=True)
        client = chromadb.PersistentClient(path=palace_path)
        col = client.get_or_create_collection("mempalace_drawers")

        batch_size = 500
        docs, ids, metas = [], [], []

        # Insert needles first
        needle_info = []
        if include_needles:
            for needle in self.needles:
                needle_id = self._drawer_id(needle["wing"], needle["room"], needle["id"])
                docs.append(needle["content"])
                ids.append(needle_id)
                metas.append({
                    "wing": needle["wing"],
                    "room": needle["room"],
                    "source_file": f"needle_{needle['id']}.txt",
                    "chunk_index": 0,
                    "added_by": "benchmark",
                    "filed_at": datetime.now().isoformat(),
                })
                needle_info.append({
                    "id": needle_id,
                    "query": needle["query"],
                    "wing": needle["wing"],
                    "room": needle["room"],
                })

        # Fill remaining drawers with realistic content
        remaining = n_drawers - len(docs)
        for i in range(remaining):
            wing = self.wings[i % len(self.wings)]
            rooms = self.rooms_by_wing[wing]
            room = rooms[i % len(rooms)]
            content = self._random_text(400, 800)

            docs.append(content)
            ids.append(self._drawer_id(wing, room, f"gen_{i}"))
            metas.append({
                "wing": wing,
                "room": room,
                "source_file": f"generated_{i:06d}.txt",
                "chunk_index": i % 10,
                "added_by": "benchmark",
                "filed_at": datetime.now().isoformat(),
            })

            # Flush in batches so the pending lists stay bounded
            if len(docs) >= batch_size:
                col.add(documents=docs, ids=ids, metadatas=metas)
                docs, ids, metas = [], [], []

        # Flush remainder
        if docs:
            col.add(documents=docs, ids=ids, metadatas=metas)

        return client, col, needle_info

    # ── KG triple generation ─────────────────────────────────────────────

    def generate_kg_triples(self, n_entities=None, n_triples=None):
        """Generate realistic entity-relationship triples.

        Args:
            n_entities: Number of entities; ``None`` means
                ``cfg["kg_entities"]``. An explicit ``0`` is honoured.
            n_triples: Number of triples; ``None`` means ``cfg["kg_triples"]``.
                An explicit ``0`` is honoured.

        Returns:
            ``(entities, triples)`` where
            ``entities = [(name, type), ...]`` and
            ``triples = [(subject, predicate, object, valid_from, valid_to), ...]``.
            Dates are ``YYYY-MM-DD`` strings; ``valid_to`` is ``None`` for
            roughly 70% of triples. Subject and object always differ.

        Raises:
            ValueError: If triples are requested with fewer than two entities,
                because subject and object must be distinct.
        """
        if n_entities is None:
            n_entities = self.cfg["kg_entities"]
        if n_triples is None:
            n_triples = self.cfg["kg_triples"]

        # Generate entities: known person names first, then synthetic names
        entities = []
        entity_names = []
        for i in range(n_entities):
            name = ENTITY_NAMES[i] if i < len(ENTITY_NAMES) else f"Entity_{i:04d}"
            etype = self.rng.choice(ENTITY_TYPES)
            entities.append((name, etype))
            entity_names.append(name)

        # With fewer than two names the "object != subject" retry loop below
        # could never terminate (or choice() would fail on an empty list).
        if n_triples > 0 and len(entity_names) < 2:
            raise ValueError(
                f"need at least 2 entities to generate triples, got {len(entity_names)}"
            )

        # Generate triples
        triples = []
        base_date = datetime(2024, 1, 1)
        for _ in range(n_triples):
            subject = self.rng.choice(entity_names)
            obj = self.rng.choice(entity_names)
            while obj == subject:
                obj = self.rng.choice(entity_names)
            predicate = self.rng.choice(PREDICATES)
            days_offset = self.rng.randint(0, 730)
            valid_from = (base_date + timedelta(days=days_offset)).strftime("%Y-%m-%d")
            # 30% chance of having a valid_to, 30-365 days after valid_from
            valid_to = None
            if self.rng.random() < 0.3:
                end_offset = self.rng.randint(30, 365)
                valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime("%Y-%m-%d")
            triples.append((subject, predicate, obj, valid_from, valid_to))

        return entities, triples

    # ── Search query generation ──────────────────────────────────────────

    def generate_search_queries(self, n_queries=None):
        """Generate a shuffled mix of needle and generic search queries.

        Up to half of the queries target planted needles (known-good answers
        for recall measurement); the rest are two random technical terms and
        are only useful for latency measurement.

        Args:
            n_queries: Number of queries; ``None`` means
                ``cfg["search_queries"]``. An explicit ``0`` is honoured.

        Returns:
            A list of dicts with keys ``query`` (str), ``expected_wing``
            (str or None), ``expected_room`` (str or None), ``needle_id``
            (str or None) and ``is_needle`` (bool).
        """
        if n_queries is None:
            n_queries = self.cfg["search_queries"]

        # Half are needle queries (known-good answers)
        n_needle = min(n_queries // 2, len(self.needles))
        queries = [
            {
                "query": needle["query"],
                "expected_wing": needle["wing"],
                "expected_room": needle["room"],
                "needle_id": needle["id"],
                "is_needle": True,
            }
            for needle in self.needles[:n_needle]
        ]

        # Other half are generic queries (measure latency, not recall)
        n_generic = n_queries - n_needle
        for _ in range(n_generic):
            queries.append({
                "query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS),
                "expected_wing": None,
                "expected_room": None,
                "needle_id": None,
                "is_needle": False,
            })

        self.rng.shuffle(queries)
        return queries
|
||||
Reference in New Issue
Block a user