bench: add scale benchmark suite (94 tests)

Benchmark mempalace at configurable scale (1K–100K drawers) to find real-world performance limits. Tests cover MCP tool OOM thresholds, ChromaDB query degradation, search recall@k, mining throughput, knowledge graph concurrency, memory leak detection, palace boost quantification, and Layer1 unbounded fetch behavior.

- tests/benchmarks/ with 8 test modules + data generator + report system
- Deterministic data factory with planted needles for recall measurement
- JSON report output with regression detection (--bench-report flag)
- CI benchmark job on PRs at small scale
- psutil added as dev dependency for RSS tracking
2026-04-07 19:39:06 -03:00
parent 71736a3f4f
commit 7b89291334
15 changed files with 2453 additions and 3 deletions
@@ -0,0 +1,395 @@
+"""
+Deterministic data factory for MemPalace scale benchmarks.
+
+Generates realistic project files, conversations, and KG triples at
+configurable scale levels. All randomness uses seeded RNG for reproducibility.
+
+Planted "needle" drawers enable recall measurement without an LLM judge.
+"""
+
+import hashlib
+import os
+import random
+import string
+from datetime import datetime, timedelta
+from pathlib import Path
+
+import chromadb
+import yaml
+
+
+# ── Scale configurations ─────────────────────────────────────────────────
+
+# One entry per scale level. Each entry sizes the palace (drawers, wings,
+# rooms per wing), the knowledge graph (entities, triples) and the recall
+# workload (planted needles, search queries).
+SCALE_CONFIGS = {
+    "small": {
+        "drawers": 1_000,
+        "wings": 3,
+        "rooms_per_wing": 5,
+        "kg_entities": 50,
+        "kg_triples": 200,
+        "needles": 20,
+        "search_queries": 20,
+    },
+    "medium": {
+        "drawers": 10_000,
+        "wings": 8,
+        "rooms_per_wing": 12,
+        "kg_entities": 200,
+        "kg_triples": 2_000,
+        "needles": 50,
+        "search_queries": 50,
+    },
+    "large": {
+        "drawers": 50_000,
+        "wings": 15,
+        "rooms_per_wing": 20,
+        "kg_entities": 500,
+        "kg_triples": 10_000,
+        "needles": 100,
+        "search_queries": 100,
+    },
+    "stress": {
+        "drawers": 100_000,
+        "wings": 25,
+        "rooms_per_wing": 30,
+        "kg_entities": 1_000,
+        "kg_triples": 50_000,
+        "needles": 200,
+        "search_queries": 200,
+    },
+}
+
+# ── Vocabulary banks for realistic content ───────────────────────────────
+
+# Wing names in a fixed order. PalaceDataGenerator takes the first
+# cfg["wings"] entries, so a smaller scale uses a prefix of a larger one.
+# The list has 25 entries, matching the largest configured wing count.
+WING_NAMES = [
+    "webapp", "backend_api", "mobile_app", "data_pipeline", "ml_platform",
+    "devops", "auth_service", "payments", "analytics", "docs_site",
+    "cli_tool", "dashboard", "notification_service", "search_engine",
+    "user_mgmt", "inventory", "reporting", "testing_infra", "monitoring",
+    "email_service", "chat_bot", "file_storage", "scheduler", "gateway",
+    "marketplace",
+]
+
+# Pool of room names. Each wing receives a seeded random sample of this list,
+# capped at its length (30 entries). Also used as the {component} prose field
+# and in generated test-file names.
+ROOM_NAMES = [
+    "backend", "frontend", "api", "database", "auth", "tests", "docs",
+    "config", "deployment", "models", "views", "controllers", "middleware",
+    "utils", "schemas", "migrations", "fixtures", "scripts", "styles",
+    "components", "hooks", "services", "routes", "templates", "static",
+    "media", "logging", "cache", "queue", "workers",
+]
+
+# Technical phrases used as filler vocabulary: prose template fields,
+# conversation questions and generic (non-needle) search queries.
+TECH_TERMS = [
+    "authentication", "authorization", "middleware", "endpoint", "REST API",
+    "GraphQL", "WebSocket", "database migration", "ORM", "query optimization",
+    "caching strategy", "load balancer", "rate limiting", "pagination",
+    "serialization", "validation", "error handling", "logging framework",
+    "monitoring", "deployment pipeline", "CI/CD", "containerization",
+    "microservice", "event sourcing", "message queue", "pub/sub",
+    "connection pooling", "session management", "token refresh", "CORS",
+    "SSL termination", "health check", "circuit breaker", "retry logic",
+    "batch processing", "stream processing", "data pipeline", "ETL",
+    "feature flag", "A/B testing", "blue-green deployment", "canary release",
+]
+
+# Canned code fragments (Python, JavaScript, SQL) that _random_text inserts
+# verbatim. They are content only and are never executed.
+CODE_SNIPPETS = [
+    "def process_request(data):\n    validated = schema.validate(data)\n    result = handler.execute(validated)\n    return Response(result, status=200)\n",
+    "class UserRepository:\n    def __init__(self, db):\n        self.db = db\n    def find_by_id(self, user_id):\n        return self.db.query(User).filter(User.id == user_id).first()\n",
+    "async def fetch_data(url, timeout=30):\n    async with aiohttp.ClientSession() as session:\n        async with session.get(url, timeout=timeout) as resp:\n            return await resp.json()\n",
+    "const handleSubmit = async (formData) => {\n  try {\n    const response = await api.post('/users', formData);\n    dispatch({ type: 'USER_CREATED', payload: response.data });\n  } catch (error) {\n    setError(error.message);\n  }\n};\n",
+    "SELECT u.name, COUNT(o.id) as order_count\nFROM users u\nLEFT JOIN orders o ON u.id = o.user_id\nWHERE u.created_at > '2025-01-01'\nGROUP BY u.name\nHAVING COUNT(o.id) > 5\nORDER BY order_count DESC;\n",
+]
+
+# str.format templates for prose fragments. _random_text passes every
+# placeholder as a keyword argument on each call, so a placeholder added here
+# must also be supplied there or formatting raises KeyError.
+PROSE_TEMPLATES = [
+    "The {component} module handles {task}. It was refactored in {month} to improve {quality}. Key design decision: {decision}.",
+    "Bug report: {component} fails when {condition}. Root cause: {cause}. Fixed by {fix}. Regression test added in {test_file}.",
+    "Architecture decision: switched from {old_tech} to {new_tech} for {reason}. Migration completed {date}. Performance improved by {percent}%.",
+    "Meeting notes: discussed {topic} with {person}. Agreed to {action}. Deadline: {deadline}. Follow-up: {followup}.",
+    "Feature spec: {feature_name} allows users to {capability}. Dependencies: {deps}. Estimated effort: {effort} days.",
+]
+
+# Names given to the first len(ENTITY_NAMES) knowledge-graph entities; later
+# entities are named Entity_NNNN. Also used as the {person} prose field.
+ENTITY_NAMES = [
+    "Alice", "Bob", "Carol", "Dave", "Eve", "Frank", "Grace", "Heidi",
+    "Ivan", "Judy", "Karl", "Linda", "Mike", "Nina", "Oscar", "Pat",
+    "Quinn", "Rita", "Steve", "Tina", "Ursula", "Victor", "Wendy", "Xander",
+]
+
+# Entity type labels; generate_kg_triples picks one at random per entity.
+ENTITY_TYPES = ["person", "project", "tool", "concept", "team", "service"]
+
+# Relationship labels; generate_kg_triples picks one at random per triple.
+PREDICATES = [
+    "works_on", "manages", "reports_to", "collaborates_with", "created",
+    "maintains", "uses", "depends_on", "replaced", "reviewed", "deployed",
+    "tested", "documented", "mentors", "leads", "contributes_to",
+]
+
+
+class PalaceDataGenerator:
+    """Generate deterministic, realistic test data at configurable scale."""
+
+    def __init__(self, seed=42, scale="small"):
+        """
+        Set up a seeded generator for one scale level.
+
+        seed  -- seed for the private random.Random instance behind every
+                 random choice this generator makes.
+        scale -- key into SCALE_CONFIGS; an unknown key raises KeyError.
+        """
+        self.rng = random.Random(seed)
+        self.scale = scale
+        self.cfg = SCALE_CONFIGS[scale]
+
+        # Wings are the first N vocabulary entries, in vocabulary order.
+        wing_count = self.cfg["wings"]
+        self.wings = WING_NAMES[:wing_count]
+
+        # Each wing draws its own room sample, capped at the vocabulary size.
+        # The mapping is filled in wing order so the seeded sequence is stable.
+        room_count = min(self.cfg["rooms_per_wing"], len(ROOM_NAMES))
+        self.rooms_by_wing = {
+            wing: self.rng.sample(ROOM_NAMES, room_count) for wing in self.wings
+        }
+
+        # Planted needles for recall measurement
+        self.needles = []
+        self._generate_needles()
+
+    def _generate_needles(self):
+        """
+        Fill self.needles with cfg["needles"] planted drawers.
+
+        Each needle is a dict with keys "id", "content", "wing", "room" and
+        "query". Topics are taken round-robin from a fixed list of 20, so once
+        the needle count exceeds that, topics (and their queries) repeat and
+        only the NEEDLE_xxxx prefix keeps the content distinct.
+        """
+        topics = [
+            "Fibonacci sequence optimization uses memoization with O(n) space complexity",
+            "PostgreSQL vacuum autovacuum threshold set to 50 percent for table users",
+            "Redis cluster failover timeout configured at 30 seconds with sentinel monitoring",
+            "Kubernetes horizontal pod autoscaler targets 70 percent CPU utilization",
+            "GraphQL subscription uses WebSocket transport with heartbeat interval 25 seconds",
+            "JWT token rotation policy requires refresh every 15 minutes with sliding window",
+            "Elasticsearch index sharding strategy uses 5 primary shards with 1 replica each",
+            "Docker multi-stage build reduces image size from 1.2GB to 180MB for production",
+            "Apache Kafka consumer group rebalance timeout set to 45 seconds",
+            "MongoDB change streams resume token persisted every 100 operations",
+            "gRPC streaming uses bidirectional flow control with 64KB window size",
+            "Prometheus alerting rule fires when p99 latency exceeds 500ms for 5 minutes",
+            "Terraform state locking uses DynamoDB with consistent reads enabled",
+            "Nginx rate limiting configured at 100 requests per second with burst of 50",
+            "SQLAlchemy connection pool size set to 20 with max overflow of 10 connections",
+            "React concurrent mode uses startTransition for non-urgent state updates",
+            "AWS Lambda cold start mitigation uses provisioned concurrency of 10 instances",
+            "Git bisect automated with custom test script for regression hunting",
+            "OpenTelemetry trace sampling rate set to 10 percent in production environment",
+            "Celery worker prefetch multiplier set to 1 for fair task distribution",
+        ]
+        # Tried in this order; the text before the first separator found
+        # becomes the search query.
+        separators = (" uses ", " set to ")
+
+        for index in range(self.cfg["needles"]):
+            topic = topics[index % len(topics)]
+            # Wing is drawn before room: the room pool depends on the wing,
+            # and the draw order fixes the seeded sequence.
+            wing = self.rng.choice(self.wings)
+            room = self.rng.choice(self.rooms_by_wing[wing])
+            needle_id = f"NEEDLE_{index:04d}"
+
+            for sep in separators:
+                if sep in topic:
+                    query = topic.split(sep)[0]
+                    break
+            else:
+                # Neither separator present: use the first 60 characters.
+                query = topic[:60]
+
+            needle = {
+                "id": needle_id,
+                "content": f"{needle_id}: {topic}. This is a unique planted needle for recall benchmarking at scale.",
+                "wing": wing,
+                "room": room,
+                "query": query,
+            }
+            self.needles.append(needle)
+
+    def _random_text(self, min_chars=600, max_chars=900):
+        """
+        Build a block of filler text from code snippets, prose and term lists.
+
+        A target length is drawn from [min_chars, max_chars]. Fragments are
+        appended until their combined length reaches the target, then joined
+        with newlines and cut to at most max_chars characters.
+        """
+        rng = self.rng
+        target = rng.randint(min_chars, max_chars)
+        fragments = []
+        accumulated = 0
+
+        while accumulated < target:
+            roll = rng.random()
+            if roll < 0.3:
+                # Verbatim code snippet.
+                fragment = rng.choice(CODE_SNIPPETS)
+            elif roll < 0.7:
+                # Prose template. Every field is drawn on every pass, in this
+                # exact order, whichever template was picked; str.format
+                # ignores the keywords a template does not use.
+                template = rng.choice(PROSE_TEMPLATES)
+                fields = {
+                    "component": rng.choice(ROOM_NAMES),
+                    "task": rng.choice(TECH_TERMS),
+                    "month": rng.choice(["January", "February", "March", "April", "May"]),
+                    "quality": rng.choice(["performance", "readability", "test coverage", "latency"]),
+                    "decision": rng.choice(TECH_TERMS),
+                    "condition": rng.choice(TECH_TERMS) + " is null",
+                    "cause": rng.choice(["race condition", "null pointer", "timeout", "OOM"]),
+                    "fix": "adding " + rng.choice(TECH_TERMS),
+                    "test_file": f"test_{rng.choice(ROOM_NAMES)}.py",
+                    "old_tech": rng.choice(["MySQL", "Flask", "REST", "Jenkins"]),
+                    "new_tech": rng.choice(["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]),
+                    "reason": rng.choice(TECH_TERMS),
+                    "date": f"2025-{rng.randint(1, 12):02d}-{rng.randint(1, 28):02d}",
+                    "percent": rng.randint(10, 80),
+                    "topic": rng.choice(TECH_TERMS),
+                    "person": rng.choice(ENTITY_NAMES),
+                    "action": rng.choice(["refactor", "migrate", "optimize", "test"]),
+                    "deadline": f"2025-{rng.randint(1, 12):02d}-{rng.randint(1, 28):02d}",
+                    "followup": rng.choice(TECH_TERMS),
+                    "feature_name": rng.choice(TECH_TERMS),
+                    "capability": rng.choice(TECH_TERMS),
+                    "deps": ", ".join(rng.sample(TECH_TERMS, 2)),
+                    "effort": rng.randint(1, 15),
+                }
+                fragment = template.format(**fields)
+            else:
+                # Loose run of terms followed by a closing sentence.
+                terms = rng.sample(TECH_TERMS, min(5, len(TECH_TERMS)))
+                closing_term = rng.choice(TECH_TERMS)
+                fragment = " ".join(terms) + ". " + closing_term + " implementation details follow.\n"
+
+            fragments.append(fragment)
+            # The newlines added by the final join are not counted here.
+            accumulated += len(fragment)
+
+        return "\n".join(fragments)[:max_chars]
+
+    # ── Project tree generation (for mine() tests) ───────────────────────
+
+    def generate_project_tree(self, base_path, wing=None, rooms=None, n_files=50):
+        """
+        Write realistic project files + mempalace.yaml to base_path.
+
+        base_path -- project directory; created (with parents) if missing.
+        wing      -- wing name recorded in mempalace.yaml; a random wing from
+                     self.wings when omitted.
+        rooms     -- room names; defaults to the rooms generated for the wing,
+                     or ["general"] when the wing is not one of self.wings.
+        n_files   -- number of files to write, spread round-robin over rooms.
+
+        Returns (project_path, wing, rooms, files_written). project_path is
+        the string form of base_path, intended for passing to mine().
+        """
+        base = Path(base_path)
+        base.mkdir(parents=True, exist_ok=True)
+        wing = wing or self.rng.choice(self.wings)
+        rooms = rooms or self.rooms_by_wing.get(wing, ["general"])
+
+        # Write mempalace.yaml. The encoding is pinned to UTF-8, like the
+        # generated files below, so the output does not depend on the
+        # platform's locale encoding.
+        room_defs = [{"name": r, "description": f"{r} code and docs"} for r in rooms]
+        with open(base / "mempalace.yaml", "w", encoding="utf-8") as f:
+            yaml.dump({"wing": wing, "rooms": room_defs}, f)
+
+        # Write files distributed across room directories
+        files_written = 0
+        for i in range(n_files):
+            room = rooms[i % len(rooms)]
+            room_dir = base / room
+            # Created lazily: a room only gets a directory once a file lands in it.
+            room_dir.mkdir(parents=True, exist_ok=True)
+
+            # Extension is drawn before the content to keep the seeded sequence stable.
+            ext = self.rng.choice([".py", ".js", ".md", ".ts", ".yaml"])
+            filename = f"file_{i:04d}{ext}"
+            content = self._random_text(400, 2000)
+            (room_dir / filename).write_text(content, encoding="utf-8")
+            files_written += 1
+
+        return str(base), wing, rooms, files_written
+
+    # ── Conversation file generation (for mine_convos() tests) ───────────
+
+    def generate_conversation_files(self, base_path, wing=None, n_files=20):
+        """
+        Write n_files synthetic transcripts (convo_0000.txt, ...) into base_path.
+
+        Each file holds 5-20 exchanges: a "> User: ..." question line, a block
+        of generated answer text, and a blank separator line. The wing is not
+        written to disk; it is only resolved (a random wing when omitted) and
+        returned.
+
+        Returns (base_path as str, wing).
+        """
+        out_dir = Path(base_path)
+        out_dir.mkdir(parents=True, exist_ok=True)
+        wing = wing or self.rng.choice(self.wings)
+
+        for file_no in range(n_files):
+            transcript = []
+            for _ in range(self.rng.randint(5, 20)):
+                question = f"> User: {self.rng.choice(TECH_TERMS)}? How does {self.rng.choice(TECH_TERMS)} work with {self.rng.choice(TECH_TERMS)}?"
+                answer = self._random_text(200, 600)
+                # The trailing empty string yields a blank line between exchanges.
+                transcript.extend((question, answer, ""))
+
+            target = out_dir / f"convo_{file_no:04d}.txt"
+            target.write_text("\n".join(transcript), encoding="utf-8")
+
+        return str(out_dir), wing
+
+    # ── Direct palace population (bypasses mining for speed) ─────────────
+
+    def populate_palace_directly(self, palace_path, n_drawers=None, include_needles=True):
+        """
+        Insert drawers directly into ChromaDB, bypassing the mining pipeline.
+
+        Much faster than mining for benchmarks that only care about
+        search/MCP behavior on a pre-populated palace.
+
+        palace_path     -- directory for the persistent ChromaDB store; created
+                           if missing.
+        n_drawers       -- total drawers to insert, needles included. None
+                           selects the scale's configured count; an explicit 0
+                           is honoured rather than replaced by the default.
+        include_needles -- when true, every planted needle is inserted, even if
+                           that exceeds n_drawers.
+
+        Returns (client, collection, needle_info), where needle_info holds one
+        {"id", "query", "wing", "room"} dict per inserted needle and "id" is
+        the drawer ID stored in the collection.
+        """
+        # `is None` rather than `or`: 0 is a legitimate request for no filler.
+        if n_drawers is None:
+            n_drawers = self.cfg["drawers"]
+        os.makedirs(palace_path, exist_ok=True)
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection("mempalace_drawers")
+
+        batch_size = 500
+        docs = []
+        ids = []
+        metas = []
+
+        # Insert needles first. They are only queued here and go out with the
+        # first flush below.
+        needle_info = []
+        if include_needles:
+            for needle in self.needles:
+                needle_id = f"drawer_{needle['wing']}_{needle['room']}_{hashlib.md5(needle['id'].encode()).hexdigest()[:16]}"
+                docs.append(needle["content"])
+                ids.append(needle_id)
+                metas.append({
+                    "wing": needle["wing"],
+                    "room": needle["room"],
+                    "source_file": f"needle_{needle['id']}.txt",
+                    "chunk_index": 0,
+                    "added_by": "benchmark",
+                    "filed_at": datetime.now().isoformat(),
+                })
+                needle_info.append({"id": needle_id, "query": needle["query"], "wing": needle["wing"], "room": needle["room"]})
+
+        # Fill remaining drawers with realistic content. A negative remainder
+        # (more needles than n_drawers) makes the range empty.
+        remaining = n_drawers - len(docs)
+        for i in range(remaining):
+            # Round-robin over wings, and over each wing's rooms.
+            wing = self.wings[i % len(self.wings)]
+            rooms = self.rooms_by_wing[wing]
+            room = rooms[i % len(rooms)]
+            content = self._random_text(400, 800)
+            drawer_id = f"drawer_{wing}_{room}_{hashlib.md5(f'gen_{i}'.encode()).hexdigest()[:16]}"
+
+            docs.append(content)
+            ids.append(drawer_id)
+            metas.append({
+                "wing": wing,
+                "room": room,
+                "source_file": f"generated_{i:06d}.txt",
+                "chunk_index": i % 10,
+                "added_by": "benchmark",
+                "filed_at": datetime.now().isoformat(),
+            })
+
+            # Flush in batches
+            if len(docs) >= batch_size:
+                col.add(documents=docs, ids=ids, metadatas=metas)
+                docs, ids, metas = [], [], []
+
+        # Flush remainder
+        if docs:
+            col.add(documents=docs, ids=ids, metadatas=metas)
+
+        return client, col, needle_info
+
+    # ── KG triple generation ─────────────────────────────────────────────
+
+    def generate_kg_triples(self, n_entities=None, n_triples=None):
+        """
+        Generate realistic entity-relationship triples.
+
+        n_entities -- number of entities; None selects cfg["kg_entities"].
+        n_triples  -- number of triples; None selects cfg["kg_triples"].
+
+        Returns (entities, triples) where:
+          entities = [(name, type), ...]
+          triples = [(subject, predicate, object, valid_from, valid_to), ...]
+        Dates are "YYYY-MM-DD" strings; valid_to is None for open-ended triples.
+
+        Raises ValueError when triples are requested with fewer than two
+        entities, since subject and object must differ.
+        """
+        # `is None` rather than `or`, so an explicit 0 is not replaced by the default.
+        if n_entities is None:
+            n_entities = self.cfg["kg_entities"]
+        if n_triples is None:
+            n_triples = self.cfg["kg_triples"]
+
+        # With a single entity the distinct-object loop below would spin
+        # forever; with none there is nothing to choose from. Fail up front.
+        if n_triples > 0 and n_entities < 2:
+            raise ValueError(
+                f"need at least 2 entities to generate triples, got {n_entities}"
+            )
+
+        # Generate entities: vocabulary names first, then synthetic Entity_NNNN.
+        entities = []
+        for i in range(n_entities):
+            name = ENTITY_NAMES[i] if i < len(ENTITY_NAMES) else f"Entity_{i:04d}"
+            entities.append((name, self.rng.choice(ENTITY_TYPES)))
+        entity_names = [name for name, _ in entities]
+
+        # Generate triples. The order of RNG draws inside the loop is kept
+        # fixed so a given seed keeps producing the same data.
+        triples = []
+        base_date = datetime(2024, 1, 1)
+        for _ in range(n_triples):
+            subject = self.rng.choice(entity_names)
+            obj = self.rng.choice(entity_names)
+            while obj == subject:
+                obj = self.rng.choice(entity_names)
+            predicate = self.rng.choice(PREDICATES)
+            days_offset = self.rng.randint(0, 730)
+            valid_from = (base_date + timedelta(days=days_offset)).strftime("%Y-%m-%d")
+            # 30% chance of having a valid_to, 30-365 days after valid_from
+            valid_to = None
+            if self.rng.random() < 0.3:
+                end_offset = self.rng.randint(30, 365)
+                valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime("%Y-%m-%d")
+            triples.append((subject, predicate, obj, valid_from, valid_to))
+
+        return entities, triples
+
+    # ── Search query generation ──────────────────────────────────────────
+
+    def generate_search_queries(self, n_queries=None):
+        """
+        Generate search queries with expected results.
+
+        n_queries -- total number of queries; None selects
+                     cfg["search_queries"]. An explicit 0 yields an empty list.
+
+        Returns a shuffled list of dicts with keys:
+          "query"         str
+          "expected_wing" str for needle queries, else None
+          "expected_room" str for needle queries, else None
+          "needle_id"     the needle's "id" for needle queries, else None
+          "is_needle"     bool
+        Needle queries have known-good answers for recall measurement.
+        """
+        # `is None` rather than `or`, so an explicit 0 is not replaced by the default.
+        if n_queries is None:
+            n_queries = self.cfg["search_queries"]
+        queries = []
+
+        # Up to half are needle queries (known-good answers), capped by how
+        # many needles exist; the first needles are used, in planting order.
+        n_needle = min(n_queries // 2, len(self.needles))
+        for needle in self.needles[:n_needle]:
+            queries.append({
+                "query": needle["query"],
+                "expected_wing": needle["wing"],
+                "expected_room": needle["room"],
+                "needle_id": needle["id"],
+                "is_needle": True,
+            })
+
+        # The rest are generic queries (measure latency, not recall)
+        n_generic = n_queries - n_needle
+        for _ in range(n_generic):
+            queries.append({
+                "query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS),
+                "expected_wing": None,
+                "expected_room": None,
+                "needle_id": None,
+                "is_needle": False,
+            })
+
+        # Interleave needle and generic queries using the seeded RNG.
+        self.rng.shuffle(queries)
+        return queries