fix: resolve formatting, regression logic, and pytest defaults

- Run ruff format on all benchmark files (fixes CI lint job)
- Fix check_regression() substring ambiguity: ordered keyword matching so
  "latency_improvement_pct" is correctly classified as higher-is-better
- Update stale comments in conftest.py referencing wrong fixture
- Add pytest addopts to skip benchmark/slow/stress markers by default
2026-04-08 10:56:39 -03:00
parent 7e4db33061
commit ebc26f3960
12 changed files with 383 additions and 138 deletions
@@ -65,6 +65,7 @@ quote-style = "double"
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 pythonpath = ["."]
 addopts = "-m 'not benchmark and not slow and not stress'"
 markers = [
    "benchmark: scale/performance benchmark tests",
    "slow: tests that take more than 30 seconds",
@@ -96,8 +96,7 @@ def pytest_terminal_summary(terminalreporter, config):
    if not report_path:
        return
-    # Collect results from the session fixture if available
+    # Collect results written by individual tests via record_metric()
    # The results are written by individual tests via bench_results fixture
    import platform
    import subprocess
@@ -129,7 +128,7 @@ def pytest_terminal_summary(terminalreporter, config):
        "results": {},
    }
-    # Read results from a temp file written by the bench_results fixture
+    # Read results from the temp file written by record_metric()
    results_file = os.path.join(tempfile.gettempdir(), "mempalace_bench_results.json")
    if os.path.exists(results_file):
        try:
@@ -20,42 +20,150 @@ import yaml
 # ── Scale configurations ─────────────────────────────────────────────────
 SCALE_CONFIGS = {
-    "small": {"drawers": 1_000, "wings": 3, "rooms_per_wing": 5, "kg_entities": 50, "kg_triples": 200, "needles": 20, "search_queries": 20},
+    "small": {
-    "medium": {"drawers": 10_000, "wings": 8, "rooms_per_wing": 12, "kg_entities": 200, "kg_triples": 2_000, "needles": 50, "search_queries": 50},
+        "drawers": 1_000,
-    "large": {"drawers": 50_000, "wings": 15, "rooms_per_wing": 20, "kg_entities": 500, "kg_triples": 10_000, "needles": 100, "search_queries": 100},
+        "wings": 3,
-    "stress": {"drawers": 100_000, "wings": 25, "rooms_per_wing": 30, "kg_entities": 1_000, "kg_triples": 50_000, "needles": 200, "search_queries": 200},
+        "rooms_per_wing": 5,
        "kg_entities": 50,
        "kg_triples": 200,
        "needles": 20,
        "search_queries": 20,
    },
    "medium": {
        "drawers": 10_000,
        "wings": 8,
        "rooms_per_wing": 12,
        "kg_entities": 200,
        "kg_triples": 2_000,
        "needles": 50,
        "search_queries": 50,
    },
    "large": {
        "drawers": 50_000,
        "wings": 15,
        "rooms_per_wing": 20,
        "kg_entities": 500,
        "kg_triples": 10_000,
        "needles": 100,
        "search_queries": 100,
    },
    "stress": {
        "drawers": 100_000,
        "wings": 25,
        "rooms_per_wing": 30,
        "kg_entities": 1_000,
        "kg_triples": 50_000,
        "needles": 200,
        "search_queries": 200,
    },
 }
 # ── Vocabulary banks for realistic content ───────────────────────────────
 WING_NAMES = [
-    "webapp", "backend_api", "mobile_app", "data_pipeline", "ml_platform",
+    "webapp",
-    "devops", "auth_service", "payments", "analytics", "docs_site",
+    "backend_api",
-    "cli_tool", "dashboard", "notification_service", "search_engine",
+    "mobile_app",
-    "user_mgmt", "inventory", "reporting", "testing_infra", "monitoring",
+    "data_pipeline",
-    "email_service", "chat_bot", "file_storage", "scheduler", "gateway",
+    "ml_platform",
    "devops",
    "auth_service",
    "payments",
    "analytics",
    "docs_site",
    "cli_tool",
    "dashboard",
    "notification_service",
    "search_engine",
    "user_mgmt",
    "inventory",
    "reporting",
    "testing_infra",
    "monitoring",
    "email_service",
    "chat_bot",
    "file_storage",
    "scheduler",
    "gateway",
    "marketplace",
 ]
 ROOM_NAMES = [
-    "backend", "frontend", "api", "database", "auth", "tests", "docs",
+    "backend",
-    "config", "deployment", "models", "views", "controllers", "middleware",
+    "frontend",
-    "utils", "schemas", "migrations", "fixtures", "scripts", "styles",
+    "api",
-    "components", "hooks", "services", "routes", "templates", "static",
+    "database",
-    "media", "logging", "cache", "queue", "workers",
+    "auth",
    "tests",
    "docs",
    "config",
    "deployment",
    "models",
    "views",
    "controllers",
    "middleware",
    "utils",
    "schemas",
    "migrations",
    "fixtures",
    "scripts",
    "styles",
    "components",
    "hooks",
    "services",
    "routes",
    "templates",
    "static",
    "media",
    "logging",
    "cache",
    "queue",
    "workers",
 ]
 TECH_TERMS = [
-    "authentication", "authorization", "middleware", "endpoint", "REST API",
+    "authentication",
-    "GraphQL", "WebSocket", "database migration", "ORM", "query optimization",
+    "authorization",
-    "caching strategy", "load balancer", "rate limiting", "pagination",
+    "middleware",
-    "serialization", "validation", "error handling", "logging framework",
+    "endpoint",
-    "monitoring", "deployment pipeline", "CI/CD", "containerization",
+    "REST API",
-    "microservice", "event sourcing", "message queue", "pub/sub",
+    "GraphQL",
-    "connection pooling", "session management", "token refresh", "CORS",
+    "WebSocket",
-    "SSL termination", "health check", "circuit breaker", "retry logic",
+    "database migration",
-    "batch processing", "stream processing", "data pipeline", "ETL",
+    "ORM",
-    "feature flag", "A/B testing", "blue-green deployment", "canary release",
+    "query optimization",
    "caching strategy",
    "load balancer",
    "rate limiting",
    "pagination",
    "serialization",
    "validation",
    "error handling",
    "logging framework",
    "monitoring",
    "deployment pipeline",
    "CI/CD",
    "containerization",
    "microservice",
    "event sourcing",
    "message queue",
    "pub/sub",
    "connection pooling",
    "session management",
    "token refresh",
    "CORS",
    "SSL termination",
    "health check",
    "circuit breaker",
    "retry logic",
    "batch processing",
    "stream processing",
    "data pipeline",
    "ETL",
    "feature flag",
    "A/B testing",
    "blue-green deployment",
    "canary release",
 ]
 CODE_SNIPPETS = [
@@ -75,17 +183,51 @@ PROSE_TEMPLATES = [
 ]
 ENTITY_NAMES = [
-    "Alice", "Bob", "Carol", "Dave", "Eve", "Frank", "Grace", "Heidi",
+    "Alice",
-    "Ivan", "Judy", "Karl", "Linda", "Mike", "Nina", "Oscar", "Pat",
+    "Bob",
-    "Quinn", "Rita", "Steve", "Tina", "Ursula", "Victor", "Wendy", "Xander",
+    "Carol",
    "Dave",
    "Eve",
    "Frank",
    "Grace",
    "Heidi",
    "Ivan",
    "Judy",
    "Karl",
    "Linda",
    "Mike",
    "Nina",
    "Oscar",
    "Pat",
    "Quinn",
    "Rita",
    "Steve",
    "Tina",
    "Ursula",
    "Victor",
    "Wendy",
    "Xander",
 ]
 ENTITY_TYPES = ["person", "project", "tool", "concept", "team", "service"]
 PREDICATES = [
-    "works_on", "manages", "reports_to", "collaborates_with", "created",
+    "works_on",
-    "maintains", "uses", "depends_on", "replaced", "reviewed", "deployed",
+    "manages",
-    "tested", "documented", "mentors", "leads", "contributes_to",
+    "reports_to",
    "collaborates_with",
    "created",
    "maintains",
    "uses",
    "depends_on",
    "replaced",
    "reviewed",
    "deployed",
    "tested",
    "documented",
    "mentors",
    "leads",
    "contributes_to",
 ]
@@ -136,13 +278,19 @@ class PalaceDataGenerator:
            room = self.rng.choice(self.rooms_by_wing[wing])
            needle_id = f"NEEDLE_{i:04d}"
            content = f"{needle_id}: {topic}. This is a unique planted needle for recall benchmarking at scale."
-            self.needles.append({
+            self.needles.append(
-                "id": needle_id,
+                {
-                "content": content,
+                    "id": needle_id,
-                "wing": wing,
+                    "content": content,
-                "room": room,
+                    "wing": wing,
-                "query": topic.split(" uses ")[0] if " uses " in topic else topic.split(" set to ")[0] if " set to " in topic else topic[:60],
+                    "room": room,
-            })
+                    "query": topic.split(" uses ")[0]
                    if " uses " in topic
                    else topic.split(" set to ")[0]
                    if " set to " in topic
                    else topic[:60],
                }
            )
    def _random_text(self, min_chars=600, max_chars=900):
        """Generate a random text block of realistic content."""
@@ -159,21 +307,25 @@ class PalaceDataGenerator:
                    component=self.rng.choice(ROOM_NAMES),
                    task=self.rng.choice(TECH_TERMS),
                    month=self.rng.choice(["January", "February", "March", "April", "May"]),
-                    quality=self.rng.choice(["performance", "readability", "test coverage", "latency"]),
+                    quality=self.rng.choice(
                        ["performance", "readability", "test coverage", "latency"]
                    ),
                    decision=self.rng.choice(TECH_TERMS),
                    condition=self.rng.choice(TECH_TERMS) + " is null",
                    cause=self.rng.choice(["race condition", "null pointer", "timeout", "OOM"]),
                    fix="adding " + self.rng.choice(TECH_TERMS),
                    test_file=f"test_{self.rng.choice(ROOM_NAMES)}.py",
                    old_tech=self.rng.choice(["MySQL", "Flask", "REST", "Jenkins"]),
-                    new_tech=self.rng.choice(["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]),
+                    new_tech=self.rng.choice(
                        ["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]
                    ),
                    reason=self.rng.choice(TECH_TERMS),
-                    date=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
+                    date=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
                    percent=self.rng.randint(10, 80),
                    topic=self.rng.choice(TECH_TERMS),
                    person=self.rng.choice(ENTITY_NAMES),
                    action=self.rng.choice(["refactor", "migrate", "optimize", "test"]),
-                    deadline=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
+                    deadline=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
                    followup=self.rng.choice(TECH_TERMS),
                    feature_name=self.rng.choice(TECH_TERMS),
                    capability=self.rng.choice(TECH_TERMS),
@@ -182,7 +334,12 @@ class PalaceDataGenerator:
                )
            else:
                words = self.rng.sample(TECH_TERMS, min(5, len(TECH_TERMS)))
-                text = " ".join(words) + ". " + self.rng.choice(TECH_TERMS) + " implementation details follow.\n"
+                text = (
                    " ".join(words)
                    + ". "
                    + self.rng.choice(TECH_TERMS)
                    + " implementation details follow.\n"
                )
            parts.append(text)
            total += len(text)
        return "\n".join(parts)[:max_chars]
@@ -270,15 +427,24 @@ class PalaceDataGenerator:
                needle_id = f"drawer_{needle['wing']}_{needle['room']}_{hashlib.md5(needle['id'].encode()).hexdigest()[:16]}"
                docs.append(needle["content"])
                ids.append(needle_id)
-                metas.append({
+                metas.append(
-                    "wing": needle["wing"],
+                    {
-                    "room": needle["room"],
+                        "wing": needle["wing"],
-                    "source_file": f"needle_{needle['id']}.txt",
+                        "room": needle["room"],
-                    "chunk_index": 0,
+                        "source_file": f"needle_{needle['id']}.txt",
-                    "added_by": "benchmark",
+                        "chunk_index": 0,
-                    "filed_at": datetime.now().isoformat(),
+                        "added_by": "benchmark",
-                })
+                        "filed_at": datetime.now().isoformat(),
-                needle_info.append({"id": needle_id, "query": needle["query"], "wing": needle["wing"], "room": needle["room"]})
+                    }
                )
                needle_info.append(
                    {
                        "id": needle_id,
                        "query": needle["query"],
                        "wing": needle["wing"],
                        "room": needle["room"],
                    }
                )
        # Fill remaining drawers with realistic content
        remaining = n_drawers - len(docs)
@@ -291,14 +457,16 @@ class PalaceDataGenerator:
            docs.append(content)
            ids.append(drawer_id)
-            metas.append({
+            metas.append(
-                "wing": wing,
+                {
-                "room": room,
+                    "wing": wing,
-                "source_file": f"generated_{i:06d}.txt",
+                    "room": room,
-                "chunk_index": i % 10,
+                    "source_file": f"generated_{i:06d}.txt",
-                "added_by": "benchmark",
+                    "chunk_index": i % 10,
-                "filed_at": datetime.now().isoformat(),
+                    "added_by": "benchmark",
-            })
+                    "filed_at": datetime.now().isoformat(),
                }
            )
            # Flush in batches
            if len(docs) >= batch_size:
@@ -351,7 +519,9 @@ class PalaceDataGenerator:
            valid_to = None
            if self.rng.random() < 0.3:
                end_offset = self.rng.randint(30, 365)
-                valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime("%Y-%m-%d")
+                valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime(
                    "%Y-%m-%d"
                )
            triples.append((subject, predicate, obj, valid_from, valid_to))
        return entities, triples
@@ -371,24 +541,28 @@ class PalaceDataGenerator:
        # Half are needle queries (known-good answers)
        n_needle = min(n_queries // 2, len(self.needles))
        for needle in self.needles[:n_needle]:
-            queries.append({
+            queries.append(
-                "query": needle["query"],
+                {
-                "expected_wing": needle["wing"],
+                    "query": needle["query"],
-                "expected_room": needle["room"],
+                    "expected_wing": needle["wing"],
-                "needle_id": needle["id"],
+                    "expected_room": needle["room"],
-                "is_needle": True,
+                    "needle_id": needle["id"],
-            })
+                    "is_needle": True,
                }
            )
        # Other half are generic queries (measure latency, not recall)
        n_generic = n_queries - n_needle
        for _ in range(n_generic):
-            queries.append({
+            queries.append(
-                "query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS),
+                {
-                "expected_wing": None,
+                    "query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS),
-                "expected_room": None,
+                    "expected_wing": None,
-                "needle_id": None,
+                    "expected_room": None,
-                "is_needle": False,
+                    "needle_id": None,
-            })
+                    "is_needle": False,
                }
            )
        self.rng.shuffle(queries)
        return queries
@@ -45,16 +45,45 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
        baseline = json.load(f)
    regressions = []
-    # Metrics where HIGHER is worse (latency, memory, etc.)
+    # Keywords for metric direction — checked in order, first match wins.
-    higher_is_worse = {
+    # "improvement" is checked before "latency" so that composite names
-        "latency", "rss", "memory", "oom", "lock_failures", "elapsed",
+    # like "latency_improvement_pct" are classified correctly.
-        "p50_ms", "p95_ms", "p99_ms", "rss_delta_mb", "peak_rss_mb",
+    _higher_is_better_kw = [
-    }
+        "improvement",
-    # Metrics where LOWER is worse (throughput, recall, etc.)
+        "recall",
-    lower_is_worse = {
+        "throughput",
-        "recall", "throughput", "per_sec", "files_per_sec", "drawers_per_sec",
+        "per_sec",
-        "triples_per_sec", "improvement",
+        "files_per_sec",
-    }
+        "drawers_per_sec",
        "triples_per_sec",
        "speedup",
    ]
    _higher_is_worse_kw = [
        "latency",
        "rss",
        "memory",
        "oom",
        "lock_failures",
        "elapsed",
        "p50_ms",
        "p95_ms",
        "p99_ms",
        "rss_delta_mb",
        "peak_rss_mb",
        "errors",
        "failures",
    ]
    def _metric_direction(name: str) -> str:
        """Classify a metric name by which direction of change is a regression.

        The name is lower-cased and tested for substring matches against the
        two keyword lists from the enclosing scope. The higher-is-better
        keywords are consulted first, so a composite name that matches both
        lists (e.g. "latency_improvement_pct") is reported as higher-is-better.

        Returns:
            "higher_better" if any higher-is-better keyword occurs in the name,
            "higher_worse" if any higher-is-worse keyword occurs in the name,
            "unknown" when no keyword matches.
        """
        lowered = name.lower()
        # Precedence matters: the "better" list must win over the "worse" list.
        if any(keyword in lowered for keyword in _higher_is_better_kw):
            return "higher_better"
        if any(keyword in lowered for keyword in _higher_is_worse_kw):
            return "higher_worse"
        return "unknown"
    for category in baseline.get("results", {}):
        if category not in current.get("results", {}):
@@ -68,23 +97,21 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
            if base_val == 0:
                continue
-            # Determine direction
+            direction = _metric_direction(metric)
            is_latency_like = any(kw in metric.lower() for kw in higher_is_worse)
            is_throughput_like = any(kw in metric.lower() for kw in lower_is_worse)
-            if is_latency_like:
+            if direction == "higher_worse":
                # Higher is worse — check if current exceeds baseline by threshold
                if curr_val > base_val * (1 + threshold):
                    pct = ((curr_val - base_val) / base_val) * 100
                    regressions.append(
-                        f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)"
+                        f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
                    )
-            elif is_throughput_like:
+            elif direction == "higher_better":
                # Lower is worse — check if current is below baseline by threshold
                if curr_val < base_val * (1 - threshold):
                    pct = ((curr_val - base_val) / base_val) * 100
                    regressions.append(
-                        f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)"
+                        f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
                    )
    return regressions
@@ -143,7 +143,10 @@ class TestBulkInsertPerformance:
            batch_end = min(batch_start + batch_size, n_docs)
            batch_docs = contents[batch_start:batch_end]
            batch_ids = [f"batch_{i}" for i in range(batch_start, batch_end)]
-            batch_metas = [{"wing": "test", "room": "bench", "chunk_index": i} for i in range(batch_start, batch_end)]
+            batch_metas = [
                {"wing": "test", "room": "bench", "chunk_index": i}
                for i in range(batch_start, batch_end)
            ]
            col_batch.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
        batched_ms = (time.perf_counter() - start) * 1000
@@ -125,7 +125,9 @@ class TestChunkThroughput:
        chunks_per_sec = total_chunks / max(elapsed, 0.001)
        kb_per_sec = (len(content) * n_iterations / 1024) / max(elapsed, 0.001)
-        record_metric("chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1))
+        record_metric(
            "chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1)
        )
        record_metric("chunking", f"kb_per_sec_at_{content_size_kb}kb", round(kb_per_sec, 1))
@@ -160,4 +162,8 @@ class TestReingestSkipOverhead:
        record_metric("reingest", "skip_check_elapsed_sec", round(skip_elapsed, 2))
        record_metric("reingest", "files_checked", files_written)
-        record_metric("reingest", "skip_check_per_file_ms", round(skip_elapsed * 1000 / max(files_written, 1), 1))
+        record_metric(
            "reingest",
            "skip_check_per_file_ms",
            round(skip_elapsed * 1000 / max(files_written, 1), 1),
        )
@@ -36,9 +36,7 @@ class TestTripleInsertionRate:
        # Measure triple insertion
        start = time.perf_counter()
        for subject, predicate, obj, valid_from, valid_to in triples:
-            kg.add_triple(
+            kg.add_triple(subject, predicate, obj, valid_from=valid_from, valid_to=valid_to)
                subject, predicate, obj, valid_from=valid_from, valid_to=valid_to
            )
        elapsed = time.perf_counter() - start
        triples_per_sec = n_triples / max(elapsed, 0.001)
@@ -128,7 +126,9 @@ class TestTemporalQueryAccuracy:
        kg.add_entity("ProjectB", "project")
        # Alice worked on ProjectA from 2024-01 to 2024-06
-        kg.add_triple("Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30")
+        kg.add_triple(
            "Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30"
        )
        # Alice worked on ProjectB from 2024-07 onwards
        kg.add_triple("Alice", "works_on", "ProjectB", valid_from="2024-07-01")
@@ -145,8 +145,16 @@ class TestTemporalQueryAccuracy:
        # Query Alice as of September 2024 — should find ProjectB
        result_sept = kg.query_entity("Alice", as_of="2024-09-15")
-        record_metric("kg_temporal", "march_query_results", len(result_march) if isinstance(result_march, list) else 0)
+        record_metric(
-        record_metric("kg_temporal", "sept_query_results", len(result_sept) if isinstance(result_sept, list) else 0)
+            "kg_temporal",
            "march_query_results",
            len(result_march) if isinstance(result_march, list) else 0,
        )
        record_metric(
            "kg_temporal",
            "sept_query_results",
            len(result_sept) if isinstance(result_sept, list) else 0,
        )
@pytest.mark.benchmark
@@ -230,7 +238,9 @@ class TestSQLiteConcurrentAccess:
            fails = 0
            for i in range(50):
                try:
-                    kg.add_triple(f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01")
+                    kg.add_triple(
                        f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01"
                    )
                except Exception:
                    fails += 1
            write_errors.append(fails)
@@ -116,7 +116,9 @@ class TestLayer1UnboundedFetch:
        record_metric("layer1_filter", "unfiltered_ms", round(unfiltered_ms, 1))
        record_metric("layer1_filter", "filtered_ms", round(filtered_ms, 1))
        if unfiltered_ms > 0:
-            record_metric("layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1))
+            record_metric(
                "layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1)
            )
@pytest.mark.benchmark
@@ -146,7 +148,9 @@ class TestWakeUpTokenBudget:
        record_metric("wakeup_budget", f"tokens_at_{n_drawers}", token_estimate)
        record_metric("wakeup_budget", f"chars_at_{n_drawers}", len(text))
-        assert token_estimate < 1200, f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers"
+        assert token_estimate < 1200, (
            f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers"
        )
@pytest.mark.benchmark
@@ -63,7 +63,9 @@ class TestSearchMemoryProfile:
        record_metric("memory_search", "rss_end_mb", round(end_rss, 2))
        record_metric("memory_search", "rss_growth_mb", round(growth, 2))
        record_metric("memory_search", "n_calls", n_calls)
-        record_metric("memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2))
+        record_metric(
            "memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2)
        )
@pytest.mark.benchmark
@@ -166,11 +168,13 @@ class TestHeapSnapshot:
        stats = snap_after.compare_to(snap_before, "lineno")
        top_allocators = []
        for stat in stats[:10]:
-            top_allocators.append({
+            top_allocators.append(
-                "file": str(stat.traceback),
+                {
-                "size_kb": round(stat.size / 1024, 1),
+                    "file": str(stat.traceback),
-                "count": stat.count,
+                    "size_kb": round(stat.size / 1024, 1),
-            })
+                    "count": stat.count,
                }
            )
        total_growth_kb = sum(s["size_kb"] for s in top_allocators)
        record_metric("heap_search", "top_10_growth_kb", round(total_growth_kb, 1))
@@ -123,8 +123,12 @@ class TestFilterLatencyBenefit:
        record_metric("filter_latency", "avg_wing_filtered_ms", round(avg_wing, 1))
        record_metric("filter_latency", "avg_room_filtered_ms", round(avg_room, 1))
        if avg_none > 0:
-            record_metric("filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1))
+            record_metric(
-            record_metric("filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1))
+                "filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1)
            )
            record_metric(
                "filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1)
            )
@pytest.mark.benchmark
@@ -61,14 +61,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10):
        drawer_id = f"drawer_single_room_{hashlib.md5(needle_id.encode()).hexdigest()[:16]}"
        docs.append(content)
        ids.append(drawer_id)
-        metas.append({
+        metas.append(
-            "wing": "concentrated",
+            {
-            "room": "single_room",
+                "wing": "concentrated",
-            "source_file": f"needle_{i}.txt",
+                "room": "single_room",
-            "chunk_index": 0,
+                "source_file": f"needle_{i}.txt",
-            "added_by": "threshold_bench",
+                "chunk_index": 0,
-            "filed_at": datetime.now().isoformat(),
+                "added_by": "threshold_bench",
-        })
+                "filed_at": datetime.now().isoformat(),
            }
        )
    # Fill with noise — all in the SAME room
    remaining = n_drawers - len(docs)
@@ -77,14 +79,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10):
        drawer_id = f"drawer_single_room_{hashlib.md5(f'noise_{i}'.encode()).hexdigest()[:16]}"
        docs.append(content)
        ids.append(drawer_id)
-        metas.append({
+        metas.append(
-            "wing": "concentrated",
+            {
-            "room": "single_room",
+                "wing": "concentrated",
-            "source_file": f"noise_{i:06d}.txt",
+                "room": "single_room",
-            "chunk_index": i % 10,
+                "source_file": f"noise_{i:06d}.txt",
-            "added_by": "threshold_bench",
+                "chunk_index": i % 10,
-            "filed_at": datetime.now().isoformat(),
+                "added_by": "threshold_bench",
-        })
+                "filed_at": datetime.now().isoformat(),
            }
        )
        if len(docs) >= batch_size:
            col.add(documents=docs, ids=ids, metadatas=metas)
@@ -77,9 +77,7 @@ class TestSearchRecallAtScale:
        total_needle_queries = min(10, len(needle_info))
        for needle in needle_info[:total_needle_queries]:
-            result = search_memories(
+            result = search_memories(needle["query"], palace_path=palace_path, n_results=10)
                needle["query"], palace_path=palace_path, n_results=10
            )
            if "error" in result:
                continue
@@ -150,8 +148,12 @@ class TestSearchFilteredVsUnfiltered:
        record_metric("search_filter", "avg_unfiltered_ms", round(avg_unfiltered, 1))
        record_metric("search_filter", "avg_filtered_ms", round(avg_filtered, 1))
        record_metric("search_filter", "latency_improvement_pct", round(latency_improvement, 1))
-        record_metric("search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3))
+        record_metric(
-        record_metric("search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3))
+            "search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3)
        )
        record_metric(
            "search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3)
        )
@pytest.mark.benchmark
@@ -167,9 +169,16 @@ class TestConcurrentSearch:
        from mempalace.searcher import search_memories
        queries = [
-            "authentication", "database", "deployment", "error handling",
+            "authentication",
-            "testing", "monitoring", "caching", "middleware",
+            "database",
-            "serialization", "validation",
+            "deployment",
            "error handling",
            "testing",
            "monitoring",
            "caching",
            "middleware",
            "serialization",
            "validation",
        ] * 3  # 30 total queries
        def run_search(query):