fix: resolve formatting, regression logic, and pytest defaults

- Run ruff format on all benchmark files (fixes CI lint job)
- Fix check_regression() substring ambiguity: ordered keyword matching so
  "latency_improvement_pct" is correctly classified as higher-is-better
- Update stale comments in conftest.py referencing wrong fixture
- Add pytest addopts to skip benchmark/slow/stress markers by default
2026-04-08 10:56:39 -03:00
parent 7e4db33061
commit ebc26f3960
12 changed files with 383 additions and 138 deletions
@@ -65,6 +65,7 @@ quote-style = "double"
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 pythonpath = ["."]
+addopts = "-m 'not benchmark and not slow and not stress'"
 markers = [
    "benchmark: scale/performance benchmark tests",
    "slow: tests that take more than 30 seconds",
@@ -96,8 +96,7 @@ def pytest_terminal_summary(terminalreporter, config):
    if not report_path:
        return

-    # Collect results from the session fixture if available
-    # The results are written by individual tests via bench_results fixture
+    # Collect results written by individual tests via record_metric()
    import platform
    import subprocess

@@ -129,7 +128,7 @@ def pytest_terminal_summary(terminalreporter, config):
        "results": {},
    }

-    # Read results from a temp file written by the bench_results fixture
+    # Read results from the temp file written by record_metric()
    results_file = os.path.join(tempfile.gettempdir(), "mempalace_bench_results.json")
    if os.path.exists(results_file):
        try:
@@ -20,42 +20,150 @@ import yaml
 # ── Scale configurations ─────────────────────────────────────────────────

 SCALE_CONFIGS = {
-    "small": {"drawers": 1_000, "wings": 3, "rooms_per_wing": 5, "kg_entities": 50, "kg_triples": 200, "needles": 20, "search_queries": 20},
-    "medium": {"drawers": 10_000, "wings": 8, "rooms_per_wing": 12, "kg_entities": 200, "kg_triples": 2_000, "needles": 50, "search_queries": 50},
-    "large": {"drawers": 50_000, "wings": 15, "rooms_per_wing": 20, "kg_entities": 500, "kg_triples": 10_000, "needles": 100, "search_queries": 100},
-    "stress": {"drawers": 100_000, "wings": 25, "rooms_per_wing": 30, "kg_entities": 1_000, "kg_triples": 50_000, "needles": 200, "search_queries": 200},
+    "small": {
+        "drawers": 1_000,
+        "wings": 3,
+        "rooms_per_wing": 5,
+        "kg_entities": 50,
+        "kg_triples": 200,
+        "needles": 20,
+        "search_queries": 20,
+    },
+    "medium": {
+        "drawers": 10_000,
+        "wings": 8,
+        "rooms_per_wing": 12,
+        "kg_entities": 200,
+        "kg_triples": 2_000,
+        "needles": 50,
+        "search_queries": 50,
+    },
+    "large": {
+        "drawers": 50_000,
+        "wings": 15,
+        "rooms_per_wing": 20,
+        "kg_entities": 500,
+        "kg_triples": 10_000,
+        "needles": 100,
+        "search_queries": 100,
+    },
+    "stress": {
+        "drawers": 100_000,
+        "wings": 25,
+        "rooms_per_wing": 30,
+        "kg_entities": 1_000,
+        "kg_triples": 50_000,
+        "needles": 200,
+        "search_queries": 200,
+    },
 }

 # ── Vocabulary banks for realistic content ───────────────────────────────

 WING_NAMES = [
-    "webapp", "backend_api", "mobile_app", "data_pipeline", "ml_platform",
-    "devops", "auth_service", "payments", "analytics", "docs_site",
-    "cli_tool", "dashboard", "notification_service", "search_engine",
-    "user_mgmt", "inventory", "reporting", "testing_infra", "monitoring",
-    "email_service", "chat_bot", "file_storage", "scheduler", "gateway",
+    "webapp",
+    "backend_api",
+    "mobile_app",
+    "data_pipeline",
+    "ml_platform",
+    "devops",
+    "auth_service",
+    "payments",
+    "analytics",
+    "docs_site",
+    "cli_tool",
+    "dashboard",
+    "notification_service",
+    "search_engine",
+    "user_mgmt",
+    "inventory",
+    "reporting",
+    "testing_infra",
+    "monitoring",
+    "email_service",
+    "chat_bot",
+    "file_storage",
+    "scheduler",
+    "gateway",
    "marketplace",
 ]

 ROOM_NAMES = [
-    "backend", "frontend", "api", "database", "auth", "tests", "docs",
-    "config", "deployment", "models", "views", "controllers", "middleware",
-    "utils", "schemas", "migrations", "fixtures", "scripts", "styles",
-    "components", "hooks", "services", "routes", "templates", "static",
-    "media", "logging", "cache", "queue", "workers",
+    "backend",
+    "frontend",
+    "api",
+    "database",
+    "auth",
+    "tests",
+    "docs",
+    "config",
+    "deployment",
+    "models",
+    "views",
+    "controllers",
+    "middleware",
+    "utils",
+    "schemas",
+    "migrations",
+    "fixtures",
+    "scripts",
+    "styles",
+    "components",
+    "hooks",
+    "services",
+    "routes",
+    "templates",
+    "static",
+    "media",
+    "logging",
+    "cache",
+    "queue",
+    "workers",
 ]

 TECH_TERMS = [
-    "authentication", "authorization", "middleware", "endpoint", "REST API",
-    "GraphQL", "WebSocket", "database migration", "ORM", "query optimization",
-    "caching strategy", "load balancer", "rate limiting", "pagination",
-    "serialization", "validation", "error handling", "logging framework",
-    "monitoring", "deployment pipeline", "CI/CD", "containerization",
-    "microservice", "event sourcing", "message queue", "pub/sub",
-    "connection pooling", "session management", "token refresh", "CORS",
-    "SSL termination", "health check", "circuit breaker", "retry logic",
-    "batch processing", "stream processing", "data pipeline", "ETL",
-    "feature flag", "A/B testing", "blue-green deployment", "canary release",
+    "authentication",
+    "authorization",
+    "middleware",
+    "endpoint",
+    "REST API",
+    "GraphQL",
+    "WebSocket",
+    "database migration",
+    "ORM",
+    "query optimization",
+    "caching strategy",
+    "load balancer",
+    "rate limiting",
+    "pagination",
+    "serialization",
+    "validation",
+    "error handling",
+    "logging framework",
+    "monitoring",
+    "deployment pipeline",
+    "CI/CD",
+    "containerization",
+    "microservice",
+    "event sourcing",
+    "message queue",
+    "pub/sub",
+    "connection pooling",
+    "session management",
+    "token refresh",
+    "CORS",
+    "SSL termination",
+    "health check",
+    "circuit breaker",
+    "retry logic",
+    "batch processing",
+    "stream processing",
+    "data pipeline",
+    "ETL",
+    "feature flag",
+    "A/B testing",
+    "blue-green deployment",
+    "canary release",
 ]

 CODE_SNIPPETS = [
@@ -75,17 +183,51 @@ PROSE_TEMPLATES = [
 ]

 ENTITY_NAMES = [
-    "Alice", "Bob", "Carol", "Dave", "Eve", "Frank", "Grace", "Heidi",
-    "Ivan", "Judy", "Karl", "Linda", "Mike", "Nina", "Oscar", "Pat",
-    "Quinn", "Rita", "Steve", "Tina", "Ursula", "Victor", "Wendy", "Xander",
+    "Alice",
+    "Bob",
+    "Carol",
+    "Dave",
+    "Eve",
+    "Frank",
+    "Grace",
+    "Heidi",
+    "Ivan",
+    "Judy",
+    "Karl",
+    "Linda",
+    "Mike",
+    "Nina",
+    "Oscar",
+    "Pat",
+    "Quinn",
+    "Rita",
+    "Steve",
+    "Tina",
+    "Ursula",
+    "Victor",
+    "Wendy",
+    "Xander",
 ]

 ENTITY_TYPES = ["person", "project", "tool", "concept", "team", "service"]

 PREDICATES = [
-    "works_on", "manages", "reports_to", "collaborates_with", "created",
-    "maintains", "uses", "depends_on", "replaced", "reviewed", "deployed",
-    "tested", "documented", "mentors", "leads", "contributes_to",
+    "works_on",
+    "manages",
+    "reports_to",
+    "collaborates_with",
+    "created",
+    "maintains",
+    "uses",
+    "depends_on",
+    "replaced",
+    "reviewed",
+    "deployed",
+    "tested",
+    "documented",
+    "mentors",
+    "leads",
+    "contributes_to",
 ]


@@ -136,13 +278,19 @@ class PalaceDataGenerator:
            room = self.rng.choice(self.rooms_by_wing[wing])
            needle_id = f"NEEDLE_{i:04d}"
            content = f"{needle_id}: {topic}. This is a unique planted needle for recall benchmarking at scale."
-            self.needles.append({
+            self.needles.append(
+                {
                    "id": needle_id,
                    "content": content,
                    "wing": wing,
                    "room": room,
-                "query": topic.split(" uses ")[0] if " uses " in topic else topic.split(" set to ")[0] if " set to " in topic else topic[:60],
-            })
+                    "query": topic.split(" uses ")[0]
+                    if " uses " in topic
+                    else topic.split(" set to ")[0]
+                    if " set to " in topic
+                    else topic[:60],
+                }
+            )

    def _random_text(self, min_chars=600, max_chars=900):
        """Generate a random text block of realistic content."""
@@ -159,21 +307,25 @@ class PalaceDataGenerator:
                    component=self.rng.choice(ROOM_NAMES),
                    task=self.rng.choice(TECH_TERMS),
                    month=self.rng.choice(["January", "February", "March", "April", "May"]),
-                    quality=self.rng.choice(["performance", "readability", "test coverage", "latency"]),
+                    quality=self.rng.choice(
+                        ["performance", "readability", "test coverage", "latency"]
+                    ),
                    decision=self.rng.choice(TECH_TERMS),
                    condition=self.rng.choice(TECH_TERMS) + " is null",
                    cause=self.rng.choice(["race condition", "null pointer", "timeout", "OOM"]),
                    fix="adding " + self.rng.choice(TECH_TERMS),
                    test_file=f"test_{self.rng.choice(ROOM_NAMES)}.py",
                    old_tech=self.rng.choice(["MySQL", "Flask", "REST", "Jenkins"]),
-                    new_tech=self.rng.choice(["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]),
+                    new_tech=self.rng.choice(
+                        ["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]
+                    ),
                    reason=self.rng.choice(TECH_TERMS),
-                    date=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
+                    date=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
                    percent=self.rng.randint(10, 80),
                    topic=self.rng.choice(TECH_TERMS),
                    person=self.rng.choice(ENTITY_NAMES),
                    action=self.rng.choice(["refactor", "migrate", "optimize", "test"]),
-                    deadline=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
+                    deadline=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
                    followup=self.rng.choice(TECH_TERMS),
                    feature_name=self.rng.choice(TECH_TERMS),
                    capability=self.rng.choice(TECH_TERMS),
@@ -182,7 +334,12 @@ class PalaceDataGenerator:
                )
            else:
                words = self.rng.sample(TECH_TERMS, min(5, len(TECH_TERMS)))
-                text = " ".join(words) + ". " + self.rng.choice(TECH_TERMS) + " implementation details follow.\n"
+                text = (
+                    " ".join(words)
+                    + ". "
+                    + self.rng.choice(TECH_TERMS)
+                    + " implementation details follow.\n"
+                )
            parts.append(text)
            total += len(text)
        return "\n".join(parts)[:max_chars]
@@ -270,15 +427,24 @@ class PalaceDataGenerator:
                needle_id = f"drawer_{needle['wing']}_{needle['room']}_{hashlib.md5(needle['id'].encode()).hexdigest()[:16]}"
                docs.append(needle["content"])
                ids.append(needle_id)
-                metas.append({
+                metas.append(
+                    {
                        "wing": needle["wing"],
                        "room": needle["room"],
                        "source_file": f"needle_{needle['id']}.txt",
                        "chunk_index": 0,
                        "added_by": "benchmark",
                        "filed_at": datetime.now().isoformat(),
-                })
-                needle_info.append({"id": needle_id, "query": needle["query"], "wing": needle["wing"], "room": needle["room"]})
+                    }
+                )
+                needle_info.append(
+                    {
+                        "id": needle_id,
+                        "query": needle["query"],
+                        "wing": needle["wing"],
+                        "room": needle["room"],
+                    }
+                )

        # Fill remaining drawers with realistic content
        remaining = n_drawers - len(docs)
@@ -291,14 +457,16 @@ class PalaceDataGenerator:

            docs.append(content)
            ids.append(drawer_id)
-            metas.append({
+            metas.append(
+                {
                    "wing": wing,
                    "room": room,
                    "source_file": f"generated_{i:06d}.txt",
                    "chunk_index": i % 10,
                    "added_by": "benchmark",
                    "filed_at": datetime.now().isoformat(),
-            })
+                }
+            )

            # Flush in batches
            if len(docs) >= batch_size:
@@ -351,7 +519,9 @@ class PalaceDataGenerator:
            valid_to = None
            if self.rng.random() < 0.3:
                end_offset = self.rng.randint(30, 365)
-                valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime("%Y-%m-%d")
+                valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime(
+                    "%Y-%m-%d"
+                )
            triples.append((subject, predicate, obj, valid_from, valid_to))

        return entities, triples
@@ -371,24 +541,28 @@ class PalaceDataGenerator:
        # Half are needle queries (known-good answers)
        n_needle = min(n_queries // 2, len(self.needles))
        for needle in self.needles[:n_needle]:
-            queries.append({
+            queries.append(
+                {
                    "query": needle["query"],
                    "expected_wing": needle["wing"],
                    "expected_room": needle["room"],
                    "needle_id": needle["id"],
                    "is_needle": True,
-            })
+                }
+            )

        # Other half are generic queries (measure latency, not recall)
        n_generic = n_queries - n_needle
        for _ in range(n_generic):
-            queries.append({
+            queries.append(
+                {
                    "query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS),
                    "expected_wing": None,
                    "expected_room": None,
                    "needle_id": None,
                    "is_needle": False,
-            })
+                }
+            )

        self.rng.shuffle(queries)
        return queries
@@ -45,16 +45,45 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
        baseline = json.load(f)

    regressions = []
-    # Metrics where HIGHER is worse (latency, memory, etc.)
-    higher_is_worse = {
-        "latency", "rss", "memory", "oom", "lock_failures", "elapsed",
-        "p50_ms", "p95_ms", "p99_ms", "rss_delta_mb", "peak_rss_mb",
-    }
-    # Metrics where LOWER is worse (throughput, recall, etc.)
-    lower_is_worse = {
-        "recall", "throughput", "per_sec", "files_per_sec", "drawers_per_sec",
-        "triples_per_sec", "improvement",
-    }
+    # Keywords for metric direction. Every higher-is-better keyword is tried
+    # before any higher-is-worse keyword, so composite names such as
+    # "latency_improvement_pct" are classified as higher-is-better.
+    _higher_is_better_kw = [
+        "improvement",
+        "recall",
+        "throughput",
+        "per_sec",
+        "files_per_sec",
+        "drawers_per_sec",
+        "triples_per_sec",
+        "speedup",
+    ]
+    _higher_is_worse_kw = [
+        "latency",
+        "rss",
+        "memory",
+        "oom",
+        "lock_failures",
+        "elapsed",
+        "p50_ms",
+        "p95_ms",
+        "p99_ms",
+        "rss_delta_mb",
+        "peak_rss_mb",
+        "errors",
+        "failures",
+    ]
+
+    def _metric_direction(name: str) -> str:
+        """Return 'higher_better', 'higher_worse', or 'unknown'."""
+        low = name.lower()
+        for kw in _higher_is_better_kw:
+            if kw in low:
+                return "higher_better"
+        for kw in _higher_is_worse_kw:
+            if kw in low:
+                return "higher_worse"
+        return "unknown"

    for category in baseline.get("results", {}):
        if category not in current.get("results", {}):
@@ -68,23 +97,21 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
            if base_val == 0:
                continue

-            # Determine direction
-            is_latency_like = any(kw in metric.lower() for kw in higher_is_worse)
-            is_throughput_like = any(kw in metric.lower() for kw in lower_is_worse)
+            direction = _metric_direction(metric)

-            if is_latency_like:
+            if direction == "higher_worse":
                # Higher is worse — check if current exceeds baseline by threshold
                if curr_val > base_val * (1 + threshold):
                    pct = ((curr_val - base_val) / base_val) * 100
                    regressions.append(
-                        f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)"
+                        f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
                    )
-            elif is_throughput_like:
+            elif direction == "higher_better":
                # Lower is worse — check if current is below baseline by threshold
                if curr_val < base_val * (1 - threshold):
                    pct = ((curr_val - base_val) / base_val) * 100
                    regressions.append(
-                        f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)"
+                        f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
                    )

    return regressions
@@ -143,7 +143,10 @@ class TestBulkInsertPerformance:
            batch_end = min(batch_start + batch_size, n_docs)
            batch_docs = contents[batch_start:batch_end]
            batch_ids = [f"batch_{i}" for i in range(batch_start, batch_end)]
-            batch_metas = [{"wing": "test", "room": "bench", "chunk_index": i} for i in range(batch_start, batch_end)]
+            batch_metas = [
+                {"wing": "test", "room": "bench", "chunk_index": i}
+                for i in range(batch_start, batch_end)
+            ]
            col_batch.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
        batched_ms = (time.perf_counter() - start) * 1000

@@ -125,7 +125,9 @@ class TestChunkThroughput:
        chunks_per_sec = total_chunks / max(elapsed, 0.001)
        kb_per_sec = (len(content) * n_iterations / 1024) / max(elapsed, 0.001)

-        record_metric("chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1))
+        record_metric(
+            "chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1)
+        )
        record_metric("chunking", f"kb_per_sec_at_{content_size_kb}kb", round(kb_per_sec, 1))


@@ -160,4 +162,8 @@ class TestReingestSkipOverhead:

        record_metric("reingest", "skip_check_elapsed_sec", round(skip_elapsed, 2))
        record_metric("reingest", "files_checked", files_written)
-        record_metric("reingest", "skip_check_per_file_ms", round(skip_elapsed * 1000 / max(files_written, 1), 1))
+        record_metric(
+            "reingest",
+            "skip_check_per_file_ms",
+            round(skip_elapsed * 1000 / max(files_written, 1), 1),
+        )
@@ -36,9 +36,7 @@ class TestTripleInsertionRate:
        # Measure triple insertion
        start = time.perf_counter()
        for subject, predicate, obj, valid_from, valid_to in triples:
-            kg.add_triple(
-                subject, predicate, obj, valid_from=valid_from, valid_to=valid_to
-            )
+            kg.add_triple(subject, predicate, obj, valid_from=valid_from, valid_to=valid_to)
        elapsed = time.perf_counter() - start

        triples_per_sec = n_triples / max(elapsed, 0.001)
@@ -128,7 +126,9 @@ class TestTemporalQueryAccuracy:
        kg.add_entity("ProjectB", "project")

        # Alice worked on ProjectA from 2024-01 to 2024-06
-        kg.add_triple("Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30")
+        kg.add_triple(
+            "Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30"
+        )
        # Alice worked on ProjectB from 2024-07 onwards
        kg.add_triple("Alice", "works_on", "ProjectB", valid_from="2024-07-01")

@@ -145,8 +145,16 @@ class TestTemporalQueryAccuracy:
        # Query Alice as of September 2024 — should find ProjectB
        result_sept = kg.query_entity("Alice", as_of="2024-09-15")

-        record_metric("kg_temporal", "march_query_results", len(result_march) if isinstance(result_march, list) else 0)
-        record_metric("kg_temporal", "sept_query_results", len(result_sept) if isinstance(result_sept, list) else 0)
+        record_metric(
+            "kg_temporal",
+            "march_query_results",
+            len(result_march) if isinstance(result_march, list) else 0,
+        )
+        record_metric(
+            "kg_temporal",
+            "sept_query_results",
+            len(result_sept) if isinstance(result_sept, list) else 0,
+        )


@pytest.mark.benchmark
@@ -230,7 +238,9 @@ class TestSQLiteConcurrentAccess:
            fails = 0
            for i in range(50):
                try:
-                    kg.add_triple(f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01")
+                    kg.add_triple(
+                        f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01"
+                    )
                except Exception:
                    fails += 1
            write_errors.append(fails)
@@ -116,7 +116,9 @@ class TestLayer1UnboundedFetch:
        record_metric("layer1_filter", "unfiltered_ms", round(unfiltered_ms, 1))
        record_metric("layer1_filter", "filtered_ms", round(filtered_ms, 1))
        if unfiltered_ms > 0:
-            record_metric("layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1))
+            record_metric(
+                "layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1)
+            )


@pytest.mark.benchmark
@@ -146,7 +148,9 @@ class TestWakeUpTokenBudget:
        record_metric("wakeup_budget", f"tokens_at_{n_drawers}", token_estimate)
        record_metric("wakeup_budget", f"chars_at_{n_drawers}", len(text))

-        assert token_estimate < 1200, f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers"
+        assert token_estimate < 1200, (
+            f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers"
+        )


@pytest.mark.benchmark
@@ -63,7 +63,9 @@ class TestSearchMemoryProfile:
        record_metric("memory_search", "rss_end_mb", round(end_rss, 2))
        record_metric("memory_search", "rss_growth_mb", round(growth, 2))
        record_metric("memory_search", "n_calls", n_calls)
-        record_metric("memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2))
+        record_metric(
+            "memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2)
+        )


@pytest.mark.benchmark
@@ -166,11 +168,13 @@ class TestHeapSnapshot:
        stats = snap_after.compare_to(snap_before, "lineno")
        top_allocators = []
        for stat in stats[:10]:
-            top_allocators.append({
+            top_allocators.append(
+                {
                    "file": str(stat.traceback),
                    "size_kb": round(stat.size / 1024, 1),
                    "count": stat.count,
-            })
+                }
+            )

        total_growth_kb = sum(s["size_kb"] for s in top_allocators)
        record_metric("heap_search", "top_10_growth_kb", round(total_growth_kb, 1))
@@ -123,8 +123,12 @@ class TestFilterLatencyBenefit:
        record_metric("filter_latency", "avg_wing_filtered_ms", round(avg_wing, 1))
        record_metric("filter_latency", "avg_room_filtered_ms", round(avg_room, 1))
        if avg_none > 0:
-            record_metric("filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1))
-            record_metric("filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1))
+            record_metric(
+                "filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1)
+            )
+            record_metric(
+                "filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1)
+            )


@pytest.mark.benchmark
@@ -61,14 +61,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10):
        drawer_id = f"drawer_single_room_{hashlib.md5(needle_id.encode()).hexdigest()[:16]}"
        docs.append(content)
        ids.append(drawer_id)
-        metas.append({
+        metas.append(
+            {
                "wing": "concentrated",
                "room": "single_room",
                "source_file": f"needle_{i}.txt",
                "chunk_index": 0,
                "added_by": "threshold_bench",
                "filed_at": datetime.now().isoformat(),
-        })
+            }
+        )

    # Fill with noise — all in the SAME room
    remaining = n_drawers - len(docs)
@@ -77,14 +79,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10):
        drawer_id = f"drawer_single_room_{hashlib.md5(f'noise_{i}'.encode()).hexdigest()[:16]}"
        docs.append(content)
        ids.append(drawer_id)
-        metas.append({
+        metas.append(
+            {
                "wing": "concentrated",
                "room": "single_room",
                "source_file": f"noise_{i:06d}.txt",
                "chunk_index": i % 10,
                "added_by": "threshold_bench",
                "filed_at": datetime.now().isoformat(),
-        })
+            }
+        )

        if len(docs) >= batch_size:
            col.add(documents=docs, ids=ids, metadatas=metas)
@@ -77,9 +77,7 @@ class TestSearchRecallAtScale:
        total_needle_queries = min(10, len(needle_info))

        for needle in needle_info[:total_needle_queries]:
-            result = search_memories(
-                needle["query"], palace_path=palace_path, n_results=10
-            )
+            result = search_memories(needle["query"], palace_path=palace_path, n_results=10)
            if "error" in result:
                continue

@@ -150,8 +148,12 @@ class TestSearchFilteredVsUnfiltered:
        record_metric("search_filter", "avg_unfiltered_ms", round(avg_unfiltered, 1))
        record_metric("search_filter", "avg_filtered_ms", round(avg_filtered, 1))
        record_metric("search_filter", "latency_improvement_pct", round(latency_improvement, 1))
-        record_metric("search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3))
-        record_metric("search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3))
+        record_metric(
+            "search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3)
+        )
+        record_metric(
+            "search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3)
+        )


@pytest.mark.benchmark
@@ -167,9 +169,16 @@ class TestConcurrentSearch:
        from mempalace.searcher import search_memories

        queries = [
-            "authentication", "database", "deployment", "error handling",
-            "testing", "monitoring", "caching", "middleware",
-            "serialization", "validation",
+            "authentication",
+            "database",
+            "deployment",
+            "error handling",
+            "testing",
+            "monitoring",
+            "caching",
+            "middleware",
+            "serialization",
+            "validation",
        ] * 3  # 30 total queries

        def run_search(query):