bench: add per-room recall threshold test

Concentrates all drawers into a single wing+room to isolate the embedding model's retrieval limit independent of palace filtering. Confirms recall degrades to ~0.4-0.5 at 5K drawers per room even with wing+room filters applied — the spatial structure helps by keeping buckets small, but can't fix the underlying embedding ceiling.
2026-04-08 05:01:51 -03:00
parent 7b89291334
commit e8017ca2ec
2 changed files with 187 additions and 6 deletions
@@ -1,6 +1,6 @@
 # MemPalace Scale Benchmark Suite

-94 tests that benchmark mempalace at scale to validate real-world performance limits.
+106 tests that benchmark mempalace at scale to validate real-world performance limits.

 ## Why

@@ -11,6 +11,7 @@ MemPalace has strong academic scores (96.6% R@5 on LongMemEval) but no empirical
 - Modified files are never re-ingested — what's the skip-check cost at scale?
 - How does query latency degrade as the palace grows from 1K to 100K drawers?
 - Does wing/room filtering actually improve retrieval, and by how much?
+- At what per-room drawer count does recall break regardless of filtering?

 This suite finds those answers.

@@ -20,7 +21,7 @@ This suite finds those answers.
 # Fast smoke test (~2 min)
 uv run pytest tests/benchmarks/ -v --bench-scale=small -m "benchmark and not slow"

-# Full small scale (~30 min)
+# Full small scale (~35 min)
 uv run pytest tests/benchmarks/ -v --bench-scale=small

 # Medium scale with JSON report
@@ -61,6 +62,7 @@ uv run pytest tests/benchmarks/ -v --bench-scale=stress -m stress
 | File | What it tests |
 |------|--------------|
 | `test_palace_boost.py` | Retrieval improvement from wing/room filtering at different scales |
+| `test_recall_threshold.py` | Per-room recall ceiling — isolates embedding model limit with all drawers in one bucket |
 | `test_knowledge_graph_bench.py` | Triple insertion rate, temporal query accuracy, SQLite concurrent access |
 | `test_layers_bench.py` | MemoryStack wake-up cost, Layer1 unbounded fetch, token budget compliance |

@@ -68,10 +70,10 @@ uv run pytest tests/benchmarks/ -v --bench-scale=stress -m stress

 ```
 tests/benchmarks/
-  conftest.py          # --bench-scale / --bench-report CLI options, fixtures, markers
-  data_generator.py    # Deterministic data factory (seeded RNG, planted needles)
-  report.py            # JSON report writer + regression checker
-  test_*.py            # 8 test modules (94 tests total)
+  conftest.py              # --bench-scale / --bench-report CLI options, fixtures, markers
+  data_generator.py        # Deterministic data factory (seeded RNG, planted needles)
+  report.py                # JSON report writer + regression checker
+  test_*.py                # 9 test modules (106 tests total)
 ```

 ### Data Generator
@@ -0,0 +1,179 @@
+"""
+Recall threshold test — find the per-bucket size where retrieval breaks.
+
+The palace_boost tests showed room-filtered recall of 1.0, but only because
+each room had ~333 drawers. This test concentrates ALL drawers into a single
+wing+room to find the actual embedding model limit.
+"""
+
+import hashlib
+import os
+import time
+from datetime import datetime
+
+import chromadb
+import pytest
+
+from tests.benchmarks.data_generator import PalaceDataGenerator
+from tests.benchmarks.report import record_metric
+
+
+# Distinctive sentences used as the body of each planted needle.
+# _populate_single_room embeds NEEDLE_TOPICS[i] in the drawer whose content
+# starts with "NEEDLE_{i:04d}".
+NEEDLE_TOPICS = [
+    "Fibonacci sequence optimization uses memoization with O(n) space complexity",
+    "PostgreSQL vacuum autovacuum threshold set to 50 percent for table users",
+    "Redis cluster failover timeout configured at 30 seconds with sentinel monitoring",
+    "Kubernetes horizontal pod autoscaler targets 70 percent CPU utilization",
+    "GraphQL subscription uses WebSocket transport with heartbeat interval 25 seconds",
+    "JWT token rotation policy requires refresh every 15 minutes with sliding window",
+    "Elasticsearch index sharding strategy uses 5 primary shards with 1 replica each",
+    "Docker multi-stage build reduces image size from 1.2GB to 180MB for production",
+    "Apache Kafka consumer group rebalance timeout set to 45 seconds",
+    "MongoDB change streams resume token persisted every 100 operations",
+]
+
+# Search queries, index-aligned with NEEDLE_TOPICS: query i is made of keywords
+# taken from topic i, and the tests count a hit for query i when the string
+# "NEEDLE_{i:04d}" appears in one of the returned texts. Keep both lists the
+# same length and in the same order.
+NEEDLE_QUERIES = [
+    "Fibonacci sequence optimization memoization",
+    "PostgreSQL vacuum autovacuum threshold",
+    "Redis cluster failover timeout sentinel",
+    "Kubernetes horizontal pod autoscaler CPU",
+    "GraphQL subscription WebSocket heartbeat",
+    "JWT token rotation policy refresh",
+    "Elasticsearch index sharding primary replica",
+    "Docker multi-stage build image size production",
+    "Apache Kafka consumer group rebalance",
+    "MongoDB change streams resume token",
+]
+
+
+def _populate_single_room(palace_path, n_drawers, n_needles=10):
+    """Pack all drawers into one wing+room and plant needles among them.
+
+    Opens a persistent Chroma store at ``palace_path`` and fills the
+    ``mempalace_drawers`` collection. The first ``n_needles`` documents are
+    planted needles built from ``NEEDLE_TOPICS``; the remainder is random
+    noise text. Every document is tagged wing ``"concentrated"`` and room
+    ``"single_room"``.
+
+    Args:
+        palace_path: Directory of the persistent Chroma store; created if
+            it does not exist.
+        n_drawers: Target total number of drawers, needles included. When it
+            is smaller than ``n_needles`` only the needles are written.
+        n_needles: Number of needles to plant. Must not exceed
+            ``len(NEEDLE_TOPICS)``.
+
+    Returns:
+        Tuple ``(client, collection)`` — the Chroma client and the populated
+        collection.
+
+    Raises:
+        ValueError: If ``n_needles`` is larger than ``len(NEEDLE_TOPICS)``.
+    """
+    # Fail before touching disk instead of hitting an IndexError mid-build.
+    if n_needles > len(NEEDLE_TOPICS):
+        raise ValueError(
+            f"n_needles={n_needles} exceeds the {len(NEEDLE_TOPICS)} available NEEDLE_TOPICS"
+        )
+
+    gen = PalaceDataGenerator(seed=42, scale="small")
+    os.makedirs(palace_path, exist_ok=True)
+    client = chromadb.PersistentClient(path=palace_path)
+    col = client.get_or_create_collection("mempalace_drawers")
+
+    batch_size = 500
+    docs, ids, metas = [], [], []
+
+    def drawer_meta(source_file, chunk_index):
+        """Build the metadata dict shared by needles and noise drawers."""
+        return {
+            "wing": "concentrated",
+            "room": "single_room",
+            "source_file": source_file,
+            "chunk_index": chunk_index,
+            "added_by": "threshold_bench",
+            "filed_at": datetime.now().isoformat(),
+        }
+
+    def drawer_id_for(key):
+        """Derive a stable drawer id from ``key`` (md5 used only as a cheap hash)."""
+        return f"drawer_single_room_{hashlib.md5(key.encode()).hexdigest()[:16]}"
+
+    # Plant needles first; they are written together with the first noise batch.
+    for i in range(n_needles):
+        needle_id = f"NEEDLE_{i:04d}"
+        content = f"{needle_id}: {NEEDLE_TOPICS[i]}. Unique planted needle for threshold test."
+        docs.append(content)
+        ids.append(drawer_id_for(needle_id))
+        metas.append(drawer_meta(f"needle_{i}.txt", 0))
+
+    # Fill with noise — all in the SAME room
+    remaining = n_drawers - len(docs)
+    for i in range(remaining):
+        docs.append(gen._random_text(400, 800))
+        ids.append(drawer_id_for(f"noise_{i}"))
+        metas.append(drawer_meta(f"noise_{i:06d}.txt", i % 10))
+
+        # Write in fixed-size batches so a large run never holds every document at once.
+        if len(docs) >= batch_size:
+            col.add(documents=docs, ids=ids, metadatas=metas)
+            docs, ids, metas = [], [], []
+
+    # Write whatever is left over (this is also the needles-only case).
+    if docs:
+        col.add(documents=docs, ids=ids, metadatas=metas)
+
+    return client, col
+
+
+@pytest.mark.benchmark
+class TestRecallThresholdSingleRoom:
+    """
+    All drawers in one room — isolates the embedding model's retrieval limit.
+
+    Room filtering can't help here. This is the true ceiling.
+
+    Each test builds a fresh palace of ``n_drawers`` documents with 10 planted
+    needles, runs every query in ``NEEDLE_QUERIES`` and records recall@5 and
+    recall@10 through ``record_metric``. The tests record metrics only; they
+    do not assert a minimum recall.
+    """
+
+    SIZES = [250, 500, 1_000, 2_000, 3_000, 5_000]
+
+    @staticmethod
+    def _measure_recall(palace_path, **search_filters):
+        """Run every needle query and return ``(recall_at_5, recall_at_10)``.
+
+        Args:
+            palace_path: Path of the populated palace to search.
+            **search_filters: Extra keyword arguments forwarded unchanged to
+                ``search_memories`` (e.g. ``wing=...``, ``room=...``).
+
+        Returns:
+            Two floats in ``[0, 1]``: the fraction of queries whose needle id
+            appears in the top 5 and in the top 10 returned texts.
+        """
+        # Local import keeps mempalace out of module import time.
+        from mempalace.searcher import search_memories
+
+        hits_at_5 = 0
+        hits_at_10 = 0
+
+        for i, query in enumerate(NEEDLE_QUERIES):
+            result = search_memories(
+                query,
+                palace_path=palace_path,
+                n_results=10,
+                **search_filters,
+            )
+            # A failed search is not a recall miss: surface it instead of
+            # letting it silently drag the recorded recall down.
+            if "error" in result:
+                pytest.fail(f"search_memories failed for query {query!r}: {result['error']}")
+
+            texts = [h["text"] for h in result.get("results", [])]
+            needle_id = f"NEEDLE_{i:04d}"
+
+            # Planted content starts with its needle id, so a substring match
+            # identifies the hit.
+            if any(needle_id in t for t in texts[:5]):
+                hits_at_5 += 1
+            if any(needle_id in t for t in texts[:10]):
+                hits_at_10 += 1
+
+        n_queries = len(NEEDLE_QUERIES)
+        return hits_at_5 / n_queries, hits_at_10 / n_queries
+
+    @pytest.mark.parametrize("n_drawers", SIZES)
+    def test_single_room_recall(self, n_drawers, tmp_path):
+        """Recall@5 and @10 with all drawers in one bucket."""
+        palace_path = str(tmp_path / "palace")
+        _populate_single_room(palace_path, n_drawers, n_needles=10)
+
+        recall_5, recall_10 = self._measure_recall(
+            palace_path,
+            wing="concentrated",
+            room="single_room",
+        )
+
+        record_metric("single_room_recall", f"recall_at_5_at_{n_drawers}", round(recall_5, 3))
+        record_metric("single_room_recall", f"recall_at_10_at_{n_drawers}", round(recall_10, 3))
+
+    @pytest.mark.parametrize("n_drawers", SIZES)
+    def test_single_room_no_filter_recall(self, n_drawers, tmp_path):
+        """Same test but WITHOUT wing/room filter — pure unfiltered search."""
+        palace_path = str(tmp_path / "palace")
+        _populate_single_room(palace_path, n_drawers, n_needles=10)
+
+        recall_5, recall_10 = self._measure_recall(palace_path)
+
+        record_metric("single_room_unfiltered", f"recall_at_5_at_{n_drawers}", round(recall_5, 3))
+        record_metric("single_room_unfiltered", f"recall_at_10_at_{n_drawers}", round(recall_10, 3))