bench: add per-room recall threshold test

Concentrates all drawers into a single wing+room to isolate the
embedding model's retrieval limit independent of palace filtering.
Confirms recall degrades to ~0.4-0.5 at 5K drawers per room even
with wing+room filters applied — the spatial structure helps by
keeping buckets small, but can't fix the underlying embedding ceiling.
This commit is contained in:
Igor Lins e Silva
2026-04-08 05:01:51 -03:00
parent 7b89291334
commit e8017ca2ec
2 changed files with 187 additions and 6 deletions
+8 -6
View File
@@ -1,6 +1,6 @@
# MemPalace Scale Benchmark Suite
94 tests that benchmark mempalace at scale to validate real-world performance limits.
106 tests that benchmark mempalace at scale to validate real-world performance limits.
## Why
@@ -11,6 +11,7 @@ MemPalace has strong academic scores (96.6% R@5 on LongMemEval) but no empirical
- Modified files are never re-ingested — what's the skip-check cost at scale?
- How does query latency degrade as the palace grows from 1K to 100K drawers?
- Does wing/room filtering actually improve retrieval, and by how much?
- At what per-room drawer count does recall break regardless of filtering?
This suite finds those answers.
@@ -20,7 +21,7 @@ This suite finds those answers.
# Fast smoke test (~2 min)
uv run pytest tests/benchmarks/ -v --bench-scale=small -m "benchmark and not slow"
# Full small scale (~30 min)
# Full small scale (~35 min)
uv run pytest tests/benchmarks/ -v --bench-scale=small
# Medium scale with JSON report
@@ -61,6 +62,7 @@ uv run pytest tests/benchmarks/ -v --bench-scale=stress -m stress
| File | What it tests |
|------|--------------|
| `test_palace_boost.py` | Retrieval improvement from wing/room filtering at different scales |
| `test_recall_threshold.py` | Per-room recall ceiling — isolates embedding model limit with all drawers in one bucket |
| `test_knowledge_graph_bench.py` | Triple insertion rate, temporal query accuracy, SQLite concurrent access |
| `test_layers_bench.py` | MemoryStack wake-up cost, Layer1 unbounded fetch, token budget compliance |
@@ -68,10 +70,10 @@ uv run pytest tests/benchmarks/ -v --bench-scale=stress -m stress
```
tests/benchmarks/
conftest.py # --bench-scale / --bench-report CLI options, fixtures, markers
data_generator.py # Deterministic data factory (seeded RNG, planted needles)
report.py # JSON report writer + regression checker
test_*.py # 8 test modules (94 tests total)
conftest.py # --bench-scale / --bench-report CLI options, fixtures, markers
data_generator.py # Deterministic data factory (seeded RNG, planted needles)
report.py # JSON report writer + regression checker
test_*.py # 9 test modules (106 tests total)
```
### Data Generator
+179
View File
@@ -0,0 +1,179 @@
"""
Recall threshold test — find the per-bucket size where retrieval breaks.
The palace_boost tests showed room-filtered recall of 1.0, but only because
each room had ~333 drawers. This test concentrates ALL drawers into a single
wing+room to find the actual embedding model limit.
"""
import hashlib
import os
import time
from datetime import datetime
import chromadb
import pytest
from tests.benchmarks.data_generator import PalaceDataGenerator
from tests.benchmarks.report import record_metric
# Distinctive technical sentences used as the payload of the planted "needle"
# drawers. Position matters: entry i is embedded in the drawer whose text is
# prefixed with NEEDLE_{i:04d}, and is searched for with NEEDLE_QUERIES[i],
# so this list and NEEDLE_QUERIES must stay the same length and order.
NEEDLE_TOPICS = [
    "Fibonacci sequence optimization uses memoization with O(n) space complexity",
    "PostgreSQL vacuum autovacuum threshold set to 50 percent for table users",
    "Redis cluster failover timeout configured at 30 seconds with sentinel monitoring",
    "Kubernetes horizontal pod autoscaler targets 70 percent CPU utilization",
    "GraphQL subscription uses WebSocket transport with heartbeat interval 25 seconds",
    "JWT token rotation policy requires refresh every 15 minutes with sliding window",
    "Elasticsearch index sharding strategy uses 5 primary shards with 1 replica each",
    "Docker multi-stage build reduces image size from 1.2GB to 180MB for production",
    "Apache Kafka consumer group rebalance timeout set to 45 seconds",
    "MongoDB change streams resume token persisted every 100 operations",
]
# Search strings issued by the recall tests, one per needle. Each is a keyword
# subset of the NEEDLE_TOPICS entry at the same index; the tests count a hit
# for query i only when a result's text contains the marker NEEDLE_{i:04d}.
NEEDLE_QUERIES = [
    "Fibonacci sequence optimization memoization",
    "PostgreSQL vacuum autovacuum threshold",
    "Redis cluster failover timeout sentinel",
    "Kubernetes horizontal pod autoscaler CPU",
    "GraphQL subscription WebSocket heartbeat",
    "JWT token rotation policy refresh",
    "Elasticsearch index sharding primary replica",
    "Docker multi-stage build image size production",
    "Apache Kafka consumer group rebalance",
    "MongoDB change streams resume token",
]
def _populate_single_room(palace_path, n_drawers, n_needles=10):
    """Pack every drawer into one wing+room and plant searchable needles.

    Needles are written first: needle ``i`` has the text
    ``NEEDLE_{i:04d}: <NEEDLE_TOPICS[i]>. Unique planted needle ...`` so a
    test can recognise it in search results by its ``NEEDLE_`` marker. The
    rest of the room is padded with generated noise text until the total
    reaches ``n_drawers``. All drawers carry ``wing="concentrated"`` and
    ``room="single_room"``.

    Args:
        palace_path: Directory of the persistent Chroma store; created if it
            does not exist.
        n_drawers: Target total drawer count (needles + noise). If it is
            smaller than ``n_needles`` no noise is added and only the
            needles are written.
        n_needles: How many needles to plant; at most ``len(NEEDLE_TOPICS)``.

    Returns:
        A ``(client, collection)`` tuple: the ``chromadb.PersistentClient``
        and the ``mempalace_drawers`` collection that was populated.

    Raises:
        ValueError: If ``n_needles`` exceeds the number of available topics.
    """
    # Fail before touching the filesystem instead of hitting an opaque
    # IndexError half-way through planting.
    if n_needles > len(NEEDLE_TOPICS):
        raise ValueError(
            f"n_needles={n_needles} exceeds the {len(NEEDLE_TOPICS)} available needle topics"
        )

    # Fixed seed so the noise corpus is the same from run to run
    # (presumably — relies on PalaceDataGenerator honouring the seed).
    gen = PalaceDataGenerator(seed=42, scale="small")
    os.makedirs(palace_path, exist_ok=True)
    client = chromadb.PersistentClient(path=palace_path)
    col = client.get_or_create_collection("mempalace_drawers")

    batch_size = 500
    docs, ids, metas = [], [], []

    def _drawer_id(key):
        # Stable id derived from the logical key, so reruns produce the same ids.
        return f"drawer_single_room_{hashlib.md5(key.encode()).hexdigest()[:16]}"

    def _meta(source_file, chunk_index):
        # Every drawer shares the same wing/room: that is the point of the test.
        return {
            "wing": "concentrated",
            "room": "single_room",
            "source_file": source_file,
            "chunk_index": chunk_index,
            "added_by": "threshold_bench",
            "filed_at": datetime.now().isoformat(),
        }

    # Plant needles
    for i in range(n_needles):
        needle_id = f"NEEDLE_{i:04d}"
        docs.append(
            f"{needle_id}: {NEEDLE_TOPICS[i]}. Unique planted needle for threshold test."
        )
        ids.append(_drawer_id(needle_id))
        metas.append(_meta(f"needle_{i}.txt", 0))

    # Fill with noise — all in the SAME room. The needles queued above are
    # written together with the first noise batch (or by the final flush).
    remaining = n_drawers - len(docs)
    for i in range(remaining):
        # NOTE(review): (400, 800) are presumably length bounds for the
        # generated text — confirm against PalaceDataGenerator._random_text.
        docs.append(gen._random_text(400, 800))
        ids.append(_drawer_id(f"noise_{i}"))
        metas.append(_meta(f"noise_{i:06d}.txt", i % 10))

        # Write in chunks so the pending lists never grow past batch_size.
        if len(docs) >= batch_size:
            col.add(documents=docs, ids=ids, metadatas=metas)
            docs, ids, metas = [], [], []

    # Flush whatever is left over from the last partial batch.
    if docs:
        col.add(documents=docs, ids=ids, metadatas=metas)

    return client, col
@pytest.mark.benchmark
class TestRecallThresholdSingleRoom:
    """
    Recall ceiling with every drawer in a single wing+room.

    Because there is only one bucket, wing/room filtering cannot shrink the
    candidate set, so the numbers recorded here reflect the retrieval limit
    of the embedding search itself. The tests record metrics; they do not
    assert a minimum recall.
    """

    SIZES = [250, 500, 1_000, 2_000, 3_000, 5_000]

    @staticmethod
    def _needle_recall(palace_path, **search_filters):
        """Query every needle and return ``(recall_at_5, recall_at_10)``.

        ``search_filters`` is forwarded verbatim to ``search_memories``
        (e.g. ``wing=...``, ``room=...``). A query whose result contains an
        ``"error"`` key is skipped and therefore counts as a miss, since the
        denominator is always the full number of queries.
        """
        # Imported lazily, after the palace has been populated.
        from mempalace.searcher import search_memories

        top5_hits = 0
        top10_hits = 0
        for idx, query in enumerate(NEEDLE_QUERIES):
            outcome = search_memories(
                query,
                palace_path=palace_path,
                n_results=10,
                **search_filters,
            )
            if "error" in outcome:
                continue

            marker = f"NEEDLE_{idx:04d}"
            ranked_texts = [hit["text"] for hit in outcome.get("results", [])]
            # bool adds as 0/1, so each query contributes at most one hit.
            top5_hits += any(marker in text for text in ranked_texts[:5])
            top10_hits += any(marker in text for text in ranked_texts[:10])

        total = len(NEEDLE_QUERIES)
        return top5_hits / total, top10_hits / total

    @pytest.mark.parametrize("n_drawers", SIZES)
    def test_single_room_recall(self, n_drawers, tmp_path):
        """Recall@5 and @10 with the wing+room filter applied."""
        palace_path = str(tmp_path / "palace")
        _populate_single_room(palace_path, n_drawers, n_needles=10)

        at_5, at_10 = self._needle_recall(
            palace_path,
            wing="concentrated",
            room="single_room",
        )

        record_metric("single_room_recall", f"recall_at_5_at_{n_drawers}", round(at_5, 3))
        record_metric("single_room_recall", f"recall_at_10_at_{n_drawers}", round(at_10, 3))

    @pytest.mark.parametrize("n_drawers", SIZES)
    def test_single_room_no_filter_recall(self, n_drawers, tmp_path):
        """Recall@5 and @10 on the same data with NO wing/room filter."""
        palace_path = str(tmp_path / "palace")
        _populate_single_room(palace_path, n_drawers, n_needles=10)

        at_5, at_10 = self._needle_recall(palace_path)

        record_metric("single_room_unfiltered", f"recall_at_5_at_{n_drawers}", round(at_5, 3))
        record_metric("single_room_unfiltered", f"recall_at_10_at_{n_drawers}", round(at_10, 3))