bench: add scale benchmark suite (94 tests)
Benchmark mempalace at configurable scale (1K–100K drawers) to find real-world performance limits. Tests cover MCP tool OOM thresholds, ChromaDB query degradation, search recall@k, mining throughput, knowledge graph concurrency, memory leak detection, palace boost quantification, and Layer1 unbounded fetch behavior.

- tests/benchmarks/ with 8 test modules + data generator + report system
- Deterministic data factory with planted needles for recall measurement
- JSON report output with regression detection (--bench-report flag)
- CI benchmark job on PRs at small scale
- psutil added as dev dependency for RSS tracking
This commit is contained in:
@@ -0,0 +1,225 @@
|
||||
"""
|
||||
Search performance benchmarks.
|
||||
|
||||
Measures query latency, recall@k, and concurrent search behavior
|
||||
as palace size grows. Uses planted needles for recall measurement.
|
||||
"""
|
||||
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.benchmarks.data_generator import PalaceDataGenerator
|
||||
from tests.benchmarks.report import record_metric
|
||||
|
||||
|
||||
@pytest.mark.benchmark
class TestSearchLatencyVsSize:
    """Benchmark how single-query search latency changes with palace size."""

    SIZES = [500, 1_000, 2_500, 5_000]

    @pytest.mark.parametrize("n_drawers", SIZES)
    def test_search_latency_curve(self, n_drawers, tmp_path, bench_scale):
        """Time a fixed set of queries against a palace of ``n_drawers`` drawers.

        Builds a needle-free palace under ``tmp_path``, runs each query once
        with ``n_results=5`` and records the mean, p50 and p95 wall-clock
        latency (in milliseconds) under the ``search`` metric category.
        """
        generator = PalaceDataGenerator(seed=42, scale=bench_scale)
        palace_path = str(tmp_path / "palace")
        generator.populate_palace_directly(
            palace_path, n_drawers=n_drawers, include_needles=False
        )

        # Deferred import: the searcher is only loaded when the test runs.
        from mempalace.searcher import search_memories

        sample_queries = (
            "authentication middleware",
            "database optimization",
            "error handling patterns",
            "deployment configuration",
            "testing strategy",
        )

        timings_ms = []
        for text in sample_queries:
            t0 = time.perf_counter()
            outcome = search_memories(text, palace_path=palace_path, n_results=5)
            t1 = time.perf_counter()
            timings_ms.append((t1 - t0) * 1000)
            # Checked after the clock stops so the assertion is not timed.
            assert "error" not in outcome

        ordered = sorted(timings_ms)
        count = len(ordered)
        summary = {
            "avg_latency_ms": sum(timings_ms) / len(timings_ms),
            "p50_ms": ordered[count // 2],
            "p95_ms": ordered[int(count * 0.95)],
        }
        for label, value in summary.items():
            record_metric("search", f"{label}_at_{n_drawers}", round(value, 1))
|
||||
|
||||
|
||||
@pytest.mark.benchmark
class TestSearchRecallAtScale:
    """Benchmark recall of planted needles as the palace grows."""

    SIZES = [500, 1_000, 2_500, 5_000]

    @pytest.mark.parametrize("n_drawers", SIZES)
    def test_recall_at_k(self, n_drawers, tmp_path, bench_scale):
        """Record recall@5 and recall@10 for up to ten planted-needle queries.

        A query counts as a hit at k when any of its first k result texts
        contains the marker ``"NEEDLE_"``. Searches that return an error are
        skipped but stay in the denominator, so they lower the reported recall.
        """
        generator = PalaceDataGenerator(seed=42, scale=bench_scale)
        palace_path = str(tmp_path / "palace")
        _, _, needles = generator.populate_palace_directly(
            palace_path, n_drawers=n_drawers, include_needles=True
        )

        # Deferred import: the searcher is only loaded when the test runs.
        from mempalace.searcher import search_memories

        def contains_needle(snippets):
            """Return True when any snippet carries the needle marker."""
            # NOTE(review): this matches any needle, not specifically the one
            # whose query was issued -- confirm that is the intended measure.
            return any("NEEDLE_" in snippet for snippet in snippets)

        query_count = min(10, len(needles))
        top5_hits = 0
        top10_hits = 0

        for needle in needles[:query_count]:
            response = search_memories(
                needle["query"], palace_path=palace_path, n_results=10
            )
            if "error" in response:
                continue

            snippets = [hit["text"] for hit in response.get("results", [])]
            if contains_needle(snippets[:5]):
                top5_hits += 1
            if contains_needle(snippets[:10]):
                top10_hits += 1

        # Guard against an empty needle list; recall is then reported as 0.
        denominator = max(query_count, 1)
        for k, hits in ((5, top5_hits), (10, top10_hits)):
            record_metric(
                "search_recall",
                f"recall_at_{k}_at_{n_drawers}",
                round(hits / denominator, 3),
            )
|
||||
|
||||
|
||||
@pytest.mark.benchmark
class TestSearchFilteredVsUnfiltered:
    """Benchmark searches with a wing filter against unfiltered searches."""

    def test_filter_impact(self, tmp_path, bench_scale):
        """Record latency and recall@5 with and without a ``wing`` filter.

        For up to ten planted needles, the needle's query is run once
        unfiltered and once restricted to the needle's wing, both with
        ``n_results=5`` against a 2,000-drawer palace. Average latencies, the
        relative latency improvement and both recall values are recorded under
        the ``search_filter`` metric category.
        """
        generator = PalaceDataGenerator(seed=42, scale=bench_scale)
        palace_path = str(tmp_path / "palace")
        _, _, needles = generator.populate_palace_directly(
            palace_path, n_drawers=2_000, include_needles=True
        )

        # Deferred import: the searcher is only loaded when the test runs.
        from mempalace.searcher import search_memories

        def timed_search(query, **filters):
            """Run one top-5 search; return (latency in ms, needle found)."""
            call_kwargs = dict(palace_path=palace_path, n_results=5, **filters)
            t0 = time.perf_counter()
            response = search_memories(query, **call_kwargs)
            elapsed_ms = (time.perf_counter() - t0) * 1000
            found = any(
                "NEEDLE_" in hit["text"] for hit in response.get("results", [])[:5]
            )
            return elapsed_ms, found

        n_queries = min(10, len(needles))
        unfiltered_ms = []
        filtered_ms = []
        unfiltered_hits = 0
        filtered_hits = 0

        # NOTE(review): the unfiltered query always runs first for each needle;
        # if anything is cached between calls this may favour the filtered
        # timing -- confirm before relying on latency_improvement_pct.
        for needle in needles[:n_queries]:
            elapsed, found = timed_search(needle["query"])
            unfiltered_ms.append(elapsed)
            if found:
                unfiltered_hits += 1

            elapsed, found = timed_search(needle["query"], wing=needle["wing"])
            filtered_ms.append(elapsed)
            if found:
                filtered_hits += 1

        avg_unfiltered = sum(unfiltered_ms) / max(len(unfiltered_ms), 1)
        avg_filtered = sum(filtered_ms) / max(len(filtered_ms), 1)
        # The 0.01 floor avoids dividing by zero when no queries were run.
        improvement_pct = (
            (avg_unfiltered - avg_filtered) / max(avg_unfiltered, 0.01)
        ) * 100
        recall_denominator = max(n_queries, 1)

        record_metric("search_filter", "avg_unfiltered_ms", round(avg_unfiltered, 1))
        record_metric("search_filter", "avg_filtered_ms", round(avg_filtered, 1))
        record_metric("search_filter", "latency_improvement_pct", round(improvement_pct, 1))
        record_metric(
            "search_filter",
            "unfiltered_recall_at_5",
            round(unfiltered_hits / recall_denominator, 3),
        )
        record_metric(
            "search_filter",
            "filtered_recall_at_5",
            round(filtered_hits / recall_denominator, 3),
        )
|
||||
|
||||
|
||||
@pytest.mark.benchmark
class TestConcurrentSearch:
    """Benchmark search latency when several queries run at the same time."""

    def test_concurrent_queries(self, tmp_path):
        """Run 30 queries on a 4-thread pool and record latency percentiles.

        Each query is timed inside its worker thread. The p50, p95, p99 and
        mean latency (milliseconds), the number of searches that returned an
        error, the query count and the worker count are recorded under the
        ``concurrent_search`` metric category.
        """
        generator = PalaceDataGenerator(seed=42, scale="small")
        palace_path = str(tmp_path / "palace")
        generator.populate_palace_directly(
            palace_path, n_drawers=2_000, include_needles=False
        )

        # Deferred import: the searcher is only loaded when the test runs.
        from mempalace.searcher import search_memories

        topics = [
            "authentication",
            "database",
            "deployment",
            "error handling",
            "testing",
            "monitoring",
            "caching",
            "middleware",
            "serialization",
            "validation",
        ]
        queries = topics * 3  # 30 total queries
        worker_count = 4

        def run_search(query):
            """Time one search; return (latency in ms, True if no error)."""
            t0 = time.perf_counter()
            response = search_memories(query, palace_path=palace_path, n_results=5)
            elapsed_ms = (time.perf_counter() - t0) * 1000
            return elapsed_ms, "error" not in response

        latencies = []
        errors = 0
        with ThreadPoolExecutor(max_workers=worker_count) as pool:
            pending = [pool.submit(run_search, query) for query in queries]
            for finished in as_completed(pending):
                elapsed_ms, succeeded = finished.result()
                latencies.append(elapsed_ms)
                if not succeeded:
                    errors += 1

        ordered = sorted(latencies)
        total = len(ordered)

        for label, fraction in (("p50_ms", None), ("p95_ms", 0.95), ("p99_ms", 0.99)):
            index = total // 2 if fraction is None else int(total * fraction)
            record_metric("concurrent_search", label, round(ordered[index], 1))
        record_metric("concurrent_search", "avg_ms", round(sum(ordered) / total, 1))
        record_metric("concurrent_search", "error_count", errors)
        record_metric("concurrent_search", "total_queries", len(queries))
        record_metric("concurrent_search", "workers", worker_count)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
class TestSearchNResultsScaling:
    """How does n_results affect query latency?"""

    @pytest.mark.parametrize("n_results", [1, 5, 10, 25, 50])
    def test_n_results_latency(self, n_results, tmp_path):
        """Record the mean latency of one query for a given ``n_results``.

        Builds a 2,000-drawer, needle-free palace under ``tmp_path``, runs the
        same query five times and records the average wall-clock latency
        (milliseconds) as ``avg_ms_at_n_<n_results>`` under the
        ``search_n_results`` metric category.
        """
        gen = PalaceDataGenerator(seed=42, scale="small")
        palace_path = str(tmp_path / "palace")
        gen.populate_palace_directly(palace_path, n_drawers=2_000, include_needles=False)

        # Deferred import: the searcher is only loaded when the test runs.
        from mempalace.searcher import search_memories

        latencies = []
        for _ in range(5):
            start = time.perf_counter()
            result = search_memories(
                "authentication middleware", palace_path=palace_path, n_results=n_results
            )
            latencies.append((time.perf_counter() - start) * 1000)
            # A failed search must not be reported as a valid latency sample.
            # Checked after the clock stops so the assertion is not timed.
            assert "error" not in result, (
                f"search_memories returned an error for n_results={n_results}"
            )

        avg_ms = sum(latencies) / len(latencies)
        record_metric("search_n_results", f"avg_ms_at_n_{n_results}", round(avg_ms, 1))
|
||||
Reference in New Issue
Block a user