2026-04-07 19:39:06 -03:00
|
|
|
"""
|
|
|
|
|
Palace boost validation — does wing/room filtering actually help?
|
|
|
|
|
|
|
|
|
|
Quantifies the retrieval improvement from the palace spatial metaphor.
|
|
|
|
|
Uses planted needles to measure recall with and without filtering
|
|
|
|
|
at different scales.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
from tests.benchmarks.data_generator import PalaceDataGenerator
|
|
|
|
|
from tests.benchmarks.report import record_metric
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.benchmark
class TestFilteredVsUnfilteredRecall:
    """Benchmark how much wing/room filtering changes needle recall.

    Each parametrized run populates a fresh palace with planted needles,
    replays the needle queries unfiltered, wing-filtered and
    wing+room-filtered, and records the resulting recall figures.
    """

    SIZES = [1_000, 2_500, 5_000]

    @pytest.mark.parametrize("n_drawers", SIZES)
    def test_palace_boost_recall(self, n_drawers, tmp_path, bench_scale):
        """Record recall@5 for unfiltered, wing and wing+room searches.

        At most ten needles are queried. A query counts as a hit when any of
        the first five result texts contains ``"NEEDLE_"``. Nothing is
        asserted; the three recall values and the two boosts (filtered recall
        minus unfiltered recall) are reported via ``record_metric`` under the
        ``"palace_boost"`` category.

        Args:
            n_drawers: Number of drawers to populate the palace with.
            tmp_path: Directory under which the palace is created.
            bench_scale: Scale value forwarded to ``PalaceDataGenerator``
                (presumably a project fixture; confirm in conftest).
        """
        generator = PalaceDataGenerator(seed=42, scale=bench_scale)
        palace_path = str(tmp_path / "palace")
        _, _, needle_info = generator.populate_palace_directly(
            palace_path, n_drawers=n_drawers, include_needles=True
        )

        from mempalace.searcher import search_memories

        n_queries = min(10, len(needle_info))

        # (label, needle keys forwarded as search filters), in query order.
        filter_modes = (
            ("none", ()),
            ("wing", ("wing",)),
            ("room", ("wing", "room")),
        )
        hit_counts = {label: 0 for label, _ in filter_modes}

        for needle in needle_info[:n_queries]:
            query = needle["query"]
            for label, filter_keys in filter_modes:
                filters = {key: needle[key] for key in filter_keys}
                response = search_memories(
                    query, palace_path=palace_path, n_results=5, **filters
                )
                top_texts = [hit["text"] for hit in response.get("results", [])][:5]
                # NOTE(review): this matches any planted needle, not
                # necessarily the one this query targets.
                if any("NEEDLE_" in text for text in top_texts):
                    hit_counts[label] += 1

        # Guard the division for the case where no needles were planted.
        denominator = max(n_queries, 1)
        recall_none = hit_counts["none"] / denominator
        recall_wing = hit_counts["wing"] / denominator
        recall_room = hit_counts["room"] / denominator

        boost_wing = recall_wing - recall_none
        boost_room = recall_room - recall_none

        record_metric("palace_boost", f"recall_unfiltered_at_{n_drawers}", round(recall_none, 3))
        record_metric("palace_boost", f"recall_wing_filtered_at_{n_drawers}", round(recall_wing, 3))
        record_metric("palace_boost", f"recall_room_filtered_at_{n_drawers}", round(recall_room, 3))
        record_metric("palace_boost", f"wing_boost_at_{n_drawers}", round(boost_wing, 3))
        record_metric("palace_boost", f"room_boost_at_{n_drawers}", round(boost_room, 3))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.benchmark
class TestFilterLatencyBenefit:
    """Does filtering reduce query latency by narrowing the search space?"""

    def test_filter_speedup(self, tmp_path, bench_scale):
        """Compare mean search latency: no filter vs wing vs wing+room.

        Populates a 5,000-drawer palace without needles, then times ten
        searches per configuration with ``time.perf_counter``. Each
        configuration first issues one untimed warm-up query so that one-time
        start-up work is not charged to whichever configuration happens to be
        measured first. Nothing is asserted; the mean latencies (ms) and the
        relative speedups are reported via ``record_metric`` under the
        ``"filter_latency"`` category.

        Args:
            tmp_path: Directory under which the palace is created.
            bench_scale: Scale value forwarded to ``PalaceDataGenerator``
                (presumably a project fixture; confirm in conftest).
        """
        gen = PalaceDataGenerator(seed=42, scale=bench_scale)
        palace_path = str(tmp_path / "palace")
        gen.populate_palace_directly(palace_path, n_drawers=5_000, include_needles=False)

        from mempalace.searcher import search_memories

        wing = gen.wings[0]
        room = gen.rooms_by_wing[wing][0]
        query = "authentication middleware optimization"
        n_runs = 10

        def mean_latency_ms(**filters):
            """Return the mean latency in ms of ``n_runs`` timed searches."""
            # Untimed warm-up so the timed samples reflect steady-state
            # query cost rather than first-call initialisation.
            search_memories(query, palace_path=palace_path, n_results=5, **filters)

            samples = []
            for _ in range(n_runs):
                start = time.perf_counter()
                search_memories(query, palace_path=palace_path, n_results=5, **filters)
                samples.append((time.perf_counter() - start) * 1000)
            return sum(samples) / len(samples)

        avg_none = mean_latency_ms()
        avg_wing = mean_latency_ms(wing=wing)
        avg_room = mean_latency_ms(wing=wing, room=room)

        record_metric("filter_latency", "avg_unfiltered_ms", round(avg_none, 1))
        record_metric("filter_latency", "avg_wing_filtered_ms", round(avg_wing, 1))
        record_metric("filter_latency", "avg_room_filtered_ms", round(avg_room, 1))

        # Speedup is relative to the unfiltered mean; skip it if that mean is
        # zero to avoid dividing by zero.
        if avg_none > 0:
            record_metric(
                "filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1)
            )
            record_metric(
                "filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1)
            )
|
2026-04-07 19:39:06 -03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.benchmark
class TestBoostAtIncreasingScale:
    """Track whether the wing-filter recall boost grows with palace size."""

    def test_boost_scaling(self, tmp_path, bench_scale):
        """Record the wing-filter recall boost at 500, 1,000 and 2,500 drawers.

        For each size a separate palace is populated with planted needles and
        up to eight needle queries are run both unfiltered and wing-filtered.
        A query is a hit when any of the first five result texts contains
        ``"NEEDLE_"``. The boost is filtered recall minus unfiltered recall.
        Nothing is asserted; the per-size boosts and a flag saying whether the
        largest size's boost is at least the smallest size's are reported via
        ``record_metric`` under the ``"boost_scaling"`` category.

        Args:
            tmp_path: Directory under which the palaces are created.
            bench_scale: Scale value forwarded to ``PalaceDataGenerator``
                (presumably a project fixture; confirm in conftest).
        """

        def needle_in_top5(response):
            """Return True if any of the first five result texts has a needle."""
            return any("NEEDLE_" in hit["text"] for hit in response.get("results", [])[:5])

        sizes = [500, 1_000, 2_500]
        boosts = []

        for size in sizes:
            generator = PalaceDataGenerator(seed=42, scale=bench_scale)
            palace_path = str(tmp_path / f"palace_{size}")
            _, _, needle_info = generator.populate_palace_directly(
                palace_path, n_drawers=size, include_needles=True
            )

            from mempalace.searcher import search_memories

            n_queries = min(8, len(needle_info))
            plain_hits = 0
            wing_hits = 0

            for needle in needle_info[:n_queries]:
                plain_response = search_memories(
                    needle["query"], palace_path=palace_path, n_results=5
                )
                if needle_in_top5(plain_response):
                    plain_hits += 1

                wing_response = search_memories(
                    needle["query"], palace_path=palace_path, wing=needle["wing"], n_results=5
                )
                if needle_in_top5(wing_response):
                    wing_hits += 1

            # Guard the division for the case where no needles were planted.
            denominator = max(n_queries, 1)
            recall_none = plain_hits / denominator
            recall_filtered = wing_hits / denominator
            boosts.append({"size": size, "boost": recall_filtered - recall_none})

        record_metric("boost_scaling", "boosts_by_size", boosts)

        # A trend needs at least two data points.
        if len(boosts) < 2:
            return

        # Hypothesis under test: the boost at the largest size is no smaller
        # than at the smallest size.
        trend_positive = boosts[-1]["boost"] >= boosts[0]["boost"]
        record_metric("boost_scaling", "trend_positive", trend_positive)
|