fix: resolve formatting, regression logic, and pytest defaults

- Run ruff format on all benchmark files (fixes CI lint job)
- Fix check_regression() substring ambiguity: match direction keywords in a
  fixed order (higher-is-better keywords first, first match wins) so that
  "latency_improvement_pct" is correctly classified as higher-is-better
- Update stale comments in conftest.py that referenced the wrong fixture
- Add pytest addopts to skip benchmark/slow/stress markers by default
This commit is contained in:
Igor Lins e Silva
2026-04-08 10:56:39 -03:00
parent 7e4db33061
commit ebc26f3960
12 changed files with 383 additions and 138 deletions
+1
View File
@@ -65,6 +65,7 @@ quote-style = "double"
[tool.pytest.ini_options] [tool.pytest.ini_options]
testpaths = ["tests"] testpaths = ["tests"]
pythonpath = ["."] pythonpath = ["."]
addopts = "-m 'not benchmark and not slow and not stress'"
markers = [ markers = [
"benchmark: scale/performance benchmark tests", "benchmark: scale/performance benchmark tests",
"slow: tests that take more than 30 seconds", "slow: tests that take more than 30 seconds",
+2 -3
View File
@@ -96,8 +96,7 @@ def pytest_terminal_summary(terminalreporter, config):
if not report_path: if not report_path:
return return
# Collect results from the session fixture if available # Collect results written by individual tests via record_metric()
# The results are written by individual tests via bench_results fixture
import platform import platform
import subprocess import subprocess
@@ -129,7 +128,7 @@ def pytest_terminal_summary(terminalreporter, config):
"results": {}, "results": {},
} }
# Read results from a temp file written by the bench_results fixture # Read results from the temp file written by record_metric()
results_file = os.path.join(tempfile.gettempdir(), "mempalace_bench_results.json") results_file = os.path.join(tempfile.gettempdir(), "mempalace_bench_results.json")
if os.path.exists(results_file): if os.path.exists(results_file):
try: try:
+248 -74
View File
@@ -20,42 +20,150 @@ import yaml
# ── Scale configurations ───────────────────────────────────────────────── # ── Scale configurations ─────────────────────────────────────────────────
SCALE_CONFIGS = { SCALE_CONFIGS = {
"small": {"drawers": 1_000, "wings": 3, "rooms_per_wing": 5, "kg_entities": 50, "kg_triples": 200, "needles": 20, "search_queries": 20}, "small": {
"medium": {"drawers": 10_000, "wings": 8, "rooms_per_wing": 12, "kg_entities": 200, "kg_triples": 2_000, "needles": 50, "search_queries": 50}, "drawers": 1_000,
"large": {"drawers": 50_000, "wings": 15, "rooms_per_wing": 20, "kg_entities": 500, "kg_triples": 10_000, "needles": 100, "search_queries": 100}, "wings": 3,
"stress": {"drawers": 100_000, "wings": 25, "rooms_per_wing": 30, "kg_entities": 1_000, "kg_triples": 50_000, "needles": 200, "search_queries": 200}, "rooms_per_wing": 5,
"kg_entities": 50,
"kg_triples": 200,
"needles": 20,
"search_queries": 20,
},
"medium": {
"drawers": 10_000,
"wings": 8,
"rooms_per_wing": 12,
"kg_entities": 200,
"kg_triples": 2_000,
"needles": 50,
"search_queries": 50,
},
"large": {
"drawers": 50_000,
"wings": 15,
"rooms_per_wing": 20,
"kg_entities": 500,
"kg_triples": 10_000,
"needles": 100,
"search_queries": 100,
},
"stress": {
"drawers": 100_000,
"wings": 25,
"rooms_per_wing": 30,
"kg_entities": 1_000,
"kg_triples": 50_000,
"needles": 200,
"search_queries": 200,
},
} }
# ── Vocabulary banks for realistic content ─────────────────────────────── # ── Vocabulary banks for realistic content ───────────────────────────────
WING_NAMES = [ WING_NAMES = [
"webapp", "backend_api", "mobile_app", "data_pipeline", "ml_platform", "webapp",
"devops", "auth_service", "payments", "analytics", "docs_site", "backend_api",
"cli_tool", "dashboard", "notification_service", "search_engine", "mobile_app",
"user_mgmt", "inventory", "reporting", "testing_infra", "monitoring", "data_pipeline",
"email_service", "chat_bot", "file_storage", "scheduler", "gateway", "ml_platform",
"devops",
"auth_service",
"payments",
"analytics",
"docs_site",
"cli_tool",
"dashboard",
"notification_service",
"search_engine",
"user_mgmt",
"inventory",
"reporting",
"testing_infra",
"monitoring",
"email_service",
"chat_bot",
"file_storage",
"scheduler",
"gateway",
"marketplace", "marketplace",
] ]
ROOM_NAMES = [ ROOM_NAMES = [
"backend", "frontend", "api", "database", "auth", "tests", "docs", "backend",
"config", "deployment", "models", "views", "controllers", "middleware", "frontend",
"utils", "schemas", "migrations", "fixtures", "scripts", "styles", "api",
"components", "hooks", "services", "routes", "templates", "static", "database",
"media", "logging", "cache", "queue", "workers", "auth",
"tests",
"docs",
"config",
"deployment",
"models",
"views",
"controllers",
"middleware",
"utils",
"schemas",
"migrations",
"fixtures",
"scripts",
"styles",
"components",
"hooks",
"services",
"routes",
"templates",
"static",
"media",
"logging",
"cache",
"queue",
"workers",
] ]
TECH_TERMS = [ TECH_TERMS = [
"authentication", "authorization", "middleware", "endpoint", "REST API", "authentication",
"GraphQL", "WebSocket", "database migration", "ORM", "query optimization", "authorization",
"caching strategy", "load balancer", "rate limiting", "pagination", "middleware",
"serialization", "validation", "error handling", "logging framework", "endpoint",
"monitoring", "deployment pipeline", "CI/CD", "containerization", "REST API",
"microservice", "event sourcing", "message queue", "pub/sub", "GraphQL",
"connection pooling", "session management", "token refresh", "CORS", "WebSocket",
"SSL termination", "health check", "circuit breaker", "retry logic", "database migration",
"batch processing", "stream processing", "data pipeline", "ETL", "ORM",
"feature flag", "A/B testing", "blue-green deployment", "canary release", "query optimization",
"caching strategy",
"load balancer",
"rate limiting",
"pagination",
"serialization",
"validation",
"error handling",
"logging framework",
"monitoring",
"deployment pipeline",
"CI/CD",
"containerization",
"microservice",
"event sourcing",
"message queue",
"pub/sub",
"connection pooling",
"session management",
"token refresh",
"CORS",
"SSL termination",
"health check",
"circuit breaker",
"retry logic",
"batch processing",
"stream processing",
"data pipeline",
"ETL",
"feature flag",
"A/B testing",
"blue-green deployment",
"canary release",
] ]
CODE_SNIPPETS = [ CODE_SNIPPETS = [
@@ -75,17 +183,51 @@ PROSE_TEMPLATES = [
] ]
ENTITY_NAMES = [ ENTITY_NAMES = [
"Alice", "Bob", "Carol", "Dave", "Eve", "Frank", "Grace", "Heidi", "Alice",
"Ivan", "Judy", "Karl", "Linda", "Mike", "Nina", "Oscar", "Pat", "Bob",
"Quinn", "Rita", "Steve", "Tina", "Ursula", "Victor", "Wendy", "Xander", "Carol",
"Dave",
"Eve",
"Frank",
"Grace",
"Heidi",
"Ivan",
"Judy",
"Karl",
"Linda",
"Mike",
"Nina",
"Oscar",
"Pat",
"Quinn",
"Rita",
"Steve",
"Tina",
"Ursula",
"Victor",
"Wendy",
"Xander",
] ]
ENTITY_TYPES = ["person", "project", "tool", "concept", "team", "service"] ENTITY_TYPES = ["person", "project", "tool", "concept", "team", "service"]
PREDICATES = [ PREDICATES = [
"works_on", "manages", "reports_to", "collaborates_with", "created", "works_on",
"maintains", "uses", "depends_on", "replaced", "reviewed", "deployed", "manages",
"tested", "documented", "mentors", "leads", "contributes_to", "reports_to",
"collaborates_with",
"created",
"maintains",
"uses",
"depends_on",
"replaced",
"reviewed",
"deployed",
"tested",
"documented",
"mentors",
"leads",
"contributes_to",
] ]
@@ -136,13 +278,19 @@ class PalaceDataGenerator:
room = self.rng.choice(self.rooms_by_wing[wing]) room = self.rng.choice(self.rooms_by_wing[wing])
needle_id = f"NEEDLE_{i:04d}" needle_id = f"NEEDLE_{i:04d}"
content = f"{needle_id}: {topic}. This is a unique planted needle for recall benchmarking at scale." content = f"{needle_id}: {topic}. This is a unique planted needle for recall benchmarking at scale."
self.needles.append({ self.needles.append(
"id": needle_id, {
"content": content, "id": needle_id,
"wing": wing, "content": content,
"room": room, "wing": wing,
"query": topic.split(" uses ")[0] if " uses " in topic else topic.split(" set to ")[0] if " set to " in topic else topic[:60], "room": room,
}) "query": topic.split(" uses ")[0]
if " uses " in topic
else topic.split(" set to ")[0]
if " set to " in topic
else topic[:60],
}
)
def _random_text(self, min_chars=600, max_chars=900): def _random_text(self, min_chars=600, max_chars=900):
"""Generate a random text block of realistic content.""" """Generate a random text block of realistic content."""
@@ -159,21 +307,25 @@ class PalaceDataGenerator:
component=self.rng.choice(ROOM_NAMES), component=self.rng.choice(ROOM_NAMES),
task=self.rng.choice(TECH_TERMS), task=self.rng.choice(TECH_TERMS),
month=self.rng.choice(["January", "February", "March", "April", "May"]), month=self.rng.choice(["January", "February", "March", "April", "May"]),
quality=self.rng.choice(["performance", "readability", "test coverage", "latency"]), quality=self.rng.choice(
["performance", "readability", "test coverage", "latency"]
),
decision=self.rng.choice(TECH_TERMS), decision=self.rng.choice(TECH_TERMS),
condition=self.rng.choice(TECH_TERMS) + " is null", condition=self.rng.choice(TECH_TERMS) + " is null",
cause=self.rng.choice(["race condition", "null pointer", "timeout", "OOM"]), cause=self.rng.choice(["race condition", "null pointer", "timeout", "OOM"]),
fix="adding " + self.rng.choice(TECH_TERMS), fix="adding " + self.rng.choice(TECH_TERMS),
test_file=f"test_{self.rng.choice(ROOM_NAMES)}.py", test_file=f"test_{self.rng.choice(ROOM_NAMES)}.py",
old_tech=self.rng.choice(["MySQL", "Flask", "REST", "Jenkins"]), old_tech=self.rng.choice(["MySQL", "Flask", "REST", "Jenkins"]),
new_tech=self.rng.choice(["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]), new_tech=self.rng.choice(
["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]
),
reason=self.rng.choice(TECH_TERMS), reason=self.rng.choice(TECH_TERMS),
date=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}", date=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
percent=self.rng.randint(10, 80), percent=self.rng.randint(10, 80),
topic=self.rng.choice(TECH_TERMS), topic=self.rng.choice(TECH_TERMS),
person=self.rng.choice(ENTITY_NAMES), person=self.rng.choice(ENTITY_NAMES),
action=self.rng.choice(["refactor", "migrate", "optimize", "test"]), action=self.rng.choice(["refactor", "migrate", "optimize", "test"]),
deadline=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}", deadline=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
followup=self.rng.choice(TECH_TERMS), followup=self.rng.choice(TECH_TERMS),
feature_name=self.rng.choice(TECH_TERMS), feature_name=self.rng.choice(TECH_TERMS),
capability=self.rng.choice(TECH_TERMS), capability=self.rng.choice(TECH_TERMS),
@@ -182,7 +334,12 @@ class PalaceDataGenerator:
) )
else: else:
words = self.rng.sample(TECH_TERMS, min(5, len(TECH_TERMS))) words = self.rng.sample(TECH_TERMS, min(5, len(TECH_TERMS)))
text = " ".join(words) + ". " + self.rng.choice(TECH_TERMS) + " implementation details follow.\n" text = (
" ".join(words)
+ ". "
+ self.rng.choice(TECH_TERMS)
+ " implementation details follow.\n"
)
parts.append(text) parts.append(text)
total += len(text) total += len(text)
return "\n".join(parts)[:max_chars] return "\n".join(parts)[:max_chars]
@@ -270,15 +427,24 @@ class PalaceDataGenerator:
needle_id = f"drawer_{needle['wing']}_{needle['room']}_{hashlib.md5(needle['id'].encode()).hexdigest()[:16]}" needle_id = f"drawer_{needle['wing']}_{needle['room']}_{hashlib.md5(needle['id'].encode()).hexdigest()[:16]}"
docs.append(needle["content"]) docs.append(needle["content"])
ids.append(needle_id) ids.append(needle_id)
metas.append({ metas.append(
"wing": needle["wing"], {
"room": needle["room"], "wing": needle["wing"],
"source_file": f"needle_{needle['id']}.txt", "room": needle["room"],
"chunk_index": 0, "source_file": f"needle_{needle['id']}.txt",
"added_by": "benchmark", "chunk_index": 0,
"filed_at": datetime.now().isoformat(), "added_by": "benchmark",
}) "filed_at": datetime.now().isoformat(),
needle_info.append({"id": needle_id, "query": needle["query"], "wing": needle["wing"], "room": needle["room"]}) }
)
needle_info.append(
{
"id": needle_id,
"query": needle["query"],
"wing": needle["wing"],
"room": needle["room"],
}
)
# Fill remaining drawers with realistic content # Fill remaining drawers with realistic content
remaining = n_drawers - len(docs) remaining = n_drawers - len(docs)
@@ -291,14 +457,16 @@ class PalaceDataGenerator:
docs.append(content) docs.append(content)
ids.append(drawer_id) ids.append(drawer_id)
metas.append({ metas.append(
"wing": wing, {
"room": room, "wing": wing,
"source_file": f"generated_{i:06d}.txt", "room": room,
"chunk_index": i % 10, "source_file": f"generated_{i:06d}.txt",
"added_by": "benchmark", "chunk_index": i % 10,
"filed_at": datetime.now().isoformat(), "added_by": "benchmark",
}) "filed_at": datetime.now().isoformat(),
}
)
# Flush in batches # Flush in batches
if len(docs) >= batch_size: if len(docs) >= batch_size:
@@ -351,7 +519,9 @@ class PalaceDataGenerator:
valid_to = None valid_to = None
if self.rng.random() < 0.3: if self.rng.random() < 0.3:
end_offset = self.rng.randint(30, 365) end_offset = self.rng.randint(30, 365)
valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime("%Y-%m-%d") valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime(
"%Y-%m-%d"
)
triples.append((subject, predicate, obj, valid_from, valid_to)) triples.append((subject, predicate, obj, valid_from, valid_to))
return entities, triples return entities, triples
@@ -371,24 +541,28 @@ class PalaceDataGenerator:
# Half are needle queries (known-good answers) # Half are needle queries (known-good answers)
n_needle = min(n_queries // 2, len(self.needles)) n_needle = min(n_queries // 2, len(self.needles))
for needle in self.needles[:n_needle]: for needle in self.needles[:n_needle]:
queries.append({ queries.append(
"query": needle["query"], {
"expected_wing": needle["wing"], "query": needle["query"],
"expected_room": needle["room"], "expected_wing": needle["wing"],
"needle_id": needle["id"], "expected_room": needle["room"],
"is_needle": True, "needle_id": needle["id"],
}) "is_needle": True,
}
)
# Other half are generic queries (measure latency, not recall) # Other half are generic queries (measure latency, not recall)
n_generic = n_queries - n_needle n_generic = n_queries - n_needle
for _ in range(n_generic): for _ in range(n_generic):
queries.append({ queries.append(
"query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS), {
"expected_wing": None, "query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS),
"expected_room": None, "expected_wing": None,
"needle_id": None, "expected_room": None,
"is_needle": False, "needle_id": None,
}) "is_needle": False,
}
)
self.rng.shuffle(queries) self.rng.shuffle(queries)
return queries return queries
+44 -17
View File
@@ -45,16 +45,45 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
baseline = json.load(f) baseline = json.load(f)
regressions = [] regressions = []
# Metrics where HIGHER is worse (latency, memory, etc.) # Keywords for metric direction — checked in order, first match wins.
higher_is_worse = { # "improvement" is checked before "latency" so that composite names
"latency", "rss", "memory", "oom", "lock_failures", "elapsed", # like "latency_improvement_pct" are classified correctly.
"p50_ms", "p95_ms", "p99_ms", "rss_delta_mb", "peak_rss_mb", _higher_is_better_kw = [
} "improvement",
# Metrics where LOWER is worse (throughput, recall, etc.) "recall",
lower_is_worse = { "throughput",
"recall", "throughput", "per_sec", "files_per_sec", "drawers_per_sec", "per_sec",
"triples_per_sec", "improvement", "files_per_sec",
} "drawers_per_sec",
"triples_per_sec",
"speedup",
]
_higher_is_worse_kw = [
"latency",
"rss",
"memory",
"oom",
"lock_failures",
"elapsed",
"p50_ms",
"p95_ms",
"p99_ms",
"rss_delta_mb",
"peak_rss_mb",
"errors",
"failures",
]
def _metric_direction(name: str) -> str:
    """Classify a metric name by which direction of change counts as worse.

    The lowercased name is tested for substring matches against the
    higher-is-better keywords first and the higher-is-worse keywords second.

    Returns:
        "higher_better", "higher_worse", or "unknown" when no keyword matches.
    """
    lowered = name.lower()
    # Order matters: the first keyword group with a match decides the result.
    ordered_groups = (
        ("higher_better", _higher_is_better_kw),
        ("higher_worse", _higher_is_worse_kw),
    )
    for verdict, keywords in ordered_groups:
        if any(keyword in lowered for keyword in keywords):
            return verdict
    return "unknown"
for category in baseline.get("results", {}): for category in baseline.get("results", {}):
if category not in current.get("results", {}): if category not in current.get("results", {}):
@@ -68,23 +97,21 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
if base_val == 0: if base_val == 0:
continue continue
# Determine direction direction = _metric_direction(metric)
is_latency_like = any(kw in metric.lower() for kw in higher_is_worse)
is_throughput_like = any(kw in metric.lower() for kw in lower_is_worse)
if is_latency_like: if direction == "higher_worse":
# Higher is worse — check if current exceeds baseline by threshold # Higher is worse — check if current exceeds baseline by threshold
if curr_val > base_val * (1 + threshold): if curr_val > base_val * (1 + threshold):
pct = ((curr_val - base_val) / base_val) * 100 pct = ((curr_val - base_val) / base_val) * 100
regressions.append( regressions.append(
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)" f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
) )
elif is_throughput_like: elif direction == "higher_better":
# Lower is worse — check if current is below baseline by threshold # Lower is worse — check if current is below baseline by threshold
if curr_val < base_val * (1 - threshold): if curr_val < base_val * (1 - threshold):
pct = ((curr_val - base_val) / base_val) * 100 pct = ((curr_val - base_val) / base_val) * 100
regressions.append( regressions.append(
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)" f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
) )
return regressions return regressions
+4 -1
View File
@@ -143,7 +143,10 @@ class TestBulkInsertPerformance:
batch_end = min(batch_start + batch_size, n_docs) batch_end = min(batch_start + batch_size, n_docs)
batch_docs = contents[batch_start:batch_end] batch_docs = contents[batch_start:batch_end]
batch_ids = [f"batch_{i}" for i in range(batch_start, batch_end)] batch_ids = [f"batch_{i}" for i in range(batch_start, batch_end)]
batch_metas = [{"wing": "test", "room": "bench", "chunk_index": i} for i in range(batch_start, batch_end)] batch_metas = [
{"wing": "test", "room": "bench", "chunk_index": i}
for i in range(batch_start, batch_end)
]
col_batch.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas) col_batch.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
batched_ms = (time.perf_counter() - start) * 1000 batched_ms = (time.perf_counter() - start) * 1000
+8 -2
View File
@@ -125,7 +125,9 @@ class TestChunkThroughput:
chunks_per_sec = total_chunks / max(elapsed, 0.001) chunks_per_sec = total_chunks / max(elapsed, 0.001)
kb_per_sec = (len(content) * n_iterations / 1024) / max(elapsed, 0.001) kb_per_sec = (len(content) * n_iterations / 1024) / max(elapsed, 0.001)
record_metric("chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1)) record_metric(
"chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1)
)
record_metric("chunking", f"kb_per_sec_at_{content_size_kb}kb", round(kb_per_sec, 1)) record_metric("chunking", f"kb_per_sec_at_{content_size_kb}kb", round(kb_per_sec, 1))
@@ -160,4 +162,8 @@ class TestReingestSkipOverhead:
record_metric("reingest", "skip_check_elapsed_sec", round(skip_elapsed, 2)) record_metric("reingest", "skip_check_elapsed_sec", round(skip_elapsed, 2))
record_metric("reingest", "files_checked", files_written) record_metric("reingest", "files_checked", files_written)
record_metric("reingest", "skip_check_per_file_ms", round(skip_elapsed * 1000 / max(files_written, 1), 1)) record_metric(
"reingest",
"skip_check_per_file_ms",
round(skip_elapsed * 1000 / max(files_written, 1), 1),
)
+17 -7
View File
@@ -36,9 +36,7 @@ class TestTripleInsertionRate:
# Measure triple insertion # Measure triple insertion
start = time.perf_counter() start = time.perf_counter()
for subject, predicate, obj, valid_from, valid_to in triples: for subject, predicate, obj, valid_from, valid_to in triples:
kg.add_triple( kg.add_triple(subject, predicate, obj, valid_from=valid_from, valid_to=valid_to)
subject, predicate, obj, valid_from=valid_from, valid_to=valid_to
)
elapsed = time.perf_counter() - start elapsed = time.perf_counter() - start
triples_per_sec = n_triples / max(elapsed, 0.001) triples_per_sec = n_triples / max(elapsed, 0.001)
@@ -128,7 +126,9 @@ class TestTemporalQueryAccuracy:
kg.add_entity("ProjectB", "project") kg.add_entity("ProjectB", "project")
# Alice worked on ProjectA from 2024-01 to 2024-06 # Alice worked on ProjectA from 2024-01 to 2024-06
kg.add_triple("Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30") kg.add_triple(
"Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30"
)
# Alice worked on ProjectB from 2024-07 onwards # Alice worked on ProjectB from 2024-07 onwards
kg.add_triple("Alice", "works_on", "ProjectB", valid_from="2024-07-01") kg.add_triple("Alice", "works_on", "ProjectB", valid_from="2024-07-01")
@@ -145,8 +145,16 @@ class TestTemporalQueryAccuracy:
# Query Alice as of September 2024 — should find ProjectB # Query Alice as of September 2024 — should find ProjectB
result_sept = kg.query_entity("Alice", as_of="2024-09-15") result_sept = kg.query_entity("Alice", as_of="2024-09-15")
record_metric("kg_temporal", "march_query_results", len(result_march) if isinstance(result_march, list) else 0) record_metric(
record_metric("kg_temporal", "sept_query_results", len(result_sept) if isinstance(result_sept, list) else 0) "kg_temporal",
"march_query_results",
len(result_march) if isinstance(result_march, list) else 0,
)
record_metric(
"kg_temporal",
"sept_query_results",
len(result_sept) if isinstance(result_sept, list) else 0,
)
@pytest.mark.benchmark @pytest.mark.benchmark
@@ -230,7 +238,9 @@ class TestSQLiteConcurrentAccess:
fails = 0 fails = 0
for i in range(50): for i in range(50):
try: try:
kg.add_triple(f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01") kg.add_triple(
f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01"
)
except Exception: except Exception:
fails += 1 fails += 1
write_errors.append(fails) write_errors.append(fails)
+6 -2
View File
@@ -116,7 +116,9 @@ class TestLayer1UnboundedFetch:
record_metric("layer1_filter", "unfiltered_ms", round(unfiltered_ms, 1)) record_metric("layer1_filter", "unfiltered_ms", round(unfiltered_ms, 1))
record_metric("layer1_filter", "filtered_ms", round(filtered_ms, 1)) record_metric("layer1_filter", "filtered_ms", round(filtered_ms, 1))
if unfiltered_ms > 0: if unfiltered_ms > 0:
record_metric("layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1)) record_metric(
"layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1)
)
@pytest.mark.benchmark @pytest.mark.benchmark
@@ -146,7 +148,9 @@ class TestWakeUpTokenBudget:
record_metric("wakeup_budget", f"tokens_at_{n_drawers}", token_estimate) record_metric("wakeup_budget", f"tokens_at_{n_drawers}", token_estimate)
record_metric("wakeup_budget", f"chars_at_{n_drawers}", len(text)) record_metric("wakeup_budget", f"chars_at_{n_drawers}", len(text))
assert token_estimate < 1200, f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers" assert token_estimate < 1200, (
f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers"
)
@pytest.mark.benchmark @pytest.mark.benchmark
+10 -6
View File
@@ -63,7 +63,9 @@ class TestSearchMemoryProfile:
record_metric("memory_search", "rss_end_mb", round(end_rss, 2)) record_metric("memory_search", "rss_end_mb", round(end_rss, 2))
record_metric("memory_search", "rss_growth_mb", round(growth, 2)) record_metric("memory_search", "rss_growth_mb", round(growth, 2))
record_metric("memory_search", "n_calls", n_calls) record_metric("memory_search", "n_calls", n_calls)
record_metric("memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2)) record_metric(
"memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2)
)
@pytest.mark.benchmark @pytest.mark.benchmark
@@ -166,11 +168,13 @@ class TestHeapSnapshot:
stats = snap_after.compare_to(snap_before, "lineno") stats = snap_after.compare_to(snap_before, "lineno")
top_allocators = [] top_allocators = []
for stat in stats[:10]: for stat in stats[:10]:
top_allocators.append({ top_allocators.append(
"file": str(stat.traceback), {
"size_kb": round(stat.size / 1024, 1), "file": str(stat.traceback),
"count": stat.count, "size_kb": round(stat.size / 1024, 1),
}) "count": stat.count,
}
)
total_growth_kb = sum(s["size_kb"] for s in top_allocators) total_growth_kb = sum(s["size_kb"] for s in top_allocators)
record_metric("heap_search", "top_10_growth_kb", round(total_growth_kb, 1)) record_metric("heap_search", "top_10_growth_kb", round(total_growth_kb, 1))
+6 -2
View File
@@ -123,8 +123,12 @@ class TestFilterLatencyBenefit:
record_metric("filter_latency", "avg_wing_filtered_ms", round(avg_wing, 1)) record_metric("filter_latency", "avg_wing_filtered_ms", round(avg_wing, 1))
record_metric("filter_latency", "avg_room_filtered_ms", round(avg_room, 1)) record_metric("filter_latency", "avg_room_filtered_ms", round(avg_room, 1))
if avg_none > 0: if avg_none > 0:
record_metric("filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1)) record_metric(
record_metric("filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1)) "filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1)
)
record_metric(
"filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1)
)
@pytest.mark.benchmark @pytest.mark.benchmark
+20 -16
View File
@@ -61,14 +61,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10):
drawer_id = f"drawer_single_room_{hashlib.md5(needle_id.encode()).hexdigest()[:16]}" drawer_id = f"drawer_single_room_{hashlib.md5(needle_id.encode()).hexdigest()[:16]}"
docs.append(content) docs.append(content)
ids.append(drawer_id) ids.append(drawer_id)
metas.append({ metas.append(
"wing": "concentrated", {
"room": "single_room", "wing": "concentrated",
"source_file": f"needle_{i}.txt", "room": "single_room",
"chunk_index": 0, "source_file": f"needle_{i}.txt",
"added_by": "threshold_bench", "chunk_index": 0,
"filed_at": datetime.now().isoformat(), "added_by": "threshold_bench",
}) "filed_at": datetime.now().isoformat(),
}
)
# Fill with noise — all in the SAME room # Fill with noise — all in the SAME room
remaining = n_drawers - len(docs) remaining = n_drawers - len(docs)
@@ -77,14 +79,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10):
drawer_id = f"drawer_single_room_{hashlib.md5(f'noise_{i}'.encode()).hexdigest()[:16]}" drawer_id = f"drawer_single_room_{hashlib.md5(f'noise_{i}'.encode()).hexdigest()[:16]}"
docs.append(content) docs.append(content)
ids.append(drawer_id) ids.append(drawer_id)
metas.append({ metas.append(
"wing": "concentrated", {
"room": "single_room", "wing": "concentrated",
"source_file": f"noise_{i:06d}.txt", "room": "single_room",
"chunk_index": i % 10, "source_file": f"noise_{i:06d}.txt",
"added_by": "threshold_bench", "chunk_index": i % 10,
"filed_at": datetime.now().isoformat(), "added_by": "threshold_bench",
}) "filed_at": datetime.now().isoformat(),
}
)
if len(docs) >= batch_size: if len(docs) >= batch_size:
col.add(documents=docs, ids=ids, metadatas=metas) col.add(documents=docs, ids=ids, metadatas=metas)
+17 -8
View File
@@ -77,9 +77,7 @@ class TestSearchRecallAtScale:
total_needle_queries = min(10, len(needle_info)) total_needle_queries = min(10, len(needle_info))
for needle in needle_info[:total_needle_queries]: for needle in needle_info[:total_needle_queries]:
result = search_memories( result = search_memories(needle["query"], palace_path=palace_path, n_results=10)
needle["query"], palace_path=palace_path, n_results=10
)
if "error" in result: if "error" in result:
continue continue
@@ -150,8 +148,12 @@ class TestSearchFilteredVsUnfiltered:
record_metric("search_filter", "avg_unfiltered_ms", round(avg_unfiltered, 1)) record_metric("search_filter", "avg_unfiltered_ms", round(avg_unfiltered, 1))
record_metric("search_filter", "avg_filtered_ms", round(avg_filtered, 1)) record_metric("search_filter", "avg_filtered_ms", round(avg_filtered, 1))
record_metric("search_filter", "latency_improvement_pct", round(latency_improvement, 1)) record_metric("search_filter", "latency_improvement_pct", round(latency_improvement, 1))
record_metric("search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3)) record_metric(
record_metric("search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3)) "search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3)
)
record_metric(
"search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3)
)
@pytest.mark.benchmark @pytest.mark.benchmark
@@ -167,9 +169,16 @@ class TestConcurrentSearch:
from mempalace.searcher import search_memories from mempalace.searcher import search_memories
queries = [ queries = [
"authentication", "database", "deployment", "error handling", "authentication",
"testing", "monitoring", "caching", "middleware", "database",
"serialization", "validation", "deployment",
"error handling",
"testing",
"monitoring",
"caching",
"middleware",
"serialization",
"validation",
] * 3 # 30 total queries ] * 3 # 30 total queries
def run_search(query): def run_search(query):