From ebc26f396052f91d12c04fa9cc6f90f278a27e8f Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Wed, 8 Apr 2026 10:56:39 -0300 Subject: [PATCH] fix: resolve formatting, regression logic, and pytest defaults - Run ruff format on all benchmark files (fixes CI lint job) - Fix check_regression() substring ambiguity: ordered keyword matching so "latency_improvement_pct" is correctly classified as higher-is-better - Update stale comments in conftest.py referencing wrong fixture - Add pytest addopts to skip benchmark/slow/stress markers by default --- pyproject.toml | 1 + tests/benchmarks/conftest.py | 5 +- tests/benchmarks/data_generator.py | 322 ++++++++++++++---- tests/benchmarks/report.py | 61 +++- tests/benchmarks/test_chromadb_stress.py | 5 +- tests/benchmarks/test_ingest_bench.py | 10 +- .../benchmarks/test_knowledge_graph_bench.py | 24 +- tests/benchmarks/test_layers_bench.py | 8 +- tests/benchmarks/test_memory_profile.py | 16 +- tests/benchmarks/test_palace_boost.py | 8 +- tests/benchmarks/test_recall_threshold.py | 36 +- tests/benchmarks/test_search_bench.py | 25 +- 12 files changed, 383 insertions(+), 138 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 92e6732..9166a43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,7 @@ quote-style = "double" [tool.pytest.ini_options] testpaths = ["tests"] pythonpath = ["."] +addopts = "-m 'not benchmark and not slow and not stress'" markers = [ "benchmark: scale/performance benchmark tests", "slow: tests that take more than 30 seconds", diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py index 8852a3b..bd3f201 100644 --- a/tests/benchmarks/conftest.py +++ b/tests/benchmarks/conftest.py @@ -96,8 +96,7 @@ def pytest_terminal_summary(terminalreporter, config): if not report_path: return - # Collect results from the session fixture if available - # The results are written by individual tests via bench_results fixture + # Collect results written by individual tests via record_metric() import platform import subprocess @@ -129,7 +128,7 @@ def pytest_terminal_summary(terminalreporter, config): "results": {}, } - # Read results from a temp file written by the bench_results fixture + # Read results from the temp file written by record_metric() results_file = os.path.join(tempfile.gettempdir(), "mempalace_bench_results.json") if os.path.exists(results_file): try: diff --git a/tests/benchmarks/data_generator.py b/tests/benchmarks/data_generator.py index 7dd093b..0184239 100644 --- a/tests/benchmarks/data_generator.py +++ b/tests/benchmarks/data_generator.py @@ -20,42 +20,150 @@ import yaml # ── Scale configurations ───────────────────────────────────────────────── SCALE_CONFIGS = { - "small": {"drawers": 1_000, "wings": 3, "rooms_per_wing": 5, "kg_entities": 50, "kg_triples": 200, "needles": 20, "search_queries": 20}, - "medium": {"drawers": 10_000, "wings": 8, "rooms_per_wing": 12, "kg_entities": 200, "kg_triples": 2_000, "needles": 50, "search_queries": 50}, - "large": {"drawers": 50_000, "wings": 15, "rooms_per_wing": 20, "kg_entities": 500, "kg_triples": 10_000, "needles": 100, "search_queries": 100}, - "stress": {"drawers": 100_000, "wings": 25, "rooms_per_wing": 30, "kg_entities": 1_000, "kg_triples": 50_000, "needles": 200, "search_queries": 200}, + "small": { + "drawers": 1_000, + "wings": 3, + "rooms_per_wing": 5, + "kg_entities": 50, + "kg_triples": 200, + "needles": 20, + "search_queries": 20, + }, + "medium": { + "drawers": 10_000, + "wings": 8, + "rooms_per_wing": 12, + "kg_entities": 200, + "kg_triples": 2_000, + "needles": 50, + "search_queries": 50, + }, + "large": { + "drawers": 50_000, + "wings": 15, + "rooms_per_wing": 20, + "kg_entities": 500, + "kg_triples": 10_000, + "needles": 100, + "search_queries": 100, + }, + "stress": { + "drawers": 100_000, + "wings": 25, + "rooms_per_wing": 30, + "kg_entities": 1_000, + "kg_triples": 50_000, + "needles": 200, + "search_queries": 200, + }, } # ── Vocabulary banks for realistic content ─────────────────────────────── WING_NAMES = [ - "webapp", "backend_api", "mobile_app", "data_pipeline", "ml_platform", - "devops", "auth_service", "payments", "analytics", "docs_site", - "cli_tool", "dashboard", "notification_service", "search_engine", - "user_mgmt", "inventory", "reporting", "testing_infra", "monitoring", - "email_service", "chat_bot", "file_storage", "scheduler", "gateway", + "webapp", + "backend_api", + "mobile_app", + "data_pipeline", + "ml_platform", + "devops", + "auth_service", + "payments", + "analytics", + "docs_site", + "cli_tool", + "dashboard", + "notification_service", + "search_engine", + "user_mgmt", + "inventory", + "reporting", + "testing_infra", + "monitoring", + "email_service", + "chat_bot", + "file_storage", + "scheduler", + "gateway", "marketplace", ] ROOM_NAMES = [ - "backend", "frontend", "api", "database", "auth", "tests", "docs", - "config", "deployment", "models", "views", "controllers", "middleware", - "utils", "schemas", "migrations", "fixtures", "scripts", "styles", - "components", "hooks", "services", "routes", "templates", "static", - "media", "logging", "cache", "queue", "workers", + "backend", + "frontend", + "api", + "database", + "auth", + "tests", + "docs", + "config", + "deployment", + "models", + "views", + "controllers", + "middleware", + "utils", + "schemas", + "migrations", + "fixtures", + "scripts", + "styles", + "components", + "hooks", + "services", + "routes", + "templates", + "static", + "media", + "logging", + "cache", + "queue", + "workers", ] TECH_TERMS = [ - "authentication", "authorization", "middleware", "endpoint", "REST API", - "GraphQL", "WebSocket", "database migration", "ORM", "query optimization", - "caching strategy", "load balancer", "rate limiting", "pagination", - "serialization", "validation", "error handling", "logging framework", - "monitoring", "deployment pipeline", "CI/CD", "containerization", - "microservice", "event sourcing", "message queue", "pub/sub", - "connection pooling", "session management", "token refresh", "CORS", - "SSL termination", "health check", "circuit breaker", "retry logic", - "batch processing", "stream processing", "data pipeline", "ETL", - "feature flag", "A/B testing", "blue-green deployment", "canary release", + "authentication", + "authorization", + "middleware", + "endpoint", + "REST API", + "GraphQL", + "WebSocket", + "database migration", + "ORM", + "query optimization", + "caching strategy", + "load balancer", + "rate limiting", + "pagination", + "serialization", + "validation", + "error handling", + "logging framework", + "monitoring", + "deployment pipeline", + "CI/CD", + "containerization", + "microservice", + "event sourcing", + "message queue", + "pub/sub", + "connection pooling", + "session management", + "token refresh", + "CORS", + "SSL termination", + "health check", + "circuit breaker", + "retry logic", + "batch processing", + "stream processing", + "data pipeline", + "ETL", + "feature flag", + "A/B testing", + "blue-green deployment", + "canary release", ] CODE_SNIPPETS = [ @@ -75,17 +183,51 @@ PROSE_TEMPLATES = [ ] ENTITY_NAMES = [ - "Alice", "Bob", "Carol", "Dave", "Eve", "Frank", "Grace", "Heidi", - "Ivan", "Judy", "Karl", "Linda", "Mike", "Nina", "Oscar", "Pat", - "Quinn", "Rita", "Steve", "Tina", "Ursula", "Victor", "Wendy", "Xander", + "Alice", + "Bob", + "Carol", + "Dave", + "Eve", + "Frank", + "Grace", + "Heidi", + "Ivan", + "Judy", + "Karl", + "Linda", + "Mike", + "Nina", + "Oscar", + "Pat", + "Quinn", + "Rita", + "Steve", + "Tina", + "Ursula", + "Victor", + "Wendy", + "Xander", ] ENTITY_TYPES = ["person", "project", "tool", "concept", "team", "service"] PREDICATES = [ - "works_on", "manages", "reports_to", "collaborates_with", "created", - "maintains", "uses", "depends_on", "replaced", "reviewed", "deployed", - "tested", "documented", "mentors", "leads", "contributes_to", + "works_on", + "manages", + "reports_to", + "collaborates_with", + "created", + "maintains", + "uses", + "depends_on", + "replaced", + "reviewed", + "deployed", + "tested", + "documented", + "mentors", + "leads", + "contributes_to", ] @@ -136,13 +278,19 @@ class PalaceDataGenerator: room = self.rng.choice(self.rooms_by_wing[wing]) needle_id = f"NEEDLE_{i:04d}" content = f"{needle_id}: {topic}. This is a unique planted needle for recall benchmarking at scale." - self.needles.append({ - "id": needle_id, - "content": content, - "wing": wing, - "room": room, - "query": topic.split(" uses ")[0] if " uses " in topic else topic.split(" set to ")[0] if " set to " in topic else topic[:60], - }) + self.needles.append( + { + "id": needle_id, + "content": content, + "wing": wing, + "room": room, + "query": topic.split(" uses ")[0] + if " uses " in topic + else topic.split(" set to ")[0] + if " set to " in topic + else topic[:60], + } + ) def _random_text(self, min_chars=600, max_chars=900): """Generate a random text block of realistic content.""" @@ -159,21 +307,25 @@ class PalaceDataGenerator: component=self.rng.choice(ROOM_NAMES), task=self.rng.choice(TECH_TERMS), month=self.rng.choice(["January", "February", "March", "April", "May"]), - quality=self.rng.choice(["performance", "readability", "test coverage", "latency"]), + quality=self.rng.choice( + ["performance", "readability", "test coverage", "latency"] + ), decision=self.rng.choice(TECH_TERMS), condition=self.rng.choice(TECH_TERMS) + " is null", cause=self.rng.choice(["race condition", "null pointer", "timeout", "OOM"]), fix="adding " + self.rng.choice(TECH_TERMS), test_file=f"test_{self.rng.choice(ROOM_NAMES)}.py", old_tech=self.rng.choice(["MySQL", "Flask", "REST", "Jenkins"]), - new_tech=self.rng.choice(["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]), + new_tech=self.rng.choice( + ["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"] + ), reason=self.rng.choice(TECH_TERMS), - date=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}", + date=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}", percent=self.rng.randint(10, 80), topic=self.rng.choice(TECH_TERMS), person=self.rng.choice(ENTITY_NAMES), action=self.rng.choice(["refactor", "migrate", "optimize", "test"]), - deadline=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}", + deadline=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}", followup=self.rng.choice(TECH_TERMS), feature_name=self.rng.choice(TECH_TERMS), capability=self.rng.choice(TECH_TERMS), @@ -182,7 +334,12 @@ class PalaceDataGenerator: ) else: words = self.rng.sample(TECH_TERMS, min(5, len(TECH_TERMS))) - text = " ".join(words) + ". " + self.rng.choice(TECH_TERMS) + " implementation details follow.\n" + text = ( + " ".join(words) + + ". " + + self.rng.choice(TECH_TERMS) + + " implementation details follow.\n" + ) parts.append(text) total += len(text) return "\n".join(parts)[:max_chars] @@ -270,15 +427,24 @@ class PalaceDataGenerator: needle_id = f"drawer_{needle['wing']}_{needle['room']}_{hashlib.md5(needle['id'].encode()).hexdigest()[:16]}" docs.append(needle["content"]) ids.append(needle_id) - metas.append({ - "wing": needle["wing"], - "room": needle["room"], - "source_file": f"needle_{needle['id']}.txt", - "chunk_index": 0, - "added_by": "benchmark", - "filed_at": datetime.now().isoformat(), - }) - needle_info.append({"id": needle_id, "query": needle["query"], "wing": needle["wing"], "room": needle["room"]}) + metas.append( + { + "wing": needle["wing"], + "room": needle["room"], + "source_file": f"needle_{needle['id']}.txt", + "chunk_index": 0, + "added_by": "benchmark", + "filed_at": datetime.now().isoformat(), + } + ) + needle_info.append( + { + "id": needle_id, + "query": needle["query"], + "wing": needle["wing"], + "room": needle["room"], + } + ) # Fill remaining drawers with realistic content remaining = n_drawers - len(docs) @@ -291,14 +457,16 @@ class PalaceDataGenerator: docs.append(content) ids.append(drawer_id) - metas.append({ - "wing": wing, - "room": room, - "source_file": f"generated_{i:06d}.txt", - "chunk_index": i % 10, - "added_by": "benchmark", - "filed_at": datetime.now().isoformat(), - }) + metas.append( + { + "wing": wing, + "room": room, + "source_file": f"generated_{i:06d}.txt", + "chunk_index": i % 10, + "added_by": "benchmark", + "filed_at": datetime.now().isoformat(), + } + ) # Flush in batches if len(docs) >= batch_size: @@ -351,7 +519,9 @@ class PalaceDataGenerator: valid_to = None if self.rng.random() < 0.3: end_offset = self.rng.randint(30, 365) - valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime("%Y-%m-%d") + valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime( + "%Y-%m-%d" + ) triples.append((subject, predicate, obj, valid_from, valid_to)) return entities, triples @@ -371,24 +541,28 @@ class PalaceDataGenerator: # Half are needle queries (known-good answers) n_needle = min(n_queries // 2, len(self.needles)) for needle in self.needles[:n_needle]: - queries.append({ - "query": needle["query"], - "expected_wing": needle["wing"], - "expected_room": needle["room"], - "needle_id": needle["id"], - "is_needle": True, - }) + queries.append( + { + "query": needle["query"], + "expected_wing": needle["wing"], + "expected_room": needle["room"], + "needle_id": needle["id"], + "is_needle": True, + } + ) # Other half are generic queries (measure latency, not recall) n_generic = n_queries - n_needle for _ in range(n_generic): - queries.append({ - "query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS), - "expected_wing": None, - "expected_room": None, - "needle_id": None, - "is_needle": False, - }) + queries.append( + { + "query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS), + "expected_wing": None, + "expected_room": None, + "needle_id": None, + "is_needle": False, + } + ) self.rng.shuffle(queries) return queries diff --git a/tests/benchmarks/report.py b/tests/benchmarks/report.py index 87009ca..61ac937 100644 --- a/tests/benchmarks/report.py +++ b/tests/benchmarks/report.py @@ -45,16 +45,45 @@ def check_regression(current_report: str, baseline_report: str, threshold: float baseline = json.load(f) regressions = [] - # Metrics where HIGHER is worse (latency, memory, etc.) - higher_is_worse = { - "latency", "rss", "memory", "oom", "lock_failures", "elapsed", - "p50_ms", "p95_ms", "p99_ms", "rss_delta_mb", "peak_rss_mb", - } - # Metrics where LOWER is worse (throughput, recall, etc.) - lower_is_worse = { - "recall", "throughput", "per_sec", "files_per_sec", "drawers_per_sec", - "triples_per_sec", "improvement", - } + # Keywords for metric direction — checked in order, first match wins. + # "improvement" is checked before "latency" so that composite names + # like "latency_improvement_pct" are classified correctly. + _higher_is_better_kw = [ + "improvement", + "recall", + "throughput", + "per_sec", + "files_per_sec", + "drawers_per_sec", + "triples_per_sec", + "speedup", + ] + _higher_is_worse_kw = [ + "latency", + "rss", + "memory", + "oom", + "lock_failures", + "elapsed", + "p50_ms", + "p95_ms", + "p99_ms", + "rss_delta_mb", + "peak_rss_mb", + "errors", + "failures", + ] + + def _metric_direction(name: str) -> str: + """Return 'higher_better', 'higher_worse', or 'unknown'.""" + low = name.lower() + for kw in _higher_is_better_kw: + if kw in low: + return "higher_better" + for kw in _higher_is_worse_kw: + if kw in low: + return "higher_worse" + return "unknown" for category in baseline.get("results", {}): if category not in current.get("results", {}): @@ -68,23 +97,21 @@ def check_regression(current_report: str, baseline_report: str, threshold: float if base_val == 0: continue - # Determine direction - is_latency_like = any(kw in metric.lower() for kw in higher_is_worse) - is_throughput_like = any(kw in metric.lower() for kw in lower_is_worse) + direction = _metric_direction(metric) - if is_latency_like: + if direction == "higher_worse": # Higher is worse — check if current exceeds baseline by threshold if curr_val > base_val * (1 + threshold): pct = ((curr_val - base_val) / base_val) * 100 regressions.append( - f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)" + f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)" ) - elif is_throughput_like: + elif direction == "higher_better": # Lower is worse — check if current is below baseline by threshold if curr_val < base_val * (1 - threshold): pct = ((curr_val - base_val) / base_val) * 100 regressions.append( - f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)" + f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)" ) return regressions diff --git a/tests/benchmarks/test_chromadb_stress.py b/tests/benchmarks/test_chromadb_stress.py index 4e998a0..1a77529 100644 --- a/tests/benchmarks/test_chromadb_stress.py +++ b/tests/benchmarks/test_chromadb_stress.py @@ -143,7 +143,10 @@ class TestBulkInsertPerformance: batch_end = min(batch_start + batch_size, n_docs) batch_docs = contents[batch_start:batch_end] batch_ids = [f"batch_{i}" for i in range(batch_start, batch_end)] - batch_metas = [{"wing": "test", "room": "bench", "chunk_index": i} for i in range(batch_start, batch_end)] + batch_metas = [ + {"wing": "test", "room": "bench", "chunk_index": i} + for i in range(batch_start, batch_end) + ] col_batch.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas) batched_ms = (time.perf_counter() - start) * 1000 diff --git a/tests/benchmarks/test_ingest_bench.py b/tests/benchmarks/test_ingest_bench.py index 6703d11..2b4ea5b 100644 --- a/tests/benchmarks/test_ingest_bench.py +++ b/tests/benchmarks/test_ingest_bench.py @@ -125,7 +125,9 @@ class TestChunkThroughput: chunks_per_sec = total_chunks / max(elapsed, 0.001) kb_per_sec = (len(content) * n_iterations / 1024) / max(elapsed, 0.001) - record_metric("chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1)) + record_metric( + "chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1) + ) record_metric("chunking", f"kb_per_sec_at_{content_size_kb}kb", round(kb_per_sec, 1)) @@ -160,4 +162,8 @@ class TestReingestSkipOverhead: record_metric("reingest", "skip_check_elapsed_sec", round(skip_elapsed, 2)) record_metric("reingest", "files_checked", files_written) - record_metric("reingest", "skip_check_per_file_ms", round(skip_elapsed * 1000 / max(files_written, 1), 1)) + record_metric( + "reingest", + "skip_check_per_file_ms", + round(skip_elapsed * 1000 / max(files_written, 1), 1), + ) diff --git a/tests/benchmarks/test_knowledge_graph_bench.py b/tests/benchmarks/test_knowledge_graph_bench.py index c9897fb..60236bc 100644 --- a/tests/benchmarks/test_knowledge_graph_bench.py +++ b/tests/benchmarks/test_knowledge_graph_bench.py @@ -36,9 +36,7 @@ class TestTripleInsertionRate: # Measure triple insertion start = time.perf_counter() for subject, predicate, obj, valid_from, valid_to in triples: - kg.add_triple( - subject, predicate, obj, valid_from=valid_from, valid_to=valid_to - ) + kg.add_triple(subject, predicate, obj, valid_from=valid_from, valid_to=valid_to) elapsed = time.perf_counter() - start triples_per_sec = n_triples / max(elapsed, 0.001) @@ -128,7 +126,9 @@ class TestTemporalQueryAccuracy: kg.add_entity("ProjectB", "project") # Alice worked on ProjectA from 2024-01 to 2024-06 - kg.add_triple("Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30") + kg.add_triple( + "Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30" + ) # Alice worked on ProjectB from 2024-07 onwards kg.add_triple("Alice", "works_on", "ProjectB", valid_from="2024-07-01") @@ -145,8 +145,16 @@ class TestTemporalQueryAccuracy: # Query Alice as of September 2024 — should find ProjectB result_sept = kg.query_entity("Alice", as_of="2024-09-15") - record_metric("kg_temporal", "march_query_results", len(result_march) if isinstance(result_march, list) else 0) - record_metric("kg_temporal", "sept_query_results", len(result_sept) if isinstance(result_sept, list) else 0) + record_metric( + "kg_temporal", + "march_query_results", + len(result_march) if isinstance(result_march, list) else 0, + ) + record_metric( + "kg_temporal", + "sept_query_results", + len(result_sept) if isinstance(result_sept, list) else 0, + ) @pytest.mark.benchmark @@ -230,7 +238,9 @@ class TestSQLiteConcurrentAccess: fails = 0 for i in range(50): try: - kg.add_triple(f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01") + kg.add_triple( + f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01" + ) except Exception: fails += 1 write_errors.append(fails) diff --git a/tests/benchmarks/test_layers_bench.py b/tests/benchmarks/test_layers_bench.py index 2237209..b9604d7 100644 --- a/tests/benchmarks/test_layers_bench.py +++ b/tests/benchmarks/test_layers_bench.py @@ -116,7 +116,9 @@ class TestLayer1UnboundedFetch: record_metric("layer1_filter", "unfiltered_ms", round(unfiltered_ms, 1)) record_metric("layer1_filter", "filtered_ms", round(filtered_ms, 1)) if unfiltered_ms > 0: - record_metric("layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1)) + record_metric( + "layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1) + ) @pytest.mark.benchmark @@ -146,7 +148,9 @@ class TestWakeUpTokenBudget: record_metric("wakeup_budget", f"tokens_at_{n_drawers}", token_estimate) record_metric("wakeup_budget", f"chars_at_{n_drawers}", len(text)) - assert token_estimate < 1200, f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers" + assert token_estimate < 1200, ( + f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers" + ) @pytest.mark.benchmark diff --git a/tests/benchmarks/test_memory_profile.py b/tests/benchmarks/test_memory_profile.py index 769c501..b299b2d 100644 --- a/tests/benchmarks/test_memory_profile.py +++ b/tests/benchmarks/test_memory_profile.py @@ -63,7 +63,9 @@ class TestSearchMemoryProfile: record_metric("memory_search", "rss_end_mb", round(end_rss, 2)) record_metric("memory_search", "rss_growth_mb", round(growth, 2)) record_metric("memory_search", "n_calls", n_calls) - record_metric("memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2)) + record_metric( + "memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2) + ) @pytest.mark.benchmark @@ -166,11 +168,13 @@ class TestHeapSnapshot: stats = snap_after.compare_to(snap_before, "lineno") top_allocators = [] for stat in stats[:10]: - top_allocators.append({ - "file": str(stat.traceback), - "size_kb": round(stat.size / 1024, 1), - "count": stat.count, - }) + top_allocators.append( + { + "file": str(stat.traceback), + "size_kb": round(stat.size / 1024, 1), + "count": stat.count, + } + ) total_growth_kb = sum(s["size_kb"] for s in top_allocators) record_metric("heap_search", "top_10_growth_kb", round(total_growth_kb, 1)) diff --git a/tests/benchmarks/test_palace_boost.py b/tests/benchmarks/test_palace_boost.py index 6994313..ca90784 100644 --- a/tests/benchmarks/test_palace_boost.py +++ b/tests/benchmarks/test_palace_boost.py @@ -123,8 +123,12 @@ class TestFilterLatencyBenefit: record_metric("filter_latency", "avg_wing_filtered_ms", round(avg_wing, 1)) record_metric("filter_latency", "avg_room_filtered_ms", round(avg_room, 1)) if avg_none > 0: - record_metric("filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1)) - record_metric("filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1)) + record_metric( + "filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1) + ) + record_metric( + "filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1) + ) @pytest.mark.benchmark diff --git a/tests/benchmarks/test_recall_threshold.py b/tests/benchmarks/test_recall_threshold.py index e2c14ac..afe2323 100644 --- a/tests/benchmarks/test_recall_threshold.py +++ b/tests/benchmarks/test_recall_threshold.py @@ -61,14 +61,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10): drawer_id = f"drawer_single_room_{hashlib.md5(needle_id.encode()).hexdigest()[:16]}" docs.append(content) ids.append(drawer_id) - metas.append({ - "wing": "concentrated", - "room": "single_room", - "source_file": f"needle_{i}.txt", - "chunk_index": 0, - "added_by": "threshold_bench", - "filed_at": datetime.now().isoformat(), - }) + metas.append( + { + "wing": "concentrated", + "room": "single_room", + "source_file": f"needle_{i}.txt", + "chunk_index": 0, + "added_by": "threshold_bench", + "filed_at": datetime.now().isoformat(), + } + ) # Fill with noise — all in the SAME room remaining = n_drawers - len(docs) @@ -77,14 +79,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10): drawer_id = f"drawer_single_room_{hashlib.md5(f'noise_{i}'.encode()).hexdigest()[:16]}" docs.append(content) ids.append(drawer_id) - metas.append({ - "wing": "concentrated", - "room": "single_room", - "source_file": f"noise_{i:06d}.txt", - "chunk_index": i % 10, - "added_by": "threshold_bench", - "filed_at": datetime.now().isoformat(), - }) + metas.append( + { + "wing": "concentrated", + "room": "single_room", + "source_file": f"noise_{i:06d}.txt", + "chunk_index": i % 10, + "added_by": "threshold_bench", + "filed_at": datetime.now().isoformat(), + } + ) if len(docs) >= batch_size: col.add(documents=docs, ids=ids, metadatas=metas) diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py index 5c2559e..3cb7785 100644 --- a/tests/benchmarks/test_search_bench.py +++ b/tests/benchmarks/test_search_bench.py @@ -77,9 +77,7 @@ class TestSearchRecallAtScale: total_needle_queries = min(10, len(needle_info)) for needle in needle_info[:total_needle_queries]: - result = search_memories( - needle["query"], palace_path=palace_path, n_results=10 - ) + result = search_memories(needle["query"], palace_path=palace_path, n_results=10) if "error" in result: continue @@ -150,8 +148,12 @@ class TestSearchFilteredVsUnfiltered: record_metric("search_filter", "avg_unfiltered_ms", round(avg_unfiltered, 1)) record_metric("search_filter", "avg_filtered_ms", round(avg_filtered, 1)) record_metric("search_filter", "latency_improvement_pct", round(latency_improvement, 1)) - record_metric("search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3)) - record_metric("search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3)) + record_metric( + "search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3) + ) + record_metric( + "search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3) + ) @pytest.mark.benchmark @@ -167,9 +169,16 @@ class TestConcurrentSearch: from mempalace.searcher import search_memories queries = [ - "authentication", "database", "deployment", "error handling", - "testing", "monitoring", "caching", "middleware", - "serialization", "validation", + "authentication", + "database", + "deployment", + "error handling", + "testing", + "monitoring", + "caching", + "middleware", + "serialization", + "validation", ] * 3 # 30 total queries def run_search(query):