fix: resolve formatting, regression logic, and pytest defaults
- Run ruff format on all benchmark files (fixes the CI lint job)
- Fix check_regression() substring ambiguity: keywords are now matched in order, so "latency_improvement_pct" is correctly classified as higher-is-better
- Update stale comments in conftest.py that referenced the wrong fixture
- Add pytest addopts to skip tests marked benchmark/slow/stress by default
This commit is contained in:
@@ -65,6 +65,7 @@ quote-style = "double"
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
pythonpath = ["."]
|
||||
addopts = "-m 'not benchmark and not slow and not stress'"
|
||||
markers = [
|
||||
"benchmark: scale/performance benchmark tests",
|
||||
"slow: tests that take more than 30 seconds",
|
||||
|
||||
@@ -96,8 +96,7 @@ def pytest_terminal_summary(terminalreporter, config):
|
||||
if not report_path:
|
||||
return
|
||||
|
||||
# Collect results from the session fixture if available
|
||||
# The results are written by individual tests via bench_results fixture
|
||||
# Collect results written by individual tests via record_metric()
|
||||
import platform
|
||||
import subprocess
|
||||
|
||||
@@ -129,7 +128,7 @@ def pytest_terminal_summary(terminalreporter, config):
|
||||
"results": {},
|
||||
}
|
||||
|
||||
# Read results from a temp file written by the bench_results fixture
|
||||
# Read results from the temp file written by record_metric()
|
||||
results_file = os.path.join(tempfile.gettempdir(), "mempalace_bench_results.json")
|
||||
if os.path.exists(results_file):
|
||||
try:
|
||||
|
||||
@@ -20,42 +20,150 @@ import yaml
|
||||
# ── Scale configurations ─────────────────────────────────────────────────
|
||||
|
||||
SCALE_CONFIGS = {
|
||||
"small": {"drawers": 1_000, "wings": 3, "rooms_per_wing": 5, "kg_entities": 50, "kg_triples": 200, "needles": 20, "search_queries": 20},
|
||||
"medium": {"drawers": 10_000, "wings": 8, "rooms_per_wing": 12, "kg_entities": 200, "kg_triples": 2_000, "needles": 50, "search_queries": 50},
|
||||
"large": {"drawers": 50_000, "wings": 15, "rooms_per_wing": 20, "kg_entities": 500, "kg_triples": 10_000, "needles": 100, "search_queries": 100},
|
||||
"stress": {"drawers": 100_000, "wings": 25, "rooms_per_wing": 30, "kg_entities": 1_000, "kg_triples": 50_000, "needles": 200, "search_queries": 200},
|
||||
"small": {
|
||||
"drawers": 1_000,
|
||||
"wings": 3,
|
||||
"rooms_per_wing": 5,
|
||||
"kg_entities": 50,
|
||||
"kg_triples": 200,
|
||||
"needles": 20,
|
||||
"search_queries": 20,
|
||||
},
|
||||
"medium": {
|
||||
"drawers": 10_000,
|
||||
"wings": 8,
|
||||
"rooms_per_wing": 12,
|
||||
"kg_entities": 200,
|
||||
"kg_triples": 2_000,
|
||||
"needles": 50,
|
||||
"search_queries": 50,
|
||||
},
|
||||
"large": {
|
||||
"drawers": 50_000,
|
||||
"wings": 15,
|
||||
"rooms_per_wing": 20,
|
||||
"kg_entities": 500,
|
||||
"kg_triples": 10_000,
|
||||
"needles": 100,
|
||||
"search_queries": 100,
|
||||
},
|
||||
"stress": {
|
||||
"drawers": 100_000,
|
||||
"wings": 25,
|
||||
"rooms_per_wing": 30,
|
||||
"kg_entities": 1_000,
|
||||
"kg_triples": 50_000,
|
||||
"needles": 200,
|
||||
"search_queries": 200,
|
||||
},
|
||||
}
|
||||
|
||||
# ── Vocabulary banks for realistic content ───────────────────────────────
|
||||
|
||||
WING_NAMES = [
|
||||
"webapp", "backend_api", "mobile_app", "data_pipeline", "ml_platform",
|
||||
"devops", "auth_service", "payments", "analytics", "docs_site",
|
||||
"cli_tool", "dashboard", "notification_service", "search_engine",
|
||||
"user_mgmt", "inventory", "reporting", "testing_infra", "monitoring",
|
||||
"email_service", "chat_bot", "file_storage", "scheduler", "gateway",
|
||||
"webapp",
|
||||
"backend_api",
|
||||
"mobile_app",
|
||||
"data_pipeline",
|
||||
"ml_platform",
|
||||
"devops",
|
||||
"auth_service",
|
||||
"payments",
|
||||
"analytics",
|
||||
"docs_site",
|
||||
"cli_tool",
|
||||
"dashboard",
|
||||
"notification_service",
|
||||
"search_engine",
|
||||
"user_mgmt",
|
||||
"inventory",
|
||||
"reporting",
|
||||
"testing_infra",
|
||||
"monitoring",
|
||||
"email_service",
|
||||
"chat_bot",
|
||||
"file_storage",
|
||||
"scheduler",
|
||||
"gateway",
|
||||
"marketplace",
|
||||
]
|
||||
|
||||
ROOM_NAMES = [
|
||||
"backend", "frontend", "api", "database", "auth", "tests", "docs",
|
||||
"config", "deployment", "models", "views", "controllers", "middleware",
|
||||
"utils", "schemas", "migrations", "fixtures", "scripts", "styles",
|
||||
"components", "hooks", "services", "routes", "templates", "static",
|
||||
"media", "logging", "cache", "queue", "workers",
|
||||
"backend",
|
||||
"frontend",
|
||||
"api",
|
||||
"database",
|
||||
"auth",
|
||||
"tests",
|
||||
"docs",
|
||||
"config",
|
||||
"deployment",
|
||||
"models",
|
||||
"views",
|
||||
"controllers",
|
||||
"middleware",
|
||||
"utils",
|
||||
"schemas",
|
||||
"migrations",
|
||||
"fixtures",
|
||||
"scripts",
|
||||
"styles",
|
||||
"components",
|
||||
"hooks",
|
||||
"services",
|
||||
"routes",
|
||||
"templates",
|
||||
"static",
|
||||
"media",
|
||||
"logging",
|
||||
"cache",
|
||||
"queue",
|
||||
"workers",
|
||||
]
|
||||
|
||||
TECH_TERMS = [
|
||||
"authentication", "authorization", "middleware", "endpoint", "REST API",
|
||||
"GraphQL", "WebSocket", "database migration", "ORM", "query optimization",
|
||||
"caching strategy", "load balancer", "rate limiting", "pagination",
|
||||
"serialization", "validation", "error handling", "logging framework",
|
||||
"monitoring", "deployment pipeline", "CI/CD", "containerization",
|
||||
"microservice", "event sourcing", "message queue", "pub/sub",
|
||||
"connection pooling", "session management", "token refresh", "CORS",
|
||||
"SSL termination", "health check", "circuit breaker", "retry logic",
|
||||
"batch processing", "stream processing", "data pipeline", "ETL",
|
||||
"feature flag", "A/B testing", "blue-green deployment", "canary release",
|
||||
"authentication",
|
||||
"authorization",
|
||||
"middleware",
|
||||
"endpoint",
|
||||
"REST API",
|
||||
"GraphQL",
|
||||
"WebSocket",
|
||||
"database migration",
|
||||
"ORM",
|
||||
"query optimization",
|
||||
"caching strategy",
|
||||
"load balancer",
|
||||
"rate limiting",
|
||||
"pagination",
|
||||
"serialization",
|
||||
"validation",
|
||||
"error handling",
|
||||
"logging framework",
|
||||
"monitoring",
|
||||
"deployment pipeline",
|
||||
"CI/CD",
|
||||
"containerization",
|
||||
"microservice",
|
||||
"event sourcing",
|
||||
"message queue",
|
||||
"pub/sub",
|
||||
"connection pooling",
|
||||
"session management",
|
||||
"token refresh",
|
||||
"CORS",
|
||||
"SSL termination",
|
||||
"health check",
|
||||
"circuit breaker",
|
||||
"retry logic",
|
||||
"batch processing",
|
||||
"stream processing",
|
||||
"data pipeline",
|
||||
"ETL",
|
||||
"feature flag",
|
||||
"A/B testing",
|
||||
"blue-green deployment",
|
||||
"canary release",
|
||||
]
|
||||
|
||||
CODE_SNIPPETS = [
|
||||
@@ -75,17 +183,51 @@ PROSE_TEMPLATES = [
|
||||
]
|
||||
|
||||
ENTITY_NAMES = [
|
||||
"Alice", "Bob", "Carol", "Dave", "Eve", "Frank", "Grace", "Heidi",
|
||||
"Ivan", "Judy", "Karl", "Linda", "Mike", "Nina", "Oscar", "Pat",
|
||||
"Quinn", "Rita", "Steve", "Tina", "Ursula", "Victor", "Wendy", "Xander",
|
||||
"Alice",
|
||||
"Bob",
|
||||
"Carol",
|
||||
"Dave",
|
||||
"Eve",
|
||||
"Frank",
|
||||
"Grace",
|
||||
"Heidi",
|
||||
"Ivan",
|
||||
"Judy",
|
||||
"Karl",
|
||||
"Linda",
|
||||
"Mike",
|
||||
"Nina",
|
||||
"Oscar",
|
||||
"Pat",
|
||||
"Quinn",
|
||||
"Rita",
|
||||
"Steve",
|
||||
"Tina",
|
||||
"Ursula",
|
||||
"Victor",
|
||||
"Wendy",
|
||||
"Xander",
|
||||
]
|
||||
|
||||
ENTITY_TYPES = ["person", "project", "tool", "concept", "team", "service"]
|
||||
|
||||
PREDICATES = [
|
||||
"works_on", "manages", "reports_to", "collaborates_with", "created",
|
||||
"maintains", "uses", "depends_on", "replaced", "reviewed", "deployed",
|
||||
"tested", "documented", "mentors", "leads", "contributes_to",
|
||||
"works_on",
|
||||
"manages",
|
||||
"reports_to",
|
||||
"collaborates_with",
|
||||
"created",
|
||||
"maintains",
|
||||
"uses",
|
||||
"depends_on",
|
||||
"replaced",
|
||||
"reviewed",
|
||||
"deployed",
|
||||
"tested",
|
||||
"documented",
|
||||
"mentors",
|
||||
"leads",
|
||||
"contributes_to",
|
||||
]
|
||||
|
||||
|
||||
@@ -136,13 +278,19 @@ class PalaceDataGenerator:
|
||||
room = self.rng.choice(self.rooms_by_wing[wing])
|
||||
needle_id = f"NEEDLE_{i:04d}"
|
||||
content = f"{needle_id}: {topic}. This is a unique planted needle for recall benchmarking at scale."
|
||||
self.needles.append({
|
||||
self.needles.append(
|
||||
{
|
||||
"id": needle_id,
|
||||
"content": content,
|
||||
"wing": wing,
|
||||
"room": room,
|
||||
"query": topic.split(" uses ")[0] if " uses " in topic else topic.split(" set to ")[0] if " set to " in topic else topic[:60],
|
||||
})
|
||||
"query": topic.split(" uses ")[0]
|
||||
if " uses " in topic
|
||||
else topic.split(" set to ")[0]
|
||||
if " set to " in topic
|
||||
else topic[:60],
|
||||
}
|
||||
)
|
||||
|
||||
def _random_text(self, min_chars=600, max_chars=900):
|
||||
"""Generate a random text block of realistic content."""
|
||||
@@ -159,21 +307,25 @@ class PalaceDataGenerator:
|
||||
component=self.rng.choice(ROOM_NAMES),
|
||||
task=self.rng.choice(TECH_TERMS),
|
||||
month=self.rng.choice(["January", "February", "March", "April", "May"]),
|
||||
quality=self.rng.choice(["performance", "readability", "test coverage", "latency"]),
|
||||
quality=self.rng.choice(
|
||||
["performance", "readability", "test coverage", "latency"]
|
||||
),
|
||||
decision=self.rng.choice(TECH_TERMS),
|
||||
condition=self.rng.choice(TECH_TERMS) + " is null",
|
||||
cause=self.rng.choice(["race condition", "null pointer", "timeout", "OOM"]),
|
||||
fix="adding " + self.rng.choice(TECH_TERMS),
|
||||
test_file=f"test_{self.rng.choice(ROOM_NAMES)}.py",
|
||||
old_tech=self.rng.choice(["MySQL", "Flask", "REST", "Jenkins"]),
|
||||
new_tech=self.rng.choice(["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]),
|
||||
new_tech=self.rng.choice(
|
||||
["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]
|
||||
),
|
||||
reason=self.rng.choice(TECH_TERMS),
|
||||
date=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
|
||||
date=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
|
||||
percent=self.rng.randint(10, 80),
|
||||
topic=self.rng.choice(TECH_TERMS),
|
||||
person=self.rng.choice(ENTITY_NAMES),
|
||||
action=self.rng.choice(["refactor", "migrate", "optimize", "test"]),
|
||||
deadline=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
|
||||
deadline=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
|
||||
followup=self.rng.choice(TECH_TERMS),
|
||||
feature_name=self.rng.choice(TECH_TERMS),
|
||||
capability=self.rng.choice(TECH_TERMS),
|
||||
@@ -182,7 +334,12 @@ class PalaceDataGenerator:
|
||||
)
|
||||
else:
|
||||
words = self.rng.sample(TECH_TERMS, min(5, len(TECH_TERMS)))
|
||||
text = " ".join(words) + ". " + self.rng.choice(TECH_TERMS) + " implementation details follow.\n"
|
||||
text = (
|
||||
" ".join(words)
|
||||
+ ". "
|
||||
+ self.rng.choice(TECH_TERMS)
|
||||
+ " implementation details follow.\n"
|
||||
)
|
||||
parts.append(text)
|
||||
total += len(text)
|
||||
return "\n".join(parts)[:max_chars]
|
||||
@@ -270,15 +427,24 @@ class PalaceDataGenerator:
|
||||
needle_id = f"drawer_{needle['wing']}_{needle['room']}_{hashlib.md5(needle['id'].encode()).hexdigest()[:16]}"
|
||||
docs.append(needle["content"])
|
||||
ids.append(needle_id)
|
||||
metas.append({
|
||||
metas.append(
|
||||
{
|
||||
"wing": needle["wing"],
|
||||
"room": needle["room"],
|
||||
"source_file": f"needle_{needle['id']}.txt",
|
||||
"chunk_index": 0,
|
||||
"added_by": "benchmark",
|
||||
"filed_at": datetime.now().isoformat(),
|
||||
})
|
||||
needle_info.append({"id": needle_id, "query": needle["query"], "wing": needle["wing"], "room": needle["room"]})
|
||||
}
|
||||
)
|
||||
needle_info.append(
|
||||
{
|
||||
"id": needle_id,
|
||||
"query": needle["query"],
|
||||
"wing": needle["wing"],
|
||||
"room": needle["room"],
|
||||
}
|
||||
)
|
||||
|
||||
# Fill remaining drawers with realistic content
|
||||
remaining = n_drawers - len(docs)
|
||||
@@ -291,14 +457,16 @@ class PalaceDataGenerator:
|
||||
|
||||
docs.append(content)
|
||||
ids.append(drawer_id)
|
||||
metas.append({
|
||||
metas.append(
|
||||
{
|
||||
"wing": wing,
|
||||
"room": room,
|
||||
"source_file": f"generated_{i:06d}.txt",
|
||||
"chunk_index": i % 10,
|
||||
"added_by": "benchmark",
|
||||
"filed_at": datetime.now().isoformat(),
|
||||
})
|
||||
}
|
||||
)
|
||||
|
||||
# Flush in batches
|
||||
if len(docs) >= batch_size:
|
||||
@@ -351,7 +519,9 @@ class PalaceDataGenerator:
|
||||
valid_to = None
|
||||
if self.rng.random() < 0.3:
|
||||
end_offset = self.rng.randint(30, 365)
|
||||
valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime("%Y-%m-%d")
|
||||
valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime(
|
||||
"%Y-%m-%d"
|
||||
)
|
||||
triples.append((subject, predicate, obj, valid_from, valid_to))
|
||||
|
||||
return entities, triples
|
||||
@@ -371,24 +541,28 @@ class PalaceDataGenerator:
|
||||
# Half are needle queries (known-good answers)
|
||||
n_needle = min(n_queries // 2, len(self.needles))
|
||||
for needle in self.needles[:n_needle]:
|
||||
queries.append({
|
||||
queries.append(
|
||||
{
|
||||
"query": needle["query"],
|
||||
"expected_wing": needle["wing"],
|
||||
"expected_room": needle["room"],
|
||||
"needle_id": needle["id"],
|
||||
"is_needle": True,
|
||||
})
|
||||
}
|
||||
)
|
||||
|
||||
# Other half are generic queries (measure latency, not recall)
|
||||
n_generic = n_queries - n_needle
|
||||
for _ in range(n_generic):
|
||||
queries.append({
|
||||
queries.append(
|
||||
{
|
||||
"query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS),
|
||||
"expected_wing": None,
|
||||
"expected_room": None,
|
||||
"needle_id": None,
|
||||
"is_needle": False,
|
||||
})
|
||||
}
|
||||
)
|
||||
|
||||
self.rng.shuffle(queries)
|
||||
return queries
|
||||
|
||||
+44
-17
@@ -45,16 +45,45 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
|
||||
baseline = json.load(f)
|
||||
|
||||
regressions = []
|
||||
# Metrics where HIGHER is worse (latency, memory, etc.)
|
||||
higher_is_worse = {
|
||||
"latency", "rss", "memory", "oom", "lock_failures", "elapsed",
|
||||
"p50_ms", "p95_ms", "p99_ms", "rss_delta_mb", "peak_rss_mb",
|
||||
}
|
||||
# Metrics where LOWER is worse (throughput, recall, etc.)
|
||||
lower_is_worse = {
|
||||
"recall", "throughput", "per_sec", "files_per_sec", "drawers_per_sec",
|
||||
"triples_per_sec", "improvement",
|
||||
}
|
||||
# Keywords for metric direction — checked in order, first match wins.
# "improvement" is checked before "latency" so that composite names
# like "latency_improvement_pct" are classified correctly.
_higher_is_better_kw = [
    "improvement",
    "recall",
    "throughput",
    "per_sec",
    "files_per_sec",
    "drawers_per_sec",
    "triples_per_sec",
    "speedup",
]
_higher_is_worse_kw = [
    "latency",
    "rss",
    "memory",
    "oom",
    "lock_failures",
    "elapsed",
    "p50_ms",
    "p95_ms",
    "p99_ms",
    "rss_delta_mb",
    "peak_rss_mb",
    "errors",
    "failures",
]


def _metric_direction(name: str) -> str:
    """Classify a metric name by which direction of change is a regression.

    Keywords are matched case-insensitively against whole tokens of the
    name, where a token boundary is the start/end of the name or any
    non-alphanumeric character.  Multi-token keywords such as "per_sec"
    or "p95_ms" must appear as a contiguous run of tokens.

    Matching on token boundaries (rather than raw substrings) prevents
    short keywords from firing inside unrelated words: "oom" must not
    match the "room" in names like "room_count".  Note that a name which
    previously matched only through such an accidental substring is now
    reported as "unknown".

    Args:
        name: Metric name, e.g. "latency_improvement_pct".

    Returns:
        "higher_better" if a higher-is-better keyword matches (these are
        checked first so composite names like "latency_improvement_pct"
        resolve correctly), "higher_worse" if a higher-is-worse keyword
        matches, otherwise "unknown".
    """
    # Normalise every separator to "_" and pad both ends, so each keyword
    # can be tested uniformly as the delimited needle "_<kw>_".
    normalized = "".join(ch if ch.isalnum() else "_" for ch in name.lower())
    padded = f"_{normalized}_"

    def _matches_any(keywords) -> bool:
        return any(f"_{kw}_" in padded for kw in keywords)

    if _matches_any(_higher_is_better_kw):
        return "higher_better"
    if _matches_any(_higher_is_worse_kw):
        return "higher_worse"
    return "unknown"
|
||||
|
||||
for category in baseline.get("results", {}):
|
||||
if category not in current.get("results", {}):
|
||||
@@ -68,23 +97,21 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
|
||||
if base_val == 0:
|
||||
continue
|
||||
|
||||
# Determine direction
|
||||
is_latency_like = any(kw in metric.lower() for kw in higher_is_worse)
|
||||
is_throughput_like = any(kw in metric.lower() for kw in lower_is_worse)
|
||||
direction = _metric_direction(metric)
|
||||
|
||||
if is_latency_like:
|
||||
if direction == "higher_worse":
|
||||
# Higher is worse — check if current exceeds baseline by threshold
|
||||
if curr_val > base_val * (1 + threshold):
|
||||
pct = ((curr_val - base_val) / base_val) * 100
|
||||
regressions.append(
|
||||
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)"
|
||||
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
|
||||
)
|
||||
elif is_throughput_like:
|
||||
elif direction == "higher_better":
|
||||
# Lower is worse — check if current is below baseline by threshold
|
||||
if curr_val < base_val * (1 - threshold):
|
||||
pct = ((curr_val - base_val) / base_val) * 100
|
||||
regressions.append(
|
||||
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)"
|
||||
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
|
||||
)
|
||||
|
||||
return regressions
|
||||
|
||||
@@ -143,7 +143,10 @@ class TestBulkInsertPerformance:
|
||||
batch_end = min(batch_start + batch_size, n_docs)
|
||||
batch_docs = contents[batch_start:batch_end]
|
||||
batch_ids = [f"batch_{i}" for i in range(batch_start, batch_end)]
|
||||
batch_metas = [{"wing": "test", "room": "bench", "chunk_index": i} for i in range(batch_start, batch_end)]
|
||||
batch_metas = [
|
||||
{"wing": "test", "room": "bench", "chunk_index": i}
|
||||
for i in range(batch_start, batch_end)
|
||||
]
|
||||
col_batch.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
|
||||
batched_ms = (time.perf_counter() - start) * 1000
|
||||
|
||||
|
||||
@@ -125,7 +125,9 @@ class TestChunkThroughput:
|
||||
chunks_per_sec = total_chunks / max(elapsed, 0.001)
|
||||
kb_per_sec = (len(content) * n_iterations / 1024) / max(elapsed, 0.001)
|
||||
|
||||
record_metric("chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1))
|
||||
record_metric(
|
||||
"chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1)
|
||||
)
|
||||
record_metric("chunking", f"kb_per_sec_at_{content_size_kb}kb", round(kb_per_sec, 1))
|
||||
|
||||
|
||||
@@ -160,4 +162,8 @@ class TestReingestSkipOverhead:
|
||||
|
||||
record_metric("reingest", "skip_check_elapsed_sec", round(skip_elapsed, 2))
|
||||
record_metric("reingest", "files_checked", files_written)
|
||||
record_metric("reingest", "skip_check_per_file_ms", round(skip_elapsed * 1000 / max(files_written, 1), 1))
|
||||
record_metric(
|
||||
"reingest",
|
||||
"skip_check_per_file_ms",
|
||||
round(skip_elapsed * 1000 / max(files_written, 1), 1),
|
||||
)
|
||||
|
||||
@@ -36,9 +36,7 @@ class TestTripleInsertionRate:
|
||||
# Measure triple insertion
|
||||
start = time.perf_counter()
|
||||
for subject, predicate, obj, valid_from, valid_to in triples:
|
||||
kg.add_triple(
|
||||
subject, predicate, obj, valid_from=valid_from, valid_to=valid_to
|
||||
)
|
||||
kg.add_triple(subject, predicate, obj, valid_from=valid_from, valid_to=valid_to)
|
||||
elapsed = time.perf_counter() - start
|
||||
|
||||
triples_per_sec = n_triples / max(elapsed, 0.001)
|
||||
@@ -128,7 +126,9 @@ class TestTemporalQueryAccuracy:
|
||||
kg.add_entity("ProjectB", "project")
|
||||
|
||||
# Alice worked on ProjectA from 2024-01 to 2024-06
|
||||
kg.add_triple("Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30")
|
||||
kg.add_triple(
|
||||
"Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30"
|
||||
)
|
||||
# Alice worked on ProjectB from 2024-07 onwards
|
||||
kg.add_triple("Alice", "works_on", "ProjectB", valid_from="2024-07-01")
|
||||
|
||||
@@ -145,8 +145,16 @@ class TestTemporalQueryAccuracy:
|
||||
# Query Alice as of September 2024 — should find ProjectB
|
||||
result_sept = kg.query_entity("Alice", as_of="2024-09-15")
|
||||
|
||||
record_metric("kg_temporal", "march_query_results", len(result_march) if isinstance(result_march, list) else 0)
|
||||
record_metric("kg_temporal", "sept_query_results", len(result_sept) if isinstance(result_sept, list) else 0)
|
||||
record_metric(
|
||||
"kg_temporal",
|
||||
"march_query_results",
|
||||
len(result_march) if isinstance(result_march, list) else 0,
|
||||
)
|
||||
record_metric(
|
||||
"kg_temporal",
|
||||
"sept_query_results",
|
||||
len(result_sept) if isinstance(result_sept, list) else 0,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
@@ -230,7 +238,9 @@ class TestSQLiteConcurrentAccess:
|
||||
fails = 0
|
||||
for i in range(50):
|
||||
try:
|
||||
kg.add_triple(f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01")
|
||||
kg.add_triple(
|
||||
f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01"
|
||||
)
|
||||
except Exception:
|
||||
fails += 1
|
||||
write_errors.append(fails)
|
||||
|
||||
@@ -116,7 +116,9 @@ class TestLayer1UnboundedFetch:
|
||||
record_metric("layer1_filter", "unfiltered_ms", round(unfiltered_ms, 1))
|
||||
record_metric("layer1_filter", "filtered_ms", round(filtered_ms, 1))
|
||||
if unfiltered_ms > 0:
|
||||
record_metric("layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1))
|
||||
record_metric(
|
||||
"layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
@@ -146,7 +148,9 @@ class TestWakeUpTokenBudget:
|
||||
record_metric("wakeup_budget", f"tokens_at_{n_drawers}", token_estimate)
|
||||
record_metric("wakeup_budget", f"chars_at_{n_drawers}", len(text))
|
||||
|
||||
assert token_estimate < 1200, f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers"
|
||||
assert token_estimate < 1200, (
|
||||
f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
|
||||
@@ -63,7 +63,9 @@ class TestSearchMemoryProfile:
|
||||
record_metric("memory_search", "rss_end_mb", round(end_rss, 2))
|
||||
record_metric("memory_search", "rss_growth_mb", round(growth, 2))
|
||||
record_metric("memory_search", "n_calls", n_calls)
|
||||
record_metric("memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2))
|
||||
record_metric(
|
||||
"memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
@@ -166,11 +168,13 @@ class TestHeapSnapshot:
|
||||
stats = snap_after.compare_to(snap_before, "lineno")
|
||||
top_allocators = []
|
||||
for stat in stats[:10]:
|
||||
top_allocators.append({
|
||||
top_allocators.append(
|
||||
{
|
||||
"file": str(stat.traceback),
|
||||
"size_kb": round(stat.size / 1024, 1),
|
||||
"count": stat.count,
|
||||
})
|
||||
}
|
||||
)
|
||||
|
||||
total_growth_kb = sum(s["size_kb"] for s in top_allocators)
|
||||
record_metric("heap_search", "top_10_growth_kb", round(total_growth_kb, 1))
|
||||
|
||||
@@ -123,8 +123,12 @@ class TestFilterLatencyBenefit:
|
||||
record_metric("filter_latency", "avg_wing_filtered_ms", round(avg_wing, 1))
|
||||
record_metric("filter_latency", "avg_room_filtered_ms", round(avg_room, 1))
|
||||
if avg_none > 0:
|
||||
record_metric("filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1))
|
||||
record_metric("filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1))
|
||||
record_metric(
|
||||
"filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1)
|
||||
)
|
||||
record_metric(
|
||||
"filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
|
||||
@@ -61,14 +61,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10):
|
||||
drawer_id = f"drawer_single_room_{hashlib.md5(needle_id.encode()).hexdigest()[:16]}"
|
||||
docs.append(content)
|
||||
ids.append(drawer_id)
|
||||
metas.append({
|
||||
metas.append(
|
||||
{
|
||||
"wing": "concentrated",
|
||||
"room": "single_room",
|
||||
"source_file": f"needle_{i}.txt",
|
||||
"chunk_index": 0,
|
||||
"added_by": "threshold_bench",
|
||||
"filed_at": datetime.now().isoformat(),
|
||||
})
|
||||
}
|
||||
)
|
||||
|
||||
# Fill with noise — all in the SAME room
|
||||
remaining = n_drawers - len(docs)
|
||||
@@ -77,14 +79,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10):
|
||||
drawer_id = f"drawer_single_room_{hashlib.md5(f'noise_{i}'.encode()).hexdigest()[:16]}"
|
||||
docs.append(content)
|
||||
ids.append(drawer_id)
|
||||
metas.append({
|
||||
metas.append(
|
||||
{
|
||||
"wing": "concentrated",
|
||||
"room": "single_room",
|
||||
"source_file": f"noise_{i:06d}.txt",
|
||||
"chunk_index": i % 10,
|
||||
"added_by": "threshold_bench",
|
||||
"filed_at": datetime.now().isoformat(),
|
||||
})
|
||||
}
|
||||
)
|
||||
|
||||
if len(docs) >= batch_size:
|
||||
col.add(documents=docs, ids=ids, metadatas=metas)
|
||||
|
||||
@@ -77,9 +77,7 @@ class TestSearchRecallAtScale:
|
||||
total_needle_queries = min(10, len(needle_info))
|
||||
|
||||
for needle in needle_info[:total_needle_queries]:
|
||||
result = search_memories(
|
||||
needle["query"], palace_path=palace_path, n_results=10
|
||||
)
|
||||
result = search_memories(needle["query"], palace_path=palace_path, n_results=10)
|
||||
if "error" in result:
|
||||
continue
|
||||
|
||||
@@ -150,8 +148,12 @@ class TestSearchFilteredVsUnfiltered:
|
||||
record_metric("search_filter", "avg_unfiltered_ms", round(avg_unfiltered, 1))
|
||||
record_metric("search_filter", "avg_filtered_ms", round(avg_filtered, 1))
|
||||
record_metric("search_filter", "latency_improvement_pct", round(latency_improvement, 1))
|
||||
record_metric("search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3))
|
||||
record_metric("search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3))
|
||||
record_metric(
|
||||
"search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3)
|
||||
)
|
||||
record_metric(
|
||||
"search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
@@ -167,9 +169,16 @@ class TestConcurrentSearch:
|
||||
from mempalace.searcher import search_memories
|
||||
|
||||
queries = [
|
||||
"authentication", "database", "deployment", "error handling",
|
||||
"testing", "monitoring", "caching", "middleware",
|
||||
"serialization", "validation",
|
||||
"authentication",
|
||||
"database",
|
||||
"deployment",
|
||||
"error handling",
|
||||
"testing",
|
||||
"monitoring",
|
||||
"caching",
|
||||
"middleware",
|
||||
"serialization",
|
||||
"validation",
|
||||
] * 3 # 30 total queries
|
||||
|
||||
def run_search(query):
|
||||
|
||||
Reference in New Issue
Block a user