fix: resolve formatting, regression logic, and pytest defaults
- Run ruff format on all benchmark files (fixes the CI lint job).
- Fix check_regression() substring ambiguity: direction keywords are now matched in order, so "latency_improvement_pct" is correctly classified as higher-is-better.
- Update stale comments in conftest.py that referenced the wrong fixture.
- Add pytest addopts so tests marked benchmark, slow, or stress are deselected by default.
This commit is contained in:
@@ -65,6 +65,7 @@ quote-style = "double"
|
|||||||
[tool.pytest.ini_options]
|
[tool.pytest.ini_options]
|
||||||
testpaths = ["tests"]
|
testpaths = ["tests"]
|
||||||
pythonpath = ["."]
|
pythonpath = ["."]
|
||||||
|
addopts = "-m 'not benchmark and not slow and not stress'"
|
||||||
markers = [
|
markers = [
|
||||||
"benchmark: scale/performance benchmark tests",
|
"benchmark: scale/performance benchmark tests",
|
||||||
"slow: tests that take more than 30 seconds",
|
"slow: tests that take more than 30 seconds",
|
||||||
|
|||||||
@@ -96,8 +96,7 @@ def pytest_terminal_summary(terminalreporter, config):
|
|||||||
if not report_path:
|
if not report_path:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Collect results from the session fixture if available
|
# Collect results written by individual tests via record_metric()
|
||||||
# The results are written by individual tests via bench_results fixture
|
|
||||||
import platform
|
import platform
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
@@ -129,7 +128,7 @@ def pytest_terminal_summary(terminalreporter, config):
|
|||||||
"results": {},
|
"results": {},
|
||||||
}
|
}
|
||||||
|
|
||||||
# Read results from a temp file written by the bench_results fixture
|
# Read results from the temp file written by record_metric()
|
||||||
results_file = os.path.join(tempfile.gettempdir(), "mempalace_bench_results.json")
|
results_file = os.path.join(tempfile.gettempdir(), "mempalace_bench_results.json")
|
||||||
if os.path.exists(results_file):
|
if os.path.exists(results_file):
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -20,42 +20,150 @@ import yaml
|
|||||||
# ── Scale configurations ─────────────────────────────────────────────────
|
# ── Scale configurations ─────────────────────────────────────────────────
|
||||||
|
|
||||||
SCALE_CONFIGS = {
|
SCALE_CONFIGS = {
|
||||||
"small": {"drawers": 1_000, "wings": 3, "rooms_per_wing": 5, "kg_entities": 50, "kg_triples": 200, "needles": 20, "search_queries": 20},
|
"small": {
|
||||||
"medium": {"drawers": 10_000, "wings": 8, "rooms_per_wing": 12, "kg_entities": 200, "kg_triples": 2_000, "needles": 50, "search_queries": 50},
|
"drawers": 1_000,
|
||||||
"large": {"drawers": 50_000, "wings": 15, "rooms_per_wing": 20, "kg_entities": 500, "kg_triples": 10_000, "needles": 100, "search_queries": 100},
|
"wings": 3,
|
||||||
"stress": {"drawers": 100_000, "wings": 25, "rooms_per_wing": 30, "kg_entities": 1_000, "kg_triples": 50_000, "needles": 200, "search_queries": 200},
|
"rooms_per_wing": 5,
|
||||||
|
"kg_entities": 50,
|
||||||
|
"kg_triples": 200,
|
||||||
|
"needles": 20,
|
||||||
|
"search_queries": 20,
|
||||||
|
},
|
||||||
|
"medium": {
|
||||||
|
"drawers": 10_000,
|
||||||
|
"wings": 8,
|
||||||
|
"rooms_per_wing": 12,
|
||||||
|
"kg_entities": 200,
|
||||||
|
"kg_triples": 2_000,
|
||||||
|
"needles": 50,
|
||||||
|
"search_queries": 50,
|
||||||
|
},
|
||||||
|
"large": {
|
||||||
|
"drawers": 50_000,
|
||||||
|
"wings": 15,
|
||||||
|
"rooms_per_wing": 20,
|
||||||
|
"kg_entities": 500,
|
||||||
|
"kg_triples": 10_000,
|
||||||
|
"needles": 100,
|
||||||
|
"search_queries": 100,
|
||||||
|
},
|
||||||
|
"stress": {
|
||||||
|
"drawers": 100_000,
|
||||||
|
"wings": 25,
|
||||||
|
"rooms_per_wing": 30,
|
||||||
|
"kg_entities": 1_000,
|
||||||
|
"kg_triples": 50_000,
|
||||||
|
"needles": 200,
|
||||||
|
"search_queries": 200,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
# ── Vocabulary banks for realistic content ───────────────────────────────
|
# ── Vocabulary banks for realistic content ───────────────────────────────
|
||||||
|
|
||||||
WING_NAMES = [
|
WING_NAMES = [
|
||||||
"webapp", "backend_api", "mobile_app", "data_pipeline", "ml_platform",
|
"webapp",
|
||||||
"devops", "auth_service", "payments", "analytics", "docs_site",
|
"backend_api",
|
||||||
"cli_tool", "dashboard", "notification_service", "search_engine",
|
"mobile_app",
|
||||||
"user_mgmt", "inventory", "reporting", "testing_infra", "monitoring",
|
"data_pipeline",
|
||||||
"email_service", "chat_bot", "file_storage", "scheduler", "gateway",
|
"ml_platform",
|
||||||
|
"devops",
|
||||||
|
"auth_service",
|
||||||
|
"payments",
|
||||||
|
"analytics",
|
||||||
|
"docs_site",
|
||||||
|
"cli_tool",
|
||||||
|
"dashboard",
|
||||||
|
"notification_service",
|
||||||
|
"search_engine",
|
||||||
|
"user_mgmt",
|
||||||
|
"inventory",
|
||||||
|
"reporting",
|
||||||
|
"testing_infra",
|
||||||
|
"monitoring",
|
||||||
|
"email_service",
|
||||||
|
"chat_bot",
|
||||||
|
"file_storage",
|
||||||
|
"scheduler",
|
||||||
|
"gateway",
|
||||||
"marketplace",
|
"marketplace",
|
||||||
]
|
]
|
||||||
|
|
||||||
ROOM_NAMES = [
|
ROOM_NAMES = [
|
||||||
"backend", "frontend", "api", "database", "auth", "tests", "docs",
|
"backend",
|
||||||
"config", "deployment", "models", "views", "controllers", "middleware",
|
"frontend",
|
||||||
"utils", "schemas", "migrations", "fixtures", "scripts", "styles",
|
"api",
|
||||||
"components", "hooks", "services", "routes", "templates", "static",
|
"database",
|
||||||
"media", "logging", "cache", "queue", "workers",
|
"auth",
|
||||||
|
"tests",
|
||||||
|
"docs",
|
||||||
|
"config",
|
||||||
|
"deployment",
|
||||||
|
"models",
|
||||||
|
"views",
|
||||||
|
"controllers",
|
||||||
|
"middleware",
|
||||||
|
"utils",
|
||||||
|
"schemas",
|
||||||
|
"migrations",
|
||||||
|
"fixtures",
|
||||||
|
"scripts",
|
||||||
|
"styles",
|
||||||
|
"components",
|
||||||
|
"hooks",
|
||||||
|
"services",
|
||||||
|
"routes",
|
||||||
|
"templates",
|
||||||
|
"static",
|
||||||
|
"media",
|
||||||
|
"logging",
|
||||||
|
"cache",
|
||||||
|
"queue",
|
||||||
|
"workers",
|
||||||
]
|
]
|
||||||
|
|
||||||
TECH_TERMS = [
|
TECH_TERMS = [
|
||||||
"authentication", "authorization", "middleware", "endpoint", "REST API",
|
"authentication",
|
||||||
"GraphQL", "WebSocket", "database migration", "ORM", "query optimization",
|
"authorization",
|
||||||
"caching strategy", "load balancer", "rate limiting", "pagination",
|
"middleware",
|
||||||
"serialization", "validation", "error handling", "logging framework",
|
"endpoint",
|
||||||
"monitoring", "deployment pipeline", "CI/CD", "containerization",
|
"REST API",
|
||||||
"microservice", "event sourcing", "message queue", "pub/sub",
|
"GraphQL",
|
||||||
"connection pooling", "session management", "token refresh", "CORS",
|
"WebSocket",
|
||||||
"SSL termination", "health check", "circuit breaker", "retry logic",
|
"database migration",
|
||||||
"batch processing", "stream processing", "data pipeline", "ETL",
|
"ORM",
|
||||||
"feature flag", "A/B testing", "blue-green deployment", "canary release",
|
"query optimization",
|
||||||
|
"caching strategy",
|
||||||
|
"load balancer",
|
||||||
|
"rate limiting",
|
||||||
|
"pagination",
|
||||||
|
"serialization",
|
||||||
|
"validation",
|
||||||
|
"error handling",
|
||||||
|
"logging framework",
|
||||||
|
"monitoring",
|
||||||
|
"deployment pipeline",
|
||||||
|
"CI/CD",
|
||||||
|
"containerization",
|
||||||
|
"microservice",
|
||||||
|
"event sourcing",
|
||||||
|
"message queue",
|
||||||
|
"pub/sub",
|
||||||
|
"connection pooling",
|
||||||
|
"session management",
|
||||||
|
"token refresh",
|
||||||
|
"CORS",
|
||||||
|
"SSL termination",
|
||||||
|
"health check",
|
||||||
|
"circuit breaker",
|
||||||
|
"retry logic",
|
||||||
|
"batch processing",
|
||||||
|
"stream processing",
|
||||||
|
"data pipeline",
|
||||||
|
"ETL",
|
||||||
|
"feature flag",
|
||||||
|
"A/B testing",
|
||||||
|
"blue-green deployment",
|
||||||
|
"canary release",
|
||||||
]
|
]
|
||||||
|
|
||||||
CODE_SNIPPETS = [
|
CODE_SNIPPETS = [
|
||||||
@@ -75,17 +183,51 @@ PROSE_TEMPLATES = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
ENTITY_NAMES = [
|
ENTITY_NAMES = [
|
||||||
"Alice", "Bob", "Carol", "Dave", "Eve", "Frank", "Grace", "Heidi",
|
"Alice",
|
||||||
"Ivan", "Judy", "Karl", "Linda", "Mike", "Nina", "Oscar", "Pat",
|
"Bob",
|
||||||
"Quinn", "Rita", "Steve", "Tina", "Ursula", "Victor", "Wendy", "Xander",
|
"Carol",
|
||||||
|
"Dave",
|
||||||
|
"Eve",
|
||||||
|
"Frank",
|
||||||
|
"Grace",
|
||||||
|
"Heidi",
|
||||||
|
"Ivan",
|
||||||
|
"Judy",
|
||||||
|
"Karl",
|
||||||
|
"Linda",
|
||||||
|
"Mike",
|
||||||
|
"Nina",
|
||||||
|
"Oscar",
|
||||||
|
"Pat",
|
||||||
|
"Quinn",
|
||||||
|
"Rita",
|
||||||
|
"Steve",
|
||||||
|
"Tina",
|
||||||
|
"Ursula",
|
||||||
|
"Victor",
|
||||||
|
"Wendy",
|
||||||
|
"Xander",
|
||||||
]
|
]
|
||||||
|
|
||||||
ENTITY_TYPES = ["person", "project", "tool", "concept", "team", "service"]
|
ENTITY_TYPES = ["person", "project", "tool", "concept", "team", "service"]
|
||||||
|
|
||||||
PREDICATES = [
|
PREDICATES = [
|
||||||
"works_on", "manages", "reports_to", "collaborates_with", "created",
|
"works_on",
|
||||||
"maintains", "uses", "depends_on", "replaced", "reviewed", "deployed",
|
"manages",
|
||||||
"tested", "documented", "mentors", "leads", "contributes_to",
|
"reports_to",
|
||||||
|
"collaborates_with",
|
||||||
|
"created",
|
||||||
|
"maintains",
|
||||||
|
"uses",
|
||||||
|
"depends_on",
|
||||||
|
"replaced",
|
||||||
|
"reviewed",
|
||||||
|
"deployed",
|
||||||
|
"tested",
|
||||||
|
"documented",
|
||||||
|
"mentors",
|
||||||
|
"leads",
|
||||||
|
"contributes_to",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -136,13 +278,19 @@ class PalaceDataGenerator:
|
|||||||
room = self.rng.choice(self.rooms_by_wing[wing])
|
room = self.rng.choice(self.rooms_by_wing[wing])
|
||||||
needle_id = f"NEEDLE_{i:04d}"
|
needle_id = f"NEEDLE_{i:04d}"
|
||||||
content = f"{needle_id}: {topic}. This is a unique planted needle for recall benchmarking at scale."
|
content = f"{needle_id}: {topic}. This is a unique planted needle for recall benchmarking at scale."
|
||||||
self.needles.append({
|
self.needles.append(
|
||||||
"id": needle_id,
|
{
|
||||||
"content": content,
|
"id": needle_id,
|
||||||
"wing": wing,
|
"content": content,
|
||||||
"room": room,
|
"wing": wing,
|
||||||
"query": topic.split(" uses ")[0] if " uses " in topic else topic.split(" set to ")[0] if " set to " in topic else topic[:60],
|
"room": room,
|
||||||
})
|
"query": topic.split(" uses ")[0]
|
||||||
|
if " uses " in topic
|
||||||
|
else topic.split(" set to ")[0]
|
||||||
|
if " set to " in topic
|
||||||
|
else topic[:60],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
def _random_text(self, min_chars=600, max_chars=900):
|
def _random_text(self, min_chars=600, max_chars=900):
|
||||||
"""Generate a random text block of realistic content."""
|
"""Generate a random text block of realistic content."""
|
||||||
@@ -159,21 +307,25 @@ class PalaceDataGenerator:
|
|||||||
component=self.rng.choice(ROOM_NAMES),
|
component=self.rng.choice(ROOM_NAMES),
|
||||||
task=self.rng.choice(TECH_TERMS),
|
task=self.rng.choice(TECH_TERMS),
|
||||||
month=self.rng.choice(["January", "February", "March", "April", "May"]),
|
month=self.rng.choice(["January", "February", "March", "April", "May"]),
|
||||||
quality=self.rng.choice(["performance", "readability", "test coverage", "latency"]),
|
quality=self.rng.choice(
|
||||||
|
["performance", "readability", "test coverage", "latency"]
|
||||||
|
),
|
||||||
decision=self.rng.choice(TECH_TERMS),
|
decision=self.rng.choice(TECH_TERMS),
|
||||||
condition=self.rng.choice(TECH_TERMS) + " is null",
|
condition=self.rng.choice(TECH_TERMS) + " is null",
|
||||||
cause=self.rng.choice(["race condition", "null pointer", "timeout", "OOM"]),
|
cause=self.rng.choice(["race condition", "null pointer", "timeout", "OOM"]),
|
||||||
fix="adding " + self.rng.choice(TECH_TERMS),
|
fix="adding " + self.rng.choice(TECH_TERMS),
|
||||||
test_file=f"test_{self.rng.choice(ROOM_NAMES)}.py",
|
test_file=f"test_{self.rng.choice(ROOM_NAMES)}.py",
|
||||||
old_tech=self.rng.choice(["MySQL", "Flask", "REST", "Jenkins"]),
|
old_tech=self.rng.choice(["MySQL", "Flask", "REST", "Jenkins"]),
|
||||||
new_tech=self.rng.choice(["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]),
|
new_tech=self.rng.choice(
|
||||||
|
["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]
|
||||||
|
),
|
||||||
reason=self.rng.choice(TECH_TERMS),
|
reason=self.rng.choice(TECH_TERMS),
|
||||||
date=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
|
date=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
|
||||||
percent=self.rng.randint(10, 80),
|
percent=self.rng.randint(10, 80),
|
||||||
topic=self.rng.choice(TECH_TERMS),
|
topic=self.rng.choice(TECH_TERMS),
|
||||||
person=self.rng.choice(ENTITY_NAMES),
|
person=self.rng.choice(ENTITY_NAMES),
|
||||||
action=self.rng.choice(["refactor", "migrate", "optimize", "test"]),
|
action=self.rng.choice(["refactor", "migrate", "optimize", "test"]),
|
||||||
deadline=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
|
deadline=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
|
||||||
followup=self.rng.choice(TECH_TERMS),
|
followup=self.rng.choice(TECH_TERMS),
|
||||||
feature_name=self.rng.choice(TECH_TERMS),
|
feature_name=self.rng.choice(TECH_TERMS),
|
||||||
capability=self.rng.choice(TECH_TERMS),
|
capability=self.rng.choice(TECH_TERMS),
|
||||||
@@ -182,7 +334,12 @@ class PalaceDataGenerator:
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
words = self.rng.sample(TECH_TERMS, min(5, len(TECH_TERMS)))
|
words = self.rng.sample(TECH_TERMS, min(5, len(TECH_TERMS)))
|
||||||
text = " ".join(words) + ". " + self.rng.choice(TECH_TERMS) + " implementation details follow.\n"
|
text = (
|
||||||
|
" ".join(words)
|
||||||
|
+ ". "
|
||||||
|
+ self.rng.choice(TECH_TERMS)
|
||||||
|
+ " implementation details follow.\n"
|
||||||
|
)
|
||||||
parts.append(text)
|
parts.append(text)
|
||||||
total += len(text)
|
total += len(text)
|
||||||
return "\n".join(parts)[:max_chars]
|
return "\n".join(parts)[:max_chars]
|
||||||
@@ -270,15 +427,24 @@ class PalaceDataGenerator:
|
|||||||
needle_id = f"drawer_{needle['wing']}_{needle['room']}_{hashlib.md5(needle['id'].encode()).hexdigest()[:16]}"
|
needle_id = f"drawer_{needle['wing']}_{needle['room']}_{hashlib.md5(needle['id'].encode()).hexdigest()[:16]}"
|
||||||
docs.append(needle["content"])
|
docs.append(needle["content"])
|
||||||
ids.append(needle_id)
|
ids.append(needle_id)
|
||||||
metas.append({
|
metas.append(
|
||||||
"wing": needle["wing"],
|
{
|
||||||
"room": needle["room"],
|
"wing": needle["wing"],
|
||||||
"source_file": f"needle_{needle['id']}.txt",
|
"room": needle["room"],
|
||||||
"chunk_index": 0,
|
"source_file": f"needle_{needle['id']}.txt",
|
||||||
"added_by": "benchmark",
|
"chunk_index": 0,
|
||||||
"filed_at": datetime.now().isoformat(),
|
"added_by": "benchmark",
|
||||||
})
|
"filed_at": datetime.now().isoformat(),
|
||||||
needle_info.append({"id": needle_id, "query": needle["query"], "wing": needle["wing"], "room": needle["room"]})
|
}
|
||||||
|
)
|
||||||
|
needle_info.append(
|
||||||
|
{
|
||||||
|
"id": needle_id,
|
||||||
|
"query": needle["query"],
|
||||||
|
"wing": needle["wing"],
|
||||||
|
"room": needle["room"],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Fill remaining drawers with realistic content
|
# Fill remaining drawers with realistic content
|
||||||
remaining = n_drawers - len(docs)
|
remaining = n_drawers - len(docs)
|
||||||
@@ -291,14 +457,16 @@ class PalaceDataGenerator:
|
|||||||
|
|
||||||
docs.append(content)
|
docs.append(content)
|
||||||
ids.append(drawer_id)
|
ids.append(drawer_id)
|
||||||
metas.append({
|
metas.append(
|
||||||
"wing": wing,
|
{
|
||||||
"room": room,
|
"wing": wing,
|
||||||
"source_file": f"generated_{i:06d}.txt",
|
"room": room,
|
||||||
"chunk_index": i % 10,
|
"source_file": f"generated_{i:06d}.txt",
|
||||||
"added_by": "benchmark",
|
"chunk_index": i % 10,
|
||||||
"filed_at": datetime.now().isoformat(),
|
"added_by": "benchmark",
|
||||||
})
|
"filed_at": datetime.now().isoformat(),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Flush in batches
|
# Flush in batches
|
||||||
if len(docs) >= batch_size:
|
if len(docs) >= batch_size:
|
||||||
@@ -351,7 +519,9 @@ class PalaceDataGenerator:
|
|||||||
valid_to = None
|
valid_to = None
|
||||||
if self.rng.random() < 0.3:
|
if self.rng.random() < 0.3:
|
||||||
end_offset = self.rng.randint(30, 365)
|
end_offset = self.rng.randint(30, 365)
|
||||||
valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime("%Y-%m-%d")
|
valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime(
|
||||||
|
"%Y-%m-%d"
|
||||||
|
)
|
||||||
triples.append((subject, predicate, obj, valid_from, valid_to))
|
triples.append((subject, predicate, obj, valid_from, valid_to))
|
||||||
|
|
||||||
return entities, triples
|
return entities, triples
|
||||||
@@ -371,24 +541,28 @@ class PalaceDataGenerator:
|
|||||||
# Half are needle queries (known-good answers)
|
# Half are needle queries (known-good answers)
|
||||||
n_needle = min(n_queries // 2, len(self.needles))
|
n_needle = min(n_queries // 2, len(self.needles))
|
||||||
for needle in self.needles[:n_needle]:
|
for needle in self.needles[:n_needle]:
|
||||||
queries.append({
|
queries.append(
|
||||||
"query": needle["query"],
|
{
|
||||||
"expected_wing": needle["wing"],
|
"query": needle["query"],
|
||||||
"expected_room": needle["room"],
|
"expected_wing": needle["wing"],
|
||||||
"needle_id": needle["id"],
|
"expected_room": needle["room"],
|
||||||
"is_needle": True,
|
"needle_id": needle["id"],
|
||||||
})
|
"is_needle": True,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Other half are generic queries (measure latency, not recall)
|
# Other half are generic queries (measure latency, not recall)
|
||||||
n_generic = n_queries - n_needle
|
n_generic = n_queries - n_needle
|
||||||
for _ in range(n_generic):
|
for _ in range(n_generic):
|
||||||
queries.append({
|
queries.append(
|
||||||
"query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS),
|
{
|
||||||
"expected_wing": None,
|
"query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS),
|
||||||
"expected_room": None,
|
"expected_wing": None,
|
||||||
"needle_id": None,
|
"expected_room": None,
|
||||||
"is_needle": False,
|
"needle_id": None,
|
||||||
})
|
"is_needle": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
self.rng.shuffle(queries)
|
self.rng.shuffle(queries)
|
||||||
return queries
|
return queries
|
||||||
|
|||||||
+44
-17
@@ -45,16 +45,45 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
|
|||||||
baseline = json.load(f)
|
baseline = json.load(f)
|
||||||
|
|
||||||
regressions = []
|
regressions = []
|
||||||
# Metrics where HIGHER is worse (latency, memory, etc.)
|
# Keywords for metric direction — checked in order, first match wins.
|
||||||
higher_is_worse = {
|
# "improvement" is checked before "latency" so that composite names
|
||||||
"latency", "rss", "memory", "oom", "lock_failures", "elapsed",
|
# like "latency_improvement_pct" are classified correctly.
|
||||||
"p50_ms", "p95_ms", "p99_ms", "rss_delta_mb", "peak_rss_mb",
|
_higher_is_better_kw = [
|
||||||
}
|
"improvement",
|
||||||
# Metrics where LOWER is worse (throughput, recall, etc.)
|
"recall",
|
||||||
lower_is_worse = {
|
"throughput",
|
||||||
"recall", "throughput", "per_sec", "files_per_sec", "drawers_per_sec",
|
"per_sec",
|
||||||
"triples_per_sec", "improvement",
|
"files_per_sec",
|
||||||
}
|
"drawers_per_sec",
|
||||||
|
"triples_per_sec",
|
||||||
|
"speedup",
|
||||||
|
]
|
||||||
|
_higher_is_worse_kw = [
|
||||||
|
"latency",
|
||||||
|
"rss",
|
||||||
|
"memory",
|
||||||
|
"oom",
|
||||||
|
"lock_failures",
|
||||||
|
"elapsed",
|
||||||
|
"p50_ms",
|
||||||
|
"p95_ms",
|
||||||
|
"p99_ms",
|
||||||
|
"rss_delta_mb",
|
||||||
|
"peak_rss_mb",
|
||||||
|
"errors",
|
||||||
|
"failures",
|
||||||
|
]
|
||||||
|
|
||||||
|
def _metric_direction(name: str) -> str:
|
||||||
|
"""Return 'higher_better', 'higher_worse', or 'unknown'."""
|
||||||
|
low = name.lower()
|
||||||
|
for kw in _higher_is_better_kw:
|
||||||
|
if kw in low:
|
||||||
|
return "higher_better"
|
||||||
|
for kw in _higher_is_worse_kw:
|
||||||
|
if kw in low:
|
||||||
|
return "higher_worse"
|
||||||
|
return "unknown"
|
||||||
|
|
||||||
for category in baseline.get("results", {}):
|
for category in baseline.get("results", {}):
|
||||||
if category not in current.get("results", {}):
|
if category not in current.get("results", {}):
|
||||||
@@ -68,23 +97,21 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
|
|||||||
if base_val == 0:
|
if base_val == 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Determine direction
|
direction = _metric_direction(metric)
|
||||||
is_latency_like = any(kw in metric.lower() for kw in higher_is_worse)
|
|
||||||
is_throughput_like = any(kw in metric.lower() for kw in lower_is_worse)
|
|
||||||
|
|
||||||
if is_latency_like:
|
if direction == "higher_worse":
|
||||||
# Higher is worse — check if current exceeds baseline by threshold
|
# Higher is worse — check if current exceeds baseline by threshold
|
||||||
if curr_val > base_val * (1 + threshold):
|
if curr_val > base_val * (1 + threshold):
|
||||||
pct = ((curr_val - base_val) / base_val) * 100
|
pct = ((curr_val - base_val) / base_val) * 100
|
||||||
regressions.append(
|
regressions.append(
|
||||||
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)"
|
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
|
||||||
)
|
)
|
||||||
elif is_throughput_like:
|
elif direction == "higher_better":
|
||||||
# Lower is worse — check if current is below baseline by threshold
|
# Lower is worse — check if current is below baseline by threshold
|
||||||
if curr_val < base_val * (1 - threshold):
|
if curr_val < base_val * (1 - threshold):
|
||||||
pct = ((curr_val - base_val) / base_val) * 100
|
pct = ((curr_val - base_val) / base_val) * 100
|
||||||
regressions.append(
|
regressions.append(
|
||||||
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)"
|
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
|
||||||
)
|
)
|
||||||
|
|
||||||
return regressions
|
return regressions
|
||||||
|
|||||||
@@ -143,7 +143,10 @@ class TestBulkInsertPerformance:
|
|||||||
batch_end = min(batch_start + batch_size, n_docs)
|
batch_end = min(batch_start + batch_size, n_docs)
|
||||||
batch_docs = contents[batch_start:batch_end]
|
batch_docs = contents[batch_start:batch_end]
|
||||||
batch_ids = [f"batch_{i}" for i in range(batch_start, batch_end)]
|
batch_ids = [f"batch_{i}" for i in range(batch_start, batch_end)]
|
||||||
batch_metas = [{"wing": "test", "room": "bench", "chunk_index": i} for i in range(batch_start, batch_end)]
|
batch_metas = [
|
||||||
|
{"wing": "test", "room": "bench", "chunk_index": i}
|
||||||
|
for i in range(batch_start, batch_end)
|
||||||
|
]
|
||||||
col_batch.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
|
col_batch.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
|
||||||
batched_ms = (time.perf_counter() - start) * 1000
|
batched_ms = (time.perf_counter() - start) * 1000
|
||||||
|
|
||||||
|
|||||||
@@ -125,7 +125,9 @@ class TestChunkThroughput:
|
|||||||
chunks_per_sec = total_chunks / max(elapsed, 0.001)
|
chunks_per_sec = total_chunks / max(elapsed, 0.001)
|
||||||
kb_per_sec = (len(content) * n_iterations / 1024) / max(elapsed, 0.001)
|
kb_per_sec = (len(content) * n_iterations / 1024) / max(elapsed, 0.001)
|
||||||
|
|
||||||
record_metric("chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1))
|
record_metric(
|
||||||
|
"chunking", f"chunks_per_sec_at_{content_size_kb}kb", round(chunks_per_sec, 1)
|
||||||
|
)
|
||||||
record_metric("chunking", f"kb_per_sec_at_{content_size_kb}kb", round(kb_per_sec, 1))
|
record_metric("chunking", f"kb_per_sec_at_{content_size_kb}kb", round(kb_per_sec, 1))
|
||||||
|
|
||||||
|
|
||||||
@@ -160,4 +162,8 @@ class TestReingestSkipOverhead:
|
|||||||
|
|
||||||
record_metric("reingest", "skip_check_elapsed_sec", round(skip_elapsed, 2))
|
record_metric("reingest", "skip_check_elapsed_sec", round(skip_elapsed, 2))
|
||||||
record_metric("reingest", "files_checked", files_written)
|
record_metric("reingest", "files_checked", files_written)
|
||||||
record_metric("reingest", "skip_check_per_file_ms", round(skip_elapsed * 1000 / max(files_written, 1), 1))
|
record_metric(
|
||||||
|
"reingest",
|
||||||
|
"skip_check_per_file_ms",
|
||||||
|
round(skip_elapsed * 1000 / max(files_written, 1), 1),
|
||||||
|
)
|
||||||
|
|||||||
@@ -36,9 +36,7 @@ class TestTripleInsertionRate:
|
|||||||
# Measure triple insertion
|
# Measure triple insertion
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
for subject, predicate, obj, valid_from, valid_to in triples:
|
for subject, predicate, obj, valid_from, valid_to in triples:
|
||||||
kg.add_triple(
|
kg.add_triple(subject, predicate, obj, valid_from=valid_from, valid_to=valid_to)
|
||||||
subject, predicate, obj, valid_from=valid_from, valid_to=valid_to
|
|
||||||
)
|
|
||||||
elapsed = time.perf_counter() - start
|
elapsed = time.perf_counter() - start
|
||||||
|
|
||||||
triples_per_sec = n_triples / max(elapsed, 0.001)
|
triples_per_sec = n_triples / max(elapsed, 0.001)
|
||||||
@@ -128,7 +126,9 @@ class TestTemporalQueryAccuracy:
|
|||||||
kg.add_entity("ProjectB", "project")
|
kg.add_entity("ProjectB", "project")
|
||||||
|
|
||||||
# Alice worked on ProjectA from 2024-01 to 2024-06
|
# Alice worked on ProjectA from 2024-01 to 2024-06
|
||||||
kg.add_triple("Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30")
|
kg.add_triple(
|
||||||
|
"Alice", "works_on", "ProjectA", valid_from="2024-01-01", valid_to="2024-06-30"
|
||||||
|
)
|
||||||
# Alice worked on ProjectB from 2024-07 onwards
|
# Alice worked on ProjectB from 2024-07 onwards
|
||||||
kg.add_triple("Alice", "works_on", "ProjectB", valid_from="2024-07-01")
|
kg.add_triple("Alice", "works_on", "ProjectB", valid_from="2024-07-01")
|
||||||
|
|
||||||
@@ -145,8 +145,16 @@ class TestTemporalQueryAccuracy:
|
|||||||
# Query Alice as of September 2024 — should find ProjectB
|
# Query Alice as of September 2024 — should find ProjectB
|
||||||
result_sept = kg.query_entity("Alice", as_of="2024-09-15")
|
result_sept = kg.query_entity("Alice", as_of="2024-09-15")
|
||||||
|
|
||||||
record_metric("kg_temporal", "march_query_results", len(result_march) if isinstance(result_march, list) else 0)
|
record_metric(
|
||||||
record_metric("kg_temporal", "sept_query_results", len(result_sept) if isinstance(result_sept, list) else 0)
|
"kg_temporal",
|
||||||
|
"march_query_results",
|
||||||
|
len(result_march) if isinstance(result_march, list) else 0,
|
||||||
|
)
|
||||||
|
record_metric(
|
||||||
|
"kg_temporal",
|
||||||
|
"sept_query_results",
|
||||||
|
len(result_sept) if isinstance(result_sept, list) else 0,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
@@ -230,7 +238,9 @@ class TestSQLiteConcurrentAccess:
|
|||||||
fails = 0
|
fails = 0
|
||||||
for i in range(50):
|
for i in range(50):
|
||||||
try:
|
try:
|
||||||
kg.add_triple(f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01")
|
kg.add_triple(
|
||||||
|
f"E_{i % 50}", "new_rel", f"E_{(i + 7) % 50}", valid_from="2025-06-01"
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
fails += 1
|
fails += 1
|
||||||
write_errors.append(fails)
|
write_errors.append(fails)
|
||||||
|
|||||||
@@ -116,7 +116,9 @@ class TestLayer1UnboundedFetch:
|
|||||||
record_metric("layer1_filter", "unfiltered_ms", round(unfiltered_ms, 1))
|
record_metric("layer1_filter", "unfiltered_ms", round(unfiltered_ms, 1))
|
||||||
record_metric("layer1_filter", "filtered_ms", round(filtered_ms, 1))
|
record_metric("layer1_filter", "filtered_ms", round(filtered_ms, 1))
|
||||||
if unfiltered_ms > 0:
|
if unfiltered_ms > 0:
|
||||||
record_metric("layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1))
|
record_metric(
|
||||||
|
"layer1_filter", "speedup_pct", round((1 - filtered_ms / unfiltered_ms) * 100, 1)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
@@ -146,7 +148,9 @@ class TestWakeUpTokenBudget:
|
|||||||
record_metric("wakeup_budget", f"tokens_at_{n_drawers}", token_estimate)
|
record_metric("wakeup_budget", f"tokens_at_{n_drawers}", token_estimate)
|
||||||
record_metric("wakeup_budget", f"chars_at_{n_drawers}", len(text))
|
record_metric("wakeup_budget", f"chars_at_{n_drawers}", len(text))
|
||||||
|
|
||||||
assert token_estimate < 1200, f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers"
|
assert token_estimate < 1200, (
|
||||||
|
f"Wake-up exceeded budget: ~{token_estimate} tokens at {n_drawers} drawers"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
|
|||||||
@@ -63,7 +63,9 @@ class TestSearchMemoryProfile:
|
|||||||
record_metric("memory_search", "rss_end_mb", round(end_rss, 2))
|
record_metric("memory_search", "rss_end_mb", round(end_rss, 2))
|
||||||
record_metric("memory_search", "rss_growth_mb", round(growth, 2))
|
record_metric("memory_search", "rss_growth_mb", round(growth, 2))
|
||||||
record_metric("memory_search", "n_calls", n_calls)
|
record_metric("memory_search", "n_calls", n_calls)
|
||||||
record_metric("memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2))
|
record_metric(
|
||||||
|
"memory_search", "growth_per_100_calls_mb", round(growth / (n_calls / 100), 2)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
@@ -166,11 +168,13 @@ class TestHeapSnapshot:
|
|||||||
stats = snap_after.compare_to(snap_before, "lineno")
|
stats = snap_after.compare_to(snap_before, "lineno")
|
||||||
top_allocators = []
|
top_allocators = []
|
||||||
for stat in stats[:10]:
|
for stat in stats[:10]:
|
||||||
top_allocators.append({
|
top_allocators.append(
|
||||||
"file": str(stat.traceback),
|
{
|
||||||
"size_kb": round(stat.size / 1024, 1),
|
"file": str(stat.traceback),
|
||||||
"count": stat.count,
|
"size_kb": round(stat.size / 1024, 1),
|
||||||
})
|
"count": stat.count,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
total_growth_kb = sum(s["size_kb"] for s in top_allocators)
|
total_growth_kb = sum(s["size_kb"] for s in top_allocators)
|
||||||
record_metric("heap_search", "top_10_growth_kb", round(total_growth_kb, 1))
|
record_metric("heap_search", "top_10_growth_kb", round(total_growth_kb, 1))
|
||||||
|
|||||||
@@ -123,8 +123,12 @@ class TestFilterLatencyBenefit:
|
|||||||
record_metric("filter_latency", "avg_wing_filtered_ms", round(avg_wing, 1))
|
record_metric("filter_latency", "avg_wing_filtered_ms", round(avg_wing, 1))
|
||||||
record_metric("filter_latency", "avg_room_filtered_ms", round(avg_room, 1))
|
record_metric("filter_latency", "avg_room_filtered_ms", round(avg_room, 1))
|
||||||
if avg_none > 0:
|
if avg_none > 0:
|
||||||
record_metric("filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1))
|
record_metric(
|
||||||
record_metric("filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1))
|
"filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1)
|
||||||
|
)
|
||||||
|
record_metric(
|
||||||
|
"filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
|
|||||||
@@ -61,14 +61,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10):
|
|||||||
drawer_id = f"drawer_single_room_{hashlib.md5(needle_id.encode()).hexdigest()[:16]}"
|
drawer_id = f"drawer_single_room_{hashlib.md5(needle_id.encode()).hexdigest()[:16]}"
|
||||||
docs.append(content)
|
docs.append(content)
|
||||||
ids.append(drawer_id)
|
ids.append(drawer_id)
|
||||||
metas.append({
|
metas.append(
|
||||||
"wing": "concentrated",
|
{
|
||||||
"room": "single_room",
|
"wing": "concentrated",
|
||||||
"source_file": f"needle_{i}.txt",
|
"room": "single_room",
|
||||||
"chunk_index": 0,
|
"source_file": f"needle_{i}.txt",
|
||||||
"added_by": "threshold_bench",
|
"chunk_index": 0,
|
||||||
"filed_at": datetime.now().isoformat(),
|
"added_by": "threshold_bench",
|
||||||
})
|
"filed_at": datetime.now().isoformat(),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Fill with noise — all in the SAME room
|
# Fill with noise — all in the SAME room
|
||||||
remaining = n_drawers - len(docs)
|
remaining = n_drawers - len(docs)
|
||||||
@@ -77,14 +79,16 @@ def _populate_single_room(palace_path, n_drawers, n_needles=10):
|
|||||||
drawer_id = f"drawer_single_room_{hashlib.md5(f'noise_{i}'.encode()).hexdigest()[:16]}"
|
drawer_id = f"drawer_single_room_{hashlib.md5(f'noise_{i}'.encode()).hexdigest()[:16]}"
|
||||||
docs.append(content)
|
docs.append(content)
|
||||||
ids.append(drawer_id)
|
ids.append(drawer_id)
|
||||||
metas.append({
|
metas.append(
|
||||||
"wing": "concentrated",
|
{
|
||||||
"room": "single_room",
|
"wing": "concentrated",
|
||||||
"source_file": f"noise_{i:06d}.txt",
|
"room": "single_room",
|
||||||
"chunk_index": i % 10,
|
"source_file": f"noise_{i:06d}.txt",
|
||||||
"added_by": "threshold_bench",
|
"chunk_index": i % 10,
|
||||||
"filed_at": datetime.now().isoformat(),
|
"added_by": "threshold_bench",
|
||||||
})
|
"filed_at": datetime.now().isoformat(),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
if len(docs) >= batch_size:
|
if len(docs) >= batch_size:
|
||||||
col.add(documents=docs, ids=ids, metadatas=metas)
|
col.add(documents=docs, ids=ids, metadatas=metas)
|
||||||
|
|||||||
@@ -77,9 +77,7 @@ class TestSearchRecallAtScale:
|
|||||||
total_needle_queries = min(10, len(needle_info))
|
total_needle_queries = min(10, len(needle_info))
|
||||||
|
|
||||||
for needle in needle_info[:total_needle_queries]:
|
for needle in needle_info[:total_needle_queries]:
|
||||||
result = search_memories(
|
result = search_memories(needle["query"], palace_path=palace_path, n_results=10)
|
||||||
needle["query"], palace_path=palace_path, n_results=10
|
|
||||||
)
|
|
||||||
if "error" in result:
|
if "error" in result:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -150,8 +148,12 @@ class TestSearchFilteredVsUnfiltered:
|
|||||||
record_metric("search_filter", "avg_unfiltered_ms", round(avg_unfiltered, 1))
|
record_metric("search_filter", "avg_unfiltered_ms", round(avg_unfiltered, 1))
|
||||||
record_metric("search_filter", "avg_filtered_ms", round(avg_filtered, 1))
|
record_metric("search_filter", "avg_filtered_ms", round(avg_filtered, 1))
|
||||||
record_metric("search_filter", "latency_improvement_pct", round(latency_improvement, 1))
|
record_metric("search_filter", "latency_improvement_pct", round(latency_improvement, 1))
|
||||||
record_metric("search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3))
|
record_metric(
|
||||||
record_metric("search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3))
|
"search_filter", "unfiltered_recall_at_5", round(unfiltered_hits / max(n_queries, 1), 3)
|
||||||
|
)
|
||||||
|
record_metric(
|
||||||
|
"search_filter", "filtered_recall_at_5", round(filtered_hits / max(n_queries, 1), 3)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.benchmark
|
@pytest.mark.benchmark
|
||||||
@@ -167,9 +169,16 @@ class TestConcurrentSearch:
|
|||||||
from mempalace.searcher import search_memories
|
from mempalace.searcher import search_memories
|
||||||
|
|
||||||
queries = [
|
queries = [
|
||||||
"authentication", "database", "deployment", "error handling",
|
"authentication",
|
||||||
"testing", "monitoring", "caching", "middleware",
|
"database",
|
||||||
"serialization", "validation",
|
"deployment",
|
||||||
|
"error handling",
|
||||||
|
"testing",
|
||||||
|
"monitoring",
|
||||||
|
"caching",
|
||||||
|
"middleware",
|
||||||
|
"serialization",
|
||||||
|
"validation",
|
||||||
] * 3 # 30 total queries
|
] * 3 # 30 total queries
|
||||||
|
|
||||||
def run_search(query):
|
def run_search(query):
|
||||||
|
|||||||
Reference in New Issue
Block a user