fix: resolve formatting, regression logic, and pytest defaults

- Run ruff format on all benchmark files (fixes CI lint job)
- Fix check_regression() substring ambiguity: use ordered keyword matching
  so that "latency_improvement_pct" is correctly classified as higher-is-better
- Update stale comments in conftest.py referencing wrong fixture
- Add pytest addopts to skip benchmark/slow/stress markers by default
This commit is contained in:
Igor Lins e Silva
2026-04-08 10:56:39 -03:00
parent 7e4db33061
commit ebc26f3960
12 changed files with 383 additions and 138 deletions
+248 -74
View File
@@ -20,42 +20,150 @@ import yaml
# ── Scale configurations ─────────────────────────────────────────────────
# Sizing presets keyed by scale name. Every preset carries the same seven
# integer settings. Each scale name is defined exactly once: in a dict literal
# a repeated key silently keeps only the last value, so duplicate rows would be
# dead text that can drift away from the values actually in effect.
SCALE_CONFIGS = {
    "small": {
        "drawers": 1_000,
        "wings": 3,
        "rooms_per_wing": 5,
        "kg_entities": 50,
        "kg_triples": 200,
        "needles": 20,
        "search_queries": 20,
    },
    "medium": {
        "drawers": 10_000,
        "wings": 8,
        "rooms_per_wing": 12,
        "kg_entities": 200,
        "kg_triples": 2_000,
        "needles": 50,
        "search_queries": 50,
    },
    "large": {
        "drawers": 50_000,
        "wings": 15,
        "rooms_per_wing": 20,
        "kg_entities": 500,
        "kg_triples": 10_000,
        "needles": 100,
        "search_queries": 100,
    },
    "stress": {
        "drawers": 100_000,
        "wings": 25,
        "rooms_per_wing": 30,
        "kg_entities": 1_000,
        "kg_triples": 50_000,
        "needles": 200,
        "search_queries": 200,
    },
}
# ── Vocabulary banks for realistic content ───────────────────────────────
# Vocabulary of wing names. Each name is listed exactly once; a list keeps
# repeated entries, so a doubled row would double that name's weight under
# random selection.
# NOTE(review): presumably sampled to name wings, in which case repeated
# entries could produce colliding wing names — confirm against the generator.
WING_NAMES = [
    "webapp",
    "backend_api",
    "mobile_app",
    "data_pipeline",
    "ml_platform",
    "devops",
    "auth_service",
    "payments",
    "analytics",
    "docs_site",
    "cli_tool",
    "dashboard",
    "notification_service",
    "search_engine",
    "user_mgmt",
    "inventory",
    "reporting",
    "testing_infra",
    "monitoring",
    "email_service",
    "chat_bot",
    "file_storage",
    "scheduler",
    "gateway",
    "marketplace",
]
# Vocabulary of room/component names. Each name is listed exactly once so
# that random selection over the list is uniform across distinct names.
ROOM_NAMES = [
    "backend",
    "frontend",
    "api",
    "database",
    "auth",
    "tests",
    "docs",
    "config",
    "deployment",
    "models",
    "views",
    "controllers",
    "middleware",
    "utils",
    "schemas",
    "migrations",
    "fixtures",
    "scripts",
    "styles",
    "components",
    "hooks",
    "services",
    "routes",
    "templates",
    "static",
    "media",
    "logging",
    "cache",
    "queue",
    "workers",
]
# Vocabulary of technical terms used to fill generated text and queries.
# Each term is listed exactly once so that random selection over the list is
# uniform across distinct terms.
TECH_TERMS = [
    "authentication",
    "authorization",
    "middleware",
    "endpoint",
    "REST API",
    "GraphQL",
    "WebSocket",
    "database migration",
    "ORM",
    "query optimization",
    "caching strategy",
    "load balancer",
    "rate limiting",
    "pagination",
    "serialization",
    "validation",
    "error handling",
    "logging framework",
    "monitoring",
    "deployment pipeline",
    "CI/CD",
    "containerization",
    "microservice",
    "event sourcing",
    "message queue",
    "pub/sub",
    "connection pooling",
    "session management",
    "token refresh",
    "CORS",
    "SSL termination",
    "health check",
    "circuit breaker",
    "retry logic",
    "batch processing",
    "stream processing",
    "data pipeline",
    "ETL",
    "feature flag",
    "A/B testing",
    "blue-green deployment",
    "canary release",
]
CODE_SNIPPETS = [
@@ -75,17 +183,51 @@ PROSE_TEMPLATES = [
]
# Vocabulary of person names, one per initial from A to X. Each name is
# listed exactly once so that random selection over the list is uniform
# across distinct names.
ENTITY_NAMES = [
    "Alice",
    "Bob",
    "Carol",
    "Dave",
    "Eve",
    "Frank",
    "Grace",
    "Heidi",
    "Ivan",
    "Judy",
    "Karl",
    "Linda",
    "Mike",
    "Nina",
    "Oscar",
    "Pat",
    "Quinn",
    "Rita",
    "Steve",
    "Tina",
    "Ursula",
    "Victor",
    "Wendy",
    "Xander",
]
# Closed set of entity type labels, one per line like the other vocabulary
# lists in this module.
ENTITY_TYPES = [
    "person",
    "project",
    "tool",
    "concept",
    "team",
    "service",
]
# Vocabulary of relationship predicates. Each predicate is listed exactly
# once so that random selection over the list is uniform across distinct
# predicates.
PREDICATES = [
    "works_on",
    "manages",
    "reports_to",
    "collaborates_with",
    "created",
    "maintains",
    "uses",
    "depends_on",
    "replaced",
    "reviewed",
    "deployed",
    "tested",
    "documented",
    "mentors",
    "leads",
    "contributes_to",
]
@@ -136,13 +278,19 @@ class PalaceDataGenerator:
room = self.rng.choice(self.rooms_by_wing[wing])
needle_id = f"NEEDLE_{i:04d}"
content = f"{needle_id}: {topic}. This is a unique planted needle for recall benchmarking at scale."
self.needles.append({
"id": needle_id,
"content": content,
"wing": wing,
"room": room,
"query": topic.split(" uses ")[0] if " uses " in topic else topic.split(" set to ")[0] if " set to " in topic else topic[:60],
})
self.needles.append(
{
"id": needle_id,
"content": content,
"wing": wing,
"room": room,
"query": topic.split(" uses ")[0]
if " uses " in topic
else topic.split(" set to ")[0]
if " set to " in topic
else topic[:60],
}
)
def _random_text(self, min_chars=600, max_chars=900):
"""Generate a random text block of realistic content."""
@@ -159,21 +307,25 @@ class PalaceDataGenerator:
component=self.rng.choice(ROOM_NAMES),
task=self.rng.choice(TECH_TERMS),
month=self.rng.choice(["January", "February", "March", "April", "May"]),
quality=self.rng.choice(["performance", "readability", "test coverage", "latency"]),
quality=self.rng.choice(
["performance", "readability", "test coverage", "latency"]
),
decision=self.rng.choice(TECH_TERMS),
condition=self.rng.choice(TECH_TERMS) + " is null",
cause=self.rng.choice(["race condition", "null pointer", "timeout", "OOM"]),
fix="adding " + self.rng.choice(TECH_TERMS),
test_file=f"test_{self.rng.choice(ROOM_NAMES)}.py",
old_tech=self.rng.choice(["MySQL", "Flask", "REST", "Jenkins"]),
new_tech=self.rng.choice(["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]),
new_tech=self.rng.choice(
["PostgreSQL", "FastAPI", "GraphQL", "GitHub Actions"]
),
reason=self.rng.choice(TECH_TERMS),
date=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
date=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
percent=self.rng.randint(10, 80),
topic=self.rng.choice(TECH_TERMS),
person=self.rng.choice(ENTITY_NAMES),
action=self.rng.choice(["refactor", "migrate", "optimize", "test"]),
deadline=f"2025-{self.rng.randint(1,12):02d}-{self.rng.randint(1,28):02d}",
deadline=f"2025-{self.rng.randint(1, 12):02d}-{self.rng.randint(1, 28):02d}",
followup=self.rng.choice(TECH_TERMS),
feature_name=self.rng.choice(TECH_TERMS),
capability=self.rng.choice(TECH_TERMS),
@@ -182,7 +334,12 @@ class PalaceDataGenerator:
)
else:
words = self.rng.sample(TECH_TERMS, min(5, len(TECH_TERMS)))
text = " ".join(words) + ". " + self.rng.choice(TECH_TERMS) + " implementation details follow.\n"
text = (
" ".join(words)
+ ". "
+ self.rng.choice(TECH_TERMS)
+ " implementation details follow.\n"
)
parts.append(text)
total += len(text)
return "\n".join(parts)[:max_chars]
@@ -270,15 +427,24 @@ class PalaceDataGenerator:
needle_id = f"drawer_{needle['wing']}_{needle['room']}_{hashlib.md5(needle['id'].encode()).hexdigest()[:16]}"
docs.append(needle["content"])
ids.append(needle_id)
metas.append({
"wing": needle["wing"],
"room": needle["room"],
"source_file": f"needle_{needle['id']}.txt",
"chunk_index": 0,
"added_by": "benchmark",
"filed_at": datetime.now().isoformat(),
})
needle_info.append({"id": needle_id, "query": needle["query"], "wing": needle["wing"], "room": needle["room"]})
metas.append(
{
"wing": needle["wing"],
"room": needle["room"],
"source_file": f"needle_{needle['id']}.txt",
"chunk_index": 0,
"added_by": "benchmark",
"filed_at": datetime.now().isoformat(),
}
)
needle_info.append(
{
"id": needle_id,
"query": needle["query"],
"wing": needle["wing"],
"room": needle["room"],
}
)
# Fill remaining drawers with realistic content
remaining = n_drawers - len(docs)
@@ -291,14 +457,16 @@ class PalaceDataGenerator:
docs.append(content)
ids.append(drawer_id)
metas.append({
"wing": wing,
"room": room,
"source_file": f"generated_{i:06d}.txt",
"chunk_index": i % 10,
"added_by": "benchmark",
"filed_at": datetime.now().isoformat(),
})
metas.append(
{
"wing": wing,
"room": room,
"source_file": f"generated_{i:06d}.txt",
"chunk_index": i % 10,
"added_by": "benchmark",
"filed_at": datetime.now().isoformat(),
}
)
# Flush in batches
if len(docs) >= batch_size:
@@ -351,7 +519,9 @@ class PalaceDataGenerator:
valid_to = None
if self.rng.random() < 0.3:
end_offset = self.rng.randint(30, 365)
valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime("%Y-%m-%d")
valid_to = (base_date + timedelta(days=days_offset + end_offset)).strftime(
"%Y-%m-%d"
)
triples.append((subject, predicate, obj, valid_from, valid_to))
return entities, triples
@@ -371,24 +541,28 @@ class PalaceDataGenerator:
# Half are needle queries (known-good answers)
n_needle = min(n_queries // 2, len(self.needles))
for needle in self.needles[:n_needle]:
queries.append({
"query": needle["query"],
"expected_wing": needle["wing"],
"expected_room": needle["room"],
"needle_id": needle["id"],
"is_needle": True,
})
queries.append(
{
"query": needle["query"],
"expected_wing": needle["wing"],
"expected_room": needle["room"],
"needle_id": needle["id"],
"is_needle": True,
}
)
# Other half are generic queries (measure latency, not recall)
n_generic = n_queries - n_needle
for _ in range(n_generic):
queries.append({
"query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS),
"expected_wing": None,
"expected_room": None,
"needle_id": None,
"is_needle": False,
})
queries.append(
{
"query": self.rng.choice(TECH_TERMS) + " " + self.rng.choice(TECH_TERMS),
"expected_wing": None,
"expected_room": None,
"needle_id": None,
"is_needle": False,
}
)
self.rng.shuffle(queries)
return queries