fix: resolve formatting, regression logic, and pytest defaults

- Run ruff format on all benchmark files (fixes CI lint job)
- Fix check_regression() substring ambiguity: ordered keyword matching
  so "latency_improvement_pct" is correctly classified as higher-is-better
- Update stale comments in conftest.py referencing wrong fixture
- Add pytest addopts to skip benchmark/slow/stress markers by default
This commit is contained in:
Igor Lins e Silva
2026-04-08 10:56:39 -03:00
parent 7e4db33061
commit ebc26f3960
12 changed files with 383 additions and 138 deletions
+44 -17
View File
@@ -45,16 +45,45 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
baseline = json.load(f)
regressions = []
# Metrics where HIGHER is worse (latency, memory, etc.)
higher_is_worse = {
"latency", "rss", "memory", "oom", "lock_failures", "elapsed",
"p50_ms", "p95_ms", "p99_ms", "rss_delta_mb", "peak_rss_mb",
}
# Metrics where LOWER is worse (throughput, recall, etc.)
lower_is_worse = {
"recall", "throughput", "per_sec", "files_per_sec", "drawers_per_sec",
"triples_per_sec", "improvement",
}
# Keywords for metric direction — checked in order, first match wins.
# "improvement" is checked before "latency" so that composite names
# like "latency_improvement_pct" are classified correctly.
_higher_is_better_kw = [
"improvement",
"recall",
"throughput",
"per_sec",
"files_per_sec",
"drawers_per_sec",
"triples_per_sec",
"speedup",
]
_higher_is_worse_kw = [
"latency",
"rss",
"memory",
"oom",
"lock_failures",
"elapsed",
"p50_ms",
"p95_ms",
"p99_ms",
"rss_delta_mb",
"peak_rss_mb",
"errors",
"failures",
]
def _metric_direction(name: str) -> str:
"""Return 'higher_better', 'higher_worse', or 'unknown'."""
low = name.lower()
for kw in _higher_is_better_kw:
if kw in low:
return "higher_better"
for kw in _higher_is_worse_kw:
if kw in low:
return "higher_worse"
return "unknown"
for category in baseline.get("results", {}):
if category not in current.get("results", {}):
@@ -68,23 +97,21 @@ def check_regression(current_report: str, baseline_report: str, threshold: float
if base_val == 0:
continue
# Determine direction
is_latency_like = any(kw in metric.lower() for kw in higher_is_worse)
is_throughput_like = any(kw in metric.lower() for kw in lower_is_worse)
direction = _metric_direction(metric)
if is_latency_like:
if direction == "higher_worse":
# Higher is worse — check if current exceeds baseline by threshold
if curr_val > base_val * (1 + threshold):
pct = ((curr_val - base_val) / base_val) * 100
regressions.append(
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)"
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
)
elif is_throughput_like:
elif direction == "higher_better":
# Lower is worse — check if current is below baseline by threshold
if curr_val < base_val * (1 - threshold):
pct = ((curr_val - base_val) / base_val) * 100
regressions.append(
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold*100:.0f}%)"
f"{category}/{metric}: {base_val:.2f} -> {curr_val:.2f} ({pct:+.1f}%, threshold {threshold * 100:.0f}%)"
)
return regressions