fix(searcher): address Copilot review on #1306

- Dedup union candidates by (full_path, chunk_index), not basename —
  two files sharing a basename in different dirs no longer collide,
  and a vector hit on chunk N of a file no longer blocks BM25 from
  contributing chunk M of the same file.
- Validate candidate_strategy at the top of search_memories so invalid
  values fail consistently, not only when the call routes through the
  vector path.
- Trim hits back to n_results after the union+rerank pool grows;
  preserves the existing search_memories size contract that the MCP
  limit parameter is built on.
- Skip BM25-only injection when max_distance > 0.0; BM25-only
  candidates carry distance=None and would silently bypass the
  caller's strict vector-distance threshold.

Adds 4 tests covering: validation under vector_disabled, n_results
trim, max_distance honoring, and basename-collision dedup.
This commit is contained in:
Igor Lins e Silva
2026-05-03 06:09:10 -03:00
parent 6509071b8e
commit 3eb7980e55
2 changed files with 197 additions and 18 deletions
+96 -18
View File
@@ -381,6 +381,7 @@ def _bm25_only_via_sqlite(
room: str = None, room: str = None,
n_results: int = 5, n_results: int = 5,
max_candidates: int = 500, max_candidates: int = 500,
_include_internal: bool = False,
) -> dict: ) -> dict:
"""BM25-only search reading drawers directly from chroma.sqlite3. """BM25-only search reading drawers directly from chroma.sqlite3.
@@ -518,17 +519,25 @@ def _bm25_only_via_sqlite(
continue continue
if room and meta.get("room") != room: if room and meta.get("room") != room:
continue continue
full_source = meta.get("source_file", "") or ""
candidates.append( candidates.append(
{ {
"text": d["text"], "text": d["text"],
"wing": meta.get("wing", "unknown"), "wing": meta.get("wing", "unknown"),
"room": meta.get("room", "unknown"), "room": meta.get("room", "unknown"),
"source_file": Path(meta.get("source_file", "?") or "?").name, "source_file": Path(full_source).name if full_source else "?",
"created_at": meta.get("filed_at", "unknown"), "created_at": meta.get("filed_at", "unknown"),
# No vector distance available in BM25-only mode. # No vector distance available in BM25-only mode.
"similarity": None, "similarity": None,
"distance": None, "distance": None,
"matched_via": "bm25_sqlite", "matched_via": "bm25_sqlite",
# Internal: full path + chunk_index let callers (notably
# candidate_strategy="union") dedupe at chunk granularity
# rather than basename — two files in different directories
# may share a basename, and one source_file is split across
# multiple chunks. Stripped before this helper returns.
"_source_file_full": full_source,
"_chunk_index": meta.get("chunk_index"),
} }
) )
@@ -543,6 +552,12 @@ def _bm25_only_via_sqlite(
hits = candidates[:n_results] hits = candidates[:n_results]
for h in hits: for h in hits:
h.pop("_score", None) h.pop("_score", None)
# Strip internal fields by default so the public BM25-only fallback
# response stays clean. Callers that need chunk-precise dedup
# (notably the union-merge path) opt in via _include_internal.
if not _include_internal:
h.pop("_source_file_full", None)
h.pop("_chunk_index", None)
return { return {
"query": query, "query": query,
@@ -561,17 +576,33 @@ def _merge_bm25_union_candidates(
wing: str, wing: str,
room: str, room: str,
n_results: int, n_results: int,
max_distance: float = 0.0,
) -> None: ) -> None:
"""Append top-K BM25-only candidates from sqlite into ``hits`` in place. """Append top-K BM25-only candidates from sqlite into ``hits`` in place.
Used by ``search_memories(..., candidate_strategy="union")`` to widen Used by ``search_memories(..., candidate_strategy="union")`` to widen
the rerank pool's *source* (not just its size) — vector-only candidate the rerank pool's *source* (not just its size) — vector-only candidate
selection skips docs whose embeddings are far from the query even when selection skips docs whose embeddings are far from the query even when
BM25 signal is strong. We dedupe against existing hits by ``source_file`` BM25 signal is strong.
so vector-side entries (which carry real distance values) win on
collisions; BM25-only additions are marked with ``distance=None`` so Dedup is chunk-precise: the key is ``(_source_file_full, _chunk_index)``
``_hybrid_rank`` scores them on BM25 contribution alone. so two files sharing a basename in different directories don't collide,
and a vector hit on chunk N of a file doesn't block BM25 from
contributing chunk M of the same file. Falls back to ``source_file``
only when full-path/chunk metadata is absent.
BM25-only additions carry ``distance=None`` so ``_hybrid_rank`` scores
them on BM25 contribution alone.
When ``max_distance > 0.0`` (a strict vector-distance threshold is
set), BM25-only candidates are skipped entirely — they have no vector
distance to satisfy the threshold, and silently injecting them would
break the existing ``max_distance`` guarantee that hybrid results lie
within the requested vector-distance bound.
""" """
if max_distance > 0.0:
return
try: try:
bm25_extra = _bm25_only_via_sqlite( bm25_extra = _bm25_only_via_sqlite(
query, query,
@@ -579,21 +610,32 @@ def _merge_bm25_union_candidates(
wing=wing, wing=wing,
room=room, room=room,
n_results=n_results * 3, n_results=n_results * 3,
_include_internal=True,
).get("results", []) ).get("results", [])
except Exception: except Exception:
logger.debug("candidate_strategy=union: BM25 fetch failed", exc_info=True) logger.debug("candidate_strategy=union: BM25 fetch failed", exc_info=True)
return return
seen_sources = {h.get("source_file") for h in hits} def _dedup_key(entry: dict):
full = entry.get("_source_file_full")
ci = entry.get("_chunk_index")
if full and ci is not None:
return (full, ci)
# Fall back to basename only when richer metadata is missing —
# avoids silently dropping candidates on legacy data while still
# giving chunk-precise dedup whenever the metadata is present.
return entry.get("source_file")
seen = {_dedup_key(h) for h in hits}
for bh in bm25_extra: for bh in bm25_extra:
key = bh.get("source_file") key = _dedup_key(bh)
if not key or key == "?" or key in seen_sources: if not key or key == "?" or key in seen:
continue continue
bh["distance"] = None bh["distance"] = None
bh["effective_distance"] = None bh["effective_distance"] = None
bh["closet_boost"] = 0.0 bh["closet_boost"] = 0.0
hits.append(bh) hits.append(bh)
seen_sources.add(key) seen.add(key)
# Strategy dispatch — keeps search_memories' branch count under the # Strategy dispatch — keeps search_memories' branch count under the
@@ -605,6 +647,19 @@ _CANDIDATE_MERGERS = {
} }
def _validate_candidate_strategy(strategy: str) -> None:
"""Raise ``ValueError`` for unknown strategies.
Called eagerly at the top of ``search_memories`` so invalid values
fail consistently regardless of whether the call routes through the
vector path, the BM25-only fallback, or returns an early error dict.
"""
if strategy not in _CANDIDATE_MERGERS:
raise ValueError(
f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, got {strategy!r}"
)
def _apply_candidate_strategy( def _apply_candidate_strategy(
strategy: str, strategy: str,
hits: list, hits: list,
@@ -613,18 +668,16 @@ def _apply_candidate_strategy(
wing: str, wing: str,
room: str, room: str,
n_results: int, n_results: int,
max_distance: float = 0.0,
) -> None: ) -> None:
"""Dispatch to the registered merger for ``strategy``. """Dispatch to the registered merger for ``strategy``.
Raises ``ValueError`` for unknown strategies. ``"vector"`` is a no-op. Strategy validity is assumed (``_validate_candidate_strategy`` runs
earlier); ``"vector"`` is a no-op.
""" """
if strategy not in _CANDIDATE_MERGERS:
raise ValueError(
f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, " f"got {strategy!r}"
)
merger = _CANDIDATE_MERGERS[strategy] merger = _CANDIDATE_MERGERS[strategy]
if merger is not None: if merger is not None:
merger(hits, query, palace_path, wing, room, n_results) merger(hits, query, palace_path, wing, room, n_results, max_distance=max_distance)
def search_memories( def search_memories(
@@ -669,7 +722,16 @@ def search_memories(
by scenario descriptions). Adds one sqlite open + FTS5 MATCH by scenario descriptions). Adds one sqlite open + FTS5 MATCH
per query; perf cost is small but unmeasured at corpus scale. per query; perf cost is small but unmeasured at corpus scale.
Opt in until the cost is characterized. Opt in until the cost is characterized.
When ``max_distance > 0.0`` is also set, BM25-only candidates
are skipped — they have no vector distance and would silently
violate the requested distance threshold.
""" """
# Validate the strategy eagerly so invalid values fail the same way
# regardless of whether the call routes through the vector path or
# the BM25-only fallback below.
_validate_candidate_strategy(candidate_strategy)
if vector_disabled: if vector_disabled:
return _bm25_only_via_sqlite( return _bm25_only_via_sqlite(
query, query,
@@ -848,10 +910,26 @@ def search_memories(
# Candidate strategy hook: optionally widen the rerank pool's *source* # Candidate strategy hook: optionally widen the rerank pool's *source*
# before ranking. Default ("vector") is a no-op; "union" merges top-K # before ranking. Default ("vector") is a no-op; "union" merges top-K
# BM25 candidates from sqlite. See `_apply_candidate_strategy`. # BM25 candidates from sqlite. See `_apply_candidate_strategy`.
_apply_candidate_strategy(candidate_strategy, hits, query, palace_path, wing, room, n_results) # ``max_distance`` is forwarded so union mode can refuse to inject
# BM25-only (distance=None) candidates that would silently bypass the
# caller's strict distance threshold.
_apply_candidate_strategy(
candidate_strategy,
hits,
query,
palace_path,
wing,
room,
n_results,
max_distance=max_distance,
)
# BM25 hybrid re-rank within the final candidate set. # BM25 hybrid re-rank within the final candidate set, then trim back
hits = _hybrid_rank(hits, query) # to the requested size. Without the trim, ``candidate_strategy="union"``
# would return up to 4× ``n_results`` (vector hits + BM25 union pool),
# breaking the existing ``search_memories`` size contract that the MCP
# ``limit`` parameter is built on.
hits = _hybrid_rank(hits, query)[:n_results]
for h in hits: for h in hits:
h.pop("_sort_key", None) h.pop("_sort_key", None)
h.pop("_source_file_full", None) h.pop("_source_file_full", None)
+101
View File
@@ -113,6 +113,107 @@ class TestCandidateUnion:
with pytest.raises(ValueError, match="candidate_strategy"): with pytest.raises(ValueError, match="candidate_strategy"):
search_memories("anything", palace, n_results=5, candidate_strategy="bogus") search_memories("anything", palace, n_results=5, candidate_strategy="bogus")
def test_invalid_strategy_raises_even_when_vector_disabled(self, tmp_path):
"""Validation must happen before the ``vector_disabled`` early return —
invalid values must fail consistently regardless of routing."""
palace = str(tmp_path / "palace")
_seed_drawers(palace)
import pytest
with pytest.raises(ValueError, match="candidate_strategy"):
search_memories(
"anything",
palace,
n_results=5,
vector_disabled=True,
candidate_strategy="bogus",
)
def test_union_respects_n_results_limit(self, tmp_path):
"""When the merged candidate set is larger than ``n_results``, the
result must be trimmed back to the requested size — the MCP
``limit`` contract depends on this invariant."""
palace = str(tmp_path / "palace")
_seed_drawers(palace)
# 4-doc corpus, n_results=2 → union pool can grow to ~8 candidates,
# rerank reorders them, but final list must respect the cap.
result = search_memories(_NARRATIVE_QUERY, palace, n_results=2, candidate_strategy="union")
assert (
len(result["results"]) <= 2
), f"union must trim to n_results=2; got {len(result['results'])} results"
def test_union_skipped_when_max_distance_set(self, tmp_path):
"""``max_distance`` is a vector-distance threshold; BM25-only
candidates have ``distance=None`` and cannot satisfy it. Union
must not silently inject them when a strict threshold is set,
otherwise the existing ``max_distance`` guarantee regresses."""
palace = str(tmp_path / "palace")
_seed_drawers(palace)
# Sanity: without max_distance, union surfaces the BM25-strong doc.
unfiltered = search_memories(
_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union"
)
assert "brand_voice_D4.md" in {h["source_file"] for h in unfiltered["results"]}
# With a tight max_distance, union must NOT inject BM25-only hits —
# every returned hit must have a real (non-None) distance.
filtered = search_memories(
_NARRATIVE_QUERY,
palace,
n_results=5,
candidate_strategy="union",
max_distance=0.5,
)
for h in filtered["results"]:
assert h.get("distance") is not None, (
f"union under max_distance must not inject BM25-only "
f"(distance=None) candidates; offending hit: {h}"
)
assert h["distance"] <= 0.5, f"hit violates max_distance=0.5: distance={h['distance']}"
def test_union_dedup_is_chunk_precise_not_basename(self, tmp_path):
"""Two files with the same basename in different directories must
not collide — union must dedup on full path (or chunk-level key),
not on basename alone. Otherwise a BM25-strong README from one
directory silently shadows a BM25-strong README from another.
"""
palace = str(tmp_path / "palace")
col = get_collection(palace, create=True)
col.upsert(
ids=["A_README", "B_README", "narrative"],
documents=[
# Both README files share the basename README.md but live
# in different directories. Each contains distinctive
# terminology a query might surface via BM25.
"PROJECT ALPHA: configuration for the Frobnitz subsystem. "
"Set FROBNITZ_TIMEOUT=30 to enable widget rotation.",
"PROJECT BETA: configuration for the Wibble subsystem. "
"Set WIBBLE_THRESHOLD=0.5 to enable signal smoothing.",
"Engineers occasionally chat about how the legacy "
"subsystems all need their config knobs tweaked.",
],
metadatas=[
{"wing": "code", "room": "docs", "source_file": "alpha/README.md"},
{"wing": "code", "room": "docs", "source_file": "beta/README.md"},
{"wing": "code", "room": "docs", "source_file": "chat.md"},
],
)
# Query that hits BM25 for BOTH READMEs (distinct vocab from each).
# Vector-only might pick the chat doc as semantically "closest";
# union must surface both READMEs without basename collision.
result = search_memories(
"FROBNITZ_TIMEOUT WIBBLE_THRESHOLD configuration",
palace,
n_results=5,
candidate_strategy="union",
)
sources = [h["source_file"] for h in result["results"]]
readme_count = sum(1 for s in sources if s == "README.md")
assert readme_count >= 2, (
f"union must surface both README.md files from different dirs "
f"(basename collision would drop one); got sources={sources}"
)
class TestHybridRankTolerantOfMissingDistance: class TestHybridRankTolerantOfMissingDistance:
"""``_hybrid_rank`` accepts ``distance=None`` — required for BM25-only """``_hybrid_rank`` accepts ``distance=None`` — required for BM25-only