fix(searcher): address Copilot review on #1306
- Dedup union candidates by (full_path, chunk_index), not basename — two files sharing a basename in different dirs no longer collide, and a vector hit on chunk N of a file no longer blocks BM25 from contributing chunk M of the same file. - Validate candidate_strategy at the top of search_memories so invalid values fail consistently, not only when the call routes through the vector path. - Trim hits back to n_results after the union+rerank pool grows; preserves the existing search_memories size contract that the MCP limit parameter is built on. - Skip BM25-only injection when max_distance > 0.0; BM25-only candidates carry distance=None and would silently bypass the caller's strict vector-distance threshold. Adds 4 tests covering: validation under vector_disabled, n_results trim, max_distance honoring, and basename-collision dedup.
This commit is contained in:
+96
-18
@@ -381,6 +381,7 @@ def _bm25_only_via_sqlite(
|
||||
room: str = None,
|
||||
n_results: int = 5,
|
||||
max_candidates: int = 500,
|
||||
_include_internal: bool = False,
|
||||
) -> dict:
|
||||
"""BM25-only search reading drawers directly from chroma.sqlite3.
|
||||
|
||||
@@ -518,17 +519,25 @@ def _bm25_only_via_sqlite(
|
||||
continue
|
||||
if room and meta.get("room") != room:
|
||||
continue
|
||||
full_source = meta.get("source_file", "") or ""
|
||||
candidates.append(
|
||||
{
|
||||
"text": d["text"],
|
||||
"wing": meta.get("wing", "unknown"),
|
||||
"room": meta.get("room", "unknown"),
|
||||
"source_file": Path(meta.get("source_file", "?") or "?").name,
|
||||
"source_file": Path(full_source).name if full_source else "?",
|
||||
"created_at": meta.get("filed_at", "unknown"),
|
||||
# No vector distance available in BM25-only mode.
|
||||
"similarity": None,
|
||||
"distance": None,
|
||||
"matched_via": "bm25_sqlite",
|
||||
# Internal: full path + chunk_index let callers (notably
|
||||
# candidate_strategy="union") dedupe at chunk granularity
|
||||
# rather than basename — two files in different directories
|
||||
# may share a basename, and one source_file is split across
|
||||
# multiple chunks. Stripped before this helper returns.
|
||||
"_source_file_full": full_source,
|
||||
"_chunk_index": meta.get("chunk_index"),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -543,6 +552,12 @@ def _bm25_only_via_sqlite(
|
||||
hits = candidates[:n_results]
|
||||
for h in hits:
|
||||
h.pop("_score", None)
|
||||
# Strip internal fields by default so the public BM25-only fallback
|
||||
# response stays clean. Callers that need chunk-precise dedup
|
||||
# (notably the union-merge path) opt in via _include_internal.
|
||||
if not _include_internal:
|
||||
h.pop("_source_file_full", None)
|
||||
h.pop("_chunk_index", None)
|
||||
|
||||
return {
|
||||
"query": query,
|
||||
@@ -561,17 +576,33 @@ def _merge_bm25_union_candidates(
|
||||
wing: str,
|
||||
room: str,
|
||||
n_results: int,
|
||||
max_distance: float = 0.0,
|
||||
) -> None:
|
||||
"""Append top-K BM25-only candidates from sqlite into ``hits`` in place.
|
||||
|
||||
Used by ``search_memories(..., candidate_strategy="union")`` to widen
|
||||
the rerank pool's *source* (not just its size) — vector-only candidate
|
||||
selection skips docs whose embeddings are far from the query even when
|
||||
BM25 signal is strong. We dedupe against existing hits by ``source_file``
|
||||
so vector-side entries (which carry real distance values) win on
|
||||
collisions; BM25-only additions are marked with ``distance=None`` so
|
||||
``_hybrid_rank`` scores them on BM25 contribution alone.
|
||||
BM25 signal is strong.
|
||||
|
||||
Dedup is chunk-precise: the key is ``(_source_file_full, _chunk_index)``
|
||||
so two files sharing a basename in different directories don't collide,
|
||||
and a vector hit on chunk N of a file doesn't block BM25 from
|
||||
contributing chunk M of the same file. Falls back to ``source_file``
|
||||
only when full-path/chunk metadata is absent.
|
||||
|
||||
BM25-only additions carry ``distance=None`` so ``_hybrid_rank`` scores
|
||||
them on BM25 contribution alone.
|
||||
|
||||
When ``max_distance > 0.0`` (a strict vector-distance threshold is
|
||||
set), BM25-only candidates are skipped entirely — they have no vector
|
||||
distance to satisfy the threshold, and silently injecting them would
|
||||
break the existing ``max_distance`` guarantee that hybrid results lie
|
||||
within the requested vector-distance bound.
|
||||
"""
|
||||
if max_distance > 0.0:
|
||||
return
|
||||
|
||||
try:
|
||||
bm25_extra = _bm25_only_via_sqlite(
|
||||
query,
|
||||
@@ -579,21 +610,32 @@ def _merge_bm25_union_candidates(
|
||||
wing=wing,
|
||||
room=room,
|
||||
n_results=n_results * 3,
|
||||
_include_internal=True,
|
||||
).get("results", [])
|
||||
except Exception:
|
||||
logger.debug("candidate_strategy=union: BM25 fetch failed", exc_info=True)
|
||||
return
|
||||
|
||||
seen_sources = {h.get("source_file") for h in hits}
|
||||
def _dedup_key(entry: dict):
|
||||
full = entry.get("_source_file_full")
|
||||
ci = entry.get("_chunk_index")
|
||||
if full and ci is not None:
|
||||
return (full, ci)
|
||||
# Fall back to basename only when richer metadata is missing —
|
||||
# avoids silently dropping candidates on legacy data while still
|
||||
# giving chunk-precise dedup whenever the metadata is present.
|
||||
return entry.get("source_file")
|
||||
|
||||
seen = {_dedup_key(h) for h in hits}
|
||||
for bh in bm25_extra:
|
||||
key = bh.get("source_file")
|
||||
if not key or key == "?" or key in seen_sources:
|
||||
key = _dedup_key(bh)
|
||||
if not key or key == "?" or key in seen:
|
||||
continue
|
||||
bh["distance"] = None
|
||||
bh["effective_distance"] = None
|
||||
bh["closet_boost"] = 0.0
|
||||
hits.append(bh)
|
||||
seen_sources.add(key)
|
||||
seen.add(key)
|
||||
|
||||
|
||||
# Strategy dispatch — keeps search_memories' branch count under the
|
||||
@@ -605,6 +647,19 @@ _CANDIDATE_MERGERS = {
|
||||
}
|
||||
|
||||
|
||||
def _validate_candidate_strategy(strategy: str) -> None:
|
||||
"""Raise ``ValueError`` for unknown strategies.
|
||||
|
||||
Called eagerly at the top of ``search_memories`` so invalid values
|
||||
fail consistently regardless of whether the call routes through the
|
||||
vector path, the BM25-only fallback, or returns an early error dict.
|
||||
"""
|
||||
if strategy not in _CANDIDATE_MERGERS:
|
||||
raise ValueError(
|
||||
f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, got {strategy!r}"
|
||||
)
|
||||
|
||||
|
||||
def _apply_candidate_strategy(
|
||||
strategy: str,
|
||||
hits: list,
|
||||
@@ -613,18 +668,16 @@ def _apply_candidate_strategy(
|
||||
wing: str,
|
||||
room: str,
|
||||
n_results: int,
|
||||
max_distance: float = 0.0,
|
||||
) -> None:
|
||||
"""Dispatch to the registered merger for ``strategy``.
|
||||
|
||||
Raises ``ValueError`` for unknown strategies. ``"vector"`` is a no-op.
|
||||
Strategy validity is assumed (``_validate_candidate_strategy`` runs
|
||||
earlier); ``"vector"`` is a no-op.
|
||||
"""
|
||||
if strategy not in _CANDIDATE_MERGERS:
|
||||
raise ValueError(
|
||||
f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, " f"got {strategy!r}"
|
||||
)
|
||||
merger = _CANDIDATE_MERGERS[strategy]
|
||||
if merger is not None:
|
||||
merger(hits, query, palace_path, wing, room, n_results)
|
||||
merger(hits, query, palace_path, wing, room, n_results, max_distance=max_distance)
|
||||
|
||||
|
||||
def search_memories(
|
||||
@@ -669,7 +722,16 @@ def search_memories(
|
||||
by scenario descriptions). Adds one sqlite open + FTS5 MATCH
|
||||
per query; perf cost is small but unmeasured at corpus scale.
|
||||
Opt in until the cost is characterized.
|
||||
|
||||
When ``max_distance > 0.0`` is also set, BM25-only candidates
|
||||
are skipped — they have no vector distance and would silently
|
||||
violate the requested distance threshold.
|
||||
"""
|
||||
# Validate the strategy eagerly so invalid values fail the same way
|
||||
# regardless of whether the call routes through the vector path or
|
||||
# the BM25-only fallback below.
|
||||
_validate_candidate_strategy(candidate_strategy)
|
||||
|
||||
if vector_disabled:
|
||||
return _bm25_only_via_sqlite(
|
||||
query,
|
||||
@@ -848,10 +910,26 @@ def search_memories(
|
||||
# Candidate strategy hook: optionally widen the rerank pool's *source*
|
||||
# before ranking. Default ("vector") is a no-op; "union" merges top-K
|
||||
# BM25 candidates from sqlite. See `_apply_candidate_strategy`.
|
||||
_apply_candidate_strategy(candidate_strategy, hits, query, palace_path, wing, room, n_results)
|
||||
# ``max_distance`` is forwarded so union mode can refuse to inject
|
||||
# BM25-only (distance=None) candidates that would silently bypass the
|
||||
# caller's strict distance threshold.
|
||||
_apply_candidate_strategy(
|
||||
candidate_strategy,
|
||||
hits,
|
||||
query,
|
||||
palace_path,
|
||||
wing,
|
||||
room,
|
||||
n_results,
|
||||
max_distance=max_distance,
|
||||
)
|
||||
|
||||
# BM25 hybrid re-rank within the final candidate set.
|
||||
hits = _hybrid_rank(hits, query)
|
||||
# BM25 hybrid re-rank within the final candidate set, then trim back
|
||||
# to the requested size. Without the trim, ``candidate_strategy="union"``
|
||||
# would return up to 4× ``n_results`` (vector hits + BM25 union pool),
|
||||
# breaking the existing ``search_memories`` size contract that the MCP
|
||||
# ``limit`` parameter is built on.
|
||||
hits = _hybrid_rank(hits, query)[:n_results]
|
||||
for h in hits:
|
||||
h.pop("_sort_key", None)
|
||||
h.pop("_source_file_full", None)
|
||||
|
||||
@@ -113,6 +113,107 @@ class TestCandidateUnion:
|
||||
with pytest.raises(ValueError, match="candidate_strategy"):
|
||||
search_memories("anything", palace, n_results=5, candidate_strategy="bogus")
|
||||
|
||||
def test_invalid_strategy_raises_even_when_vector_disabled(self, tmp_path):
|
||||
"""Validation must happen before the ``vector_disabled`` early return —
|
||||
invalid values must fail consistently regardless of routing."""
|
||||
palace = str(tmp_path / "palace")
|
||||
_seed_drawers(palace)
|
||||
import pytest
|
||||
|
||||
with pytest.raises(ValueError, match="candidate_strategy"):
|
||||
search_memories(
|
||||
"anything",
|
||||
palace,
|
||||
n_results=5,
|
||||
vector_disabled=True,
|
||||
candidate_strategy="bogus",
|
||||
)
|
||||
|
||||
def test_union_respects_n_results_limit(self, tmp_path):
|
||||
"""When the merged candidate set is larger than ``n_results``, the
|
||||
result must be trimmed back to the requested size — the MCP
|
||||
``limit`` contract depends on this invariant."""
|
||||
palace = str(tmp_path / "palace")
|
||||
_seed_drawers(palace)
|
||||
# 4-doc corpus, n_results=2 → union pool can grow to ~8 candidates,
|
||||
# rerank reorders them, but final list must respect the cap.
|
||||
result = search_memories(_NARRATIVE_QUERY, palace, n_results=2, candidate_strategy="union")
|
||||
assert (
|
||||
len(result["results"]) <= 2
|
||||
), f"union must trim to n_results=2; got {len(result['results'])} results"
|
||||
|
||||
def test_union_skipped_when_max_distance_set(self, tmp_path):
|
||||
"""``max_distance`` is a vector-distance threshold; BM25-only
|
||||
candidates have ``distance=None`` and cannot satisfy it. Union
|
||||
must not silently inject them when a strict threshold is set,
|
||||
otherwise the existing ``max_distance`` guarantee regresses."""
|
||||
palace = str(tmp_path / "palace")
|
||||
_seed_drawers(palace)
|
||||
# Sanity: without max_distance, union surfaces the BM25-strong doc.
|
||||
unfiltered = search_memories(
|
||||
_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union"
|
||||
)
|
||||
assert "brand_voice_D4.md" in {h["source_file"] for h in unfiltered["results"]}
|
||||
|
||||
# With a tight max_distance, union must NOT inject BM25-only hits —
|
||||
# every returned hit must have a real (non-None) distance.
|
||||
filtered = search_memories(
|
||||
_NARRATIVE_QUERY,
|
||||
palace,
|
||||
n_results=5,
|
||||
candidate_strategy="union",
|
||||
max_distance=0.5,
|
||||
)
|
||||
for h in filtered["results"]:
|
||||
assert h.get("distance") is not None, (
|
||||
f"union under max_distance must not inject BM25-only "
|
||||
f"(distance=None) candidates; offending hit: {h}"
|
||||
)
|
||||
assert h["distance"] <= 0.5, f"hit violates max_distance=0.5: distance={h['distance']}"
|
||||
|
||||
def test_union_dedup_is_chunk_precise_not_basename(self, tmp_path):
|
||||
"""Two files with the same basename in different directories must
|
||||
not collide — union must dedup on full path (or chunk-level key),
|
||||
not on basename alone. Otherwise a BM25-strong README from one
|
||||
directory silently shadows a BM25-strong README from another.
|
||||
"""
|
||||
palace = str(tmp_path / "palace")
|
||||
col = get_collection(palace, create=True)
|
||||
col.upsert(
|
||||
ids=["A_README", "B_README", "narrative"],
|
||||
documents=[
|
||||
# Both README files share the basename README.md but live
|
||||
# in different directories. Each contains distinctive
|
||||
# terminology a query might surface via BM25.
|
||||
"PROJECT ALPHA: configuration for the Frobnitz subsystem. "
|
||||
"Set FROBNITZ_TIMEOUT=30 to enable widget rotation.",
|
||||
"PROJECT BETA: configuration for the Wibble subsystem. "
|
||||
"Set WIBBLE_THRESHOLD=0.5 to enable signal smoothing.",
|
||||
"Engineers occasionally chat about how the legacy "
|
||||
"subsystems all need their config knobs tweaked.",
|
||||
],
|
||||
metadatas=[
|
||||
{"wing": "code", "room": "docs", "source_file": "alpha/README.md"},
|
||||
{"wing": "code", "room": "docs", "source_file": "beta/README.md"},
|
||||
{"wing": "code", "room": "docs", "source_file": "chat.md"},
|
||||
],
|
||||
)
|
||||
# Query that hits BM25 for BOTH READMEs (distinct vocab from each).
|
||||
# Vector-only might pick the chat doc as semantically "closest";
|
||||
# union must surface both READMEs without basename collision.
|
||||
result = search_memories(
|
||||
"FROBNITZ_TIMEOUT WIBBLE_THRESHOLD configuration",
|
||||
palace,
|
||||
n_results=5,
|
||||
candidate_strategy="union",
|
||||
)
|
||||
sources = [h["source_file"] for h in result["results"]]
|
||||
readme_count = sum(1 for s in sources if s == "README.md")
|
||||
assert readme_count >= 2, (
|
||||
f"union must surface both README.md files from different dirs "
|
||||
f"(basename collision would drop one); got sources={sources}"
|
||||
)
|
||||
|
||||
|
||||
class TestHybridRankTolerantOfMissingDistance:
|
||||
"""``_hybrid_rank`` accepts ``distance=None`` — required for BM25-only
|
||||
|
||||
Reference in New Issue
Block a user