From 3eb7980e5540cb68a756bfb582cf4cccbf1875c8 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Sun, 3 May 2026 06:09:10 -0300 Subject: [PATCH] fix(searcher): address Copilot review on #1306 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Dedup union candidates by (full_path, chunk_index), not basename — two files sharing a basename in different dirs no longer collide, and a vector hit on chunk N of a file no longer blocks BM25 from contributing chunk M of the same file. - Validate candidate_strategy at the top of search_memories so invalid values fail consistently, not only when the call routes through the vector path. - Trim hits back to n_results after the union+rerank pool grows; preserves the existing search_memories size contract that the MCP limit parameter is built on. - Skip BM25-only injection when max_distance > 0.0; BM25-only candidates carry distance=None and would silently bypass the caller's strict vector-distance threshold. Adds 4 tests covering: validation under vector_disabled, n_results trim, max_distance honoring, and basename-collision dedup. --- mempalace/searcher.py | 114 ++++++++++++++++++++++----- tests/test_hybrid_candidate_union.py | 101 ++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 18 deletions(-) diff --git a/mempalace/searcher.py b/mempalace/searcher.py index 7a46158..d615623 100644 --- a/mempalace/searcher.py +++ b/mempalace/searcher.py @@ -381,6 +381,7 @@ def _bm25_only_via_sqlite( room: str = None, n_results: int = 5, max_candidates: int = 500, + _include_internal: bool = False, ) -> dict: """BM25-only search reading drawers directly from chroma.sqlite3. @@ -518,17 +519,25 @@ def _bm25_only_via_sqlite( continue if room and meta.get("room") != room: continue + full_source = meta.get("source_file", "") or "" candidates.append( { "text": d["text"], "wing": meta.get("wing", "unknown"), "room": meta.get("room", "unknown"), - "source_file": Path(meta.get("source_file", "?") or "?").name, + "source_file": Path(full_source).name if full_source else "?", "created_at": meta.get("filed_at", "unknown"), # No vector distance available in BM25-only mode. "similarity": None, "distance": None, "matched_via": "bm25_sqlite", + # Internal: full path + chunk_index let callers (notably + # candidate_strategy="union") dedupe at chunk granularity + # rather than basename — two files in different directories + # may share a basename, and one source_file is split across + # multiple chunks. Stripped before this helper returns. + "_source_file_full": full_source, + "_chunk_index": meta.get("chunk_index"), } ) @@ -543,6 +552,12 @@ def _bm25_only_via_sqlite( hits = candidates[:n_results] for h in hits: h.pop("_score", None) + # Strip internal fields by default so the public BM25-only fallback + # response stays clean. Callers that need chunk-precise dedup + # (notably the union-merge path) opt in via _include_internal. + if not _include_internal: + h.pop("_source_file_full", None) + h.pop("_chunk_index", None) return { "query": query, @@ -561,17 +576,33 @@ def _merge_bm25_union_candidates( wing: str, room: str, n_results: int, + max_distance: float = 0.0, ) -> None: """Append top-K BM25-only candidates from sqlite into ``hits`` in place. Used by ``search_memories(..., candidate_strategy="union")`` to widen the rerank pool's *source* (not just its size) — vector-only candidate selection skips docs whose embeddings are far from the query even when - BM25 signal is strong. We dedupe against existing hits by ``source_file`` - so vector-side entries (which carry real distance values) win on - collisions; BM25-only additions are marked with ``distance=None`` so - ``_hybrid_rank`` scores them on BM25 contribution alone. + BM25 signal is strong. + + Dedup is chunk-precise: the key is ``(_source_file_full, _chunk_index)`` + so two files sharing a basename in different directories don't collide, + and a vector hit on chunk N of a file doesn't block BM25 from + contributing chunk M of the same file. Falls back to ``source_file`` + only when full-path/chunk metadata is absent. + + BM25-only additions carry ``distance=None`` so ``_hybrid_rank`` scores + them on BM25 contribution alone. + + When ``max_distance > 0.0`` (a strict vector-distance threshold is + set), BM25-only candidates are skipped entirely — they have no vector + distance to satisfy the threshold, and silently injecting them would + break the existing ``max_distance`` guarantee that hybrid results lie + within the requested vector-distance bound. """ + if max_distance > 0.0: + return + try: bm25_extra = _bm25_only_via_sqlite( query, @@ -579,21 +610,32 @@ def _merge_bm25_union_candidates( wing=wing, room=room, n_results=n_results * 3, + _include_internal=True, ).get("results", []) except Exception: logger.debug("candidate_strategy=union: BM25 fetch failed", exc_info=True) return - seen_sources = {h.get("source_file") for h in hits} + def _dedup_key(entry: dict): + full = entry.get("_source_file_full") + ci = entry.get("_chunk_index") + if full and ci is not None: + return (full, ci) + # Fall back to basename only when richer metadata is missing — + # avoids silently dropping candidates on legacy data while still + # giving chunk-precise dedup whenever the metadata is present. + return entry.get("source_file") + + seen = {_dedup_key(h) for h in hits} for bh in bm25_extra: - key = bh.get("source_file") - if not key or key == "?" or key in seen_sources: + key = _dedup_key(bh) + if not key or key == "?" or key in seen: continue bh["distance"] = None bh["effective_distance"] = None bh["closet_boost"] = 0.0 hits.append(bh) - seen_sources.add(key) + seen.add(key) # Strategy dispatch — keeps search_memories' branch count under the @@ -605,6 +647,19 @@ _CANDIDATE_MERGERS = { } +def _validate_candidate_strategy(strategy: str) -> None: + """Raise ``ValueError`` for unknown strategies. + + Called eagerly at the top of ``search_memories`` so invalid values + fail consistently regardless of whether the call routes through the + vector path, the BM25-only fallback, or returns an early error dict. + """ + if strategy not in _CANDIDATE_MERGERS: + raise ValueError( + f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, got {strategy!r}" + ) + + def _apply_candidate_strategy( strategy: str, hits: list, @@ -613,18 +668,16 @@ def _apply_candidate_strategy( wing: str, room: str, n_results: int, + max_distance: float = 0.0, ) -> None: """Dispatch to the registered merger for ``strategy``. - Raises ``ValueError`` for unknown strategies. ``"vector"`` is a no-op. + Strategy validity is assumed (``_validate_candidate_strategy`` runs + earlier); ``"vector"`` is a no-op. """ - if strategy not in _CANDIDATE_MERGERS: - raise ValueError( - f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, " f"got {strategy!r}" - ) merger = _CANDIDATE_MERGERS[strategy] if merger is not None: - merger(hits, query, palace_path, wing, room, n_results) + merger(hits, query, palace_path, wing, room, n_results, max_distance=max_distance) def search_memories( @@ -669,7 +722,16 @@ def search_memories( by scenario descriptions). Adds one sqlite open + FTS5 MATCH per query; perf cost is small but unmeasured at corpus scale. Opt in until the cost is characterized. + + When ``max_distance > 0.0`` is also set, BM25-only candidates + are skipped — they have no vector distance and would silently + violate the requested distance threshold. """ + # Validate the strategy eagerly so invalid values fail the same way + # regardless of whether the call routes through the vector path or + # the BM25-only fallback below. + _validate_candidate_strategy(candidate_strategy) + if vector_disabled: return _bm25_only_via_sqlite( query, @@ -848,10 +910,26 @@ def search_memories( # Candidate strategy hook: optionally widen the rerank pool's *source* # before ranking. Default ("vector") is a no-op; "union" merges top-K # BM25 candidates from sqlite. See `_apply_candidate_strategy`. - _apply_candidate_strategy(candidate_strategy, hits, query, palace_path, wing, room, n_results) + # ``max_distance`` is forwarded so union mode can refuse to inject + # BM25-only (distance=None) candidates that would silently bypass the + # caller's strict distance threshold. + _apply_candidate_strategy( + candidate_strategy, + hits, + query, + palace_path, + wing, + room, + n_results, + max_distance=max_distance, + ) - # BM25 hybrid re-rank within the final candidate set. - hits = _hybrid_rank(hits, query) + # BM25 hybrid re-rank within the final candidate set, then trim back + # to the requested size. Without the trim, ``candidate_strategy="union"`` + # would return up to 4× ``n_results`` (vector hits + BM25 union pool), + # breaking the existing ``search_memories`` size contract that the MCP + # ``limit`` parameter is built on. + hits = _hybrid_rank(hits, query)[:n_results] for h in hits: h.pop("_sort_key", None) h.pop("_source_file_full", None) diff --git a/tests/test_hybrid_candidate_union.py b/tests/test_hybrid_candidate_union.py index 97cf4d1..feca81e 100644 --- a/tests/test_hybrid_candidate_union.py +++ b/tests/test_hybrid_candidate_union.py @@ -113,6 +113,107 @@ class TestCandidateUnion: with pytest.raises(ValueError, match="candidate_strategy"): search_memories("anything", palace, n_results=5, candidate_strategy="bogus") + def test_invalid_strategy_raises_even_when_vector_disabled(self, tmp_path): + """Validation must happen before the ``vector_disabled`` early return — + invalid values must fail consistently regardless of routing.""" + palace = str(tmp_path / "palace") + _seed_drawers(palace) + import pytest + + with pytest.raises(ValueError, match="candidate_strategy"): + search_memories( + "anything", + palace, + n_results=5, + vector_disabled=True, + candidate_strategy="bogus", + ) + + def test_union_respects_n_results_limit(self, tmp_path): + """When the merged candidate set is larger than ``n_results``, the + result must be trimmed back to the requested size — the MCP + ``limit`` contract depends on this invariant.""" + palace = str(tmp_path / "palace") + _seed_drawers(palace) + # 4-doc corpus, n_results=2 → union pool can grow to ~8 candidates, + # rerank reorders them, but final list must respect the cap. + result = search_memories(_NARRATIVE_QUERY, palace, n_results=2, candidate_strategy="union") + assert ( + len(result["results"]) <= 2 + ), f"union must trim to n_results=2; got {len(result['results'])} results" + + def test_union_skipped_when_max_distance_set(self, tmp_path): + """``max_distance`` is a vector-distance threshold; BM25-only + candidates have ``distance=None`` and cannot satisfy it. Union + must not silently inject them when a strict threshold is set, + otherwise the existing ``max_distance`` guarantee regresses.""" + palace = str(tmp_path / "palace") + _seed_drawers(palace) + # Sanity: without max_distance, union surfaces the BM25-strong doc. + unfiltered = search_memories( + _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union" + ) + assert "brand_voice_D4.md" in {h["source_file"] for h in unfiltered["results"]} + + # With a tight max_distance, union must NOT inject BM25-only hits — + # every returned hit must have a real (non-None) distance. + filtered = search_memories( + _NARRATIVE_QUERY, + palace, + n_results=5, + candidate_strategy="union", + max_distance=0.5, + ) + for h in filtered["results"]: + assert h.get("distance") is not None, ( + f"union under max_distance must not inject BM25-only " + f"(distance=None) candidates; offending hit: {h}" + ) + assert h["distance"] <= 0.5, f"hit violates max_distance=0.5: distance={h['distance']}" + + def test_union_dedup_is_chunk_precise_not_basename(self, tmp_path): + """Two files with the same basename in different directories must + not collide — union must dedup on full path (or chunk-level key), + not on basename alone. Otherwise a BM25-strong README from one + directory silently shadows a BM25-strong README from another. + """ + palace = str(tmp_path / "palace") + col = get_collection(palace, create=True) + col.upsert( + ids=["A_README", "B_README", "narrative"], + documents=[ + # Both README files share the basename README.md but live + # in different directories. Each contains distinctive + # terminology a query might surface via BM25. + "PROJECT ALPHA: configuration for the Frobnitz subsystem. " + "Set FROBNITZ_TIMEOUT=30 to enable widget rotation.", + "PROJECT BETA: configuration for the Wibble subsystem. " + "Set WIBBLE_THRESHOLD=0.5 to enable signal smoothing.", + "Engineers occasionally chat about how the legacy " + "subsystems all need their config knobs tweaked.", + ], + metadatas=[ + {"wing": "code", "room": "docs", "source_file": "alpha/README.md"}, + {"wing": "code", "room": "docs", "source_file": "beta/README.md"}, + {"wing": "code", "room": "docs", "source_file": "chat.md"}, + ], + ) + # Query that hits BM25 for BOTH READMEs (distinct vocab from each). + # Vector-only might pick the chat doc as semantically "closest"; + # union must surface both READMEs without basename collision. + result = search_memories( + "FROBNITZ_TIMEOUT WIBBLE_THRESHOLD configuration", + palace, + n_results=5, + candidate_strategy="union", + ) + sources = [h["source_file"] for h in result["results"]] + readme_count = sum(1 for s in sources if s == "README.md") + assert readme_count >= 2, ( + f"union must surface both README.md files from different dirs " + f"(basename collision would drop one); got sources={sources}" + ) + class TestHybridRankTolerantOfMissingDistance: """``_hybrid_rank`` accepts ``distance=None`` — required for BM25-only