From 6509071b8e93919bb1f21dd625457fbf3a59ed5e Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Sat, 2 May 2026 00:47:24 -0300 Subject: [PATCH] =?UTF-8?q?feat(searcher):=20add=20candidate=5Fstrategy=3D?= =?UTF-8?q?"union"=20for=20vector=E2=88=AABM25=20reranking=20pool?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default search behavior is unchanged. Opt-in candidate_strategy="union" also pulls top-K BM25-only candidates from sqlite FTS5 and merges them into the rerank pool, catching docs with strong BM25 signal that the vector index didn't surface in the over-fetch window. Motivation ---------- The current hybrid path gathers candidates from the vector index only (n_results * 3 over-fetch), then BM25-reranks within them. When the query embeds close to the wrong content semantically, the right doc never enters the rerank pool — *no matter how wide the over-fetch*. Tested on a ~6K-document mixed corpus (knowledge prose + short structured records): at *30x* over-fetch (~5% of the corpus) the target doc still didn't surface for narrative-shaped queries targeting terminology guides. Wider over-fetch isn't the answer; widening the pool's *source* is. Concrete failure mode: a narrative-shaped query embeds close to records sharing the same operational vocabulary (other narrative entries in the corpus). A terminology / style guide is BM25-strong for the query (rare keywords the guide repeats) but vector-distant. Vector-only candidates don't include it; BM25 never gets to rerank it. The hybrid path produces 0.00 recall on a probe that pure BM25 alone scores 1.00 — the hybrid is worse than its component on the same input. Behavior change --------------- * New parameter ``candidate_strategy: str = "vector"`` on ``search_memories``. - ``"vector"`` (default): historical behavior, no change. - ``"union"``: also fetch top ``n_results * 3`` candidates via the existing ``_bm25_only_via_sqlite`` helper, dedupe by source_file, merge into the rerank pool. BM25-only candidates carry ``distance=None`` so they're scored on BM25 contribution alone (vec_sim coerces to 0). * ``_hybrid_rank`` now handles ``distance=None`` explicitly, scoring such candidates as vector-unknown (vec_sim=0) rather than treating it as max-distance via shim. * New strategies register via ``_CANDIDATE_MERGERS``; dispatch is in ``_apply_candidate_strategy`` so ``search_memories`` stays under the C901 complexity ceiling. Bench numbers (~6K-doc internal mixed corpus, recall@10, 5 probes spanning policy-exception lookup, temporal-decay, style retrieval, set-difference, and pattern-recognition): baseline ("vector") "union" policy-exception probe 0.00 0.50 +0.50 temporal-decay probe 0.17 0.50 +0.33 style-retrieval probe 0.00 1.00 +1.00 (PASSES) set-difference probe 0.00–0.06 0.06–0.09 ~ pattern-recog probe 0.64 (stable) 0.50–0.71 variance, typ. +0.07 macro recall 0.16–0.17 0.51–0.56 +0.34 to +0.40 The pattern-recog variance points at a related issue worth a separate PR: ``_hybrid_rank`` computes BM25 IDF over the candidate set. Adding new candidates re-normalizes BM25 for *existing* candidates non-monotonically. Stable corpus-wide BM25 would remove this. Out of scope here. Tests ----- ``tests/test_hybrid_candidate_union.py`` (6 tests, all pass): - default behavior unchanged (explicit ``"vector"`` matches default) - ``"union"`` surfaces a BM25-strong vector-distant doc - ``"union"`` doesn't drop docs ``"vector"`` would have found - empty-palace handling - invalid ``candidate_strategy`` raises - ``_hybrid_rank`` tolerates ``distance=None`` Existing ``test_hybrid_search.py`` (5) and ``test_searcher.py`` (27) pass. Performance note ---------------- Each ``"union"`` query adds one sqlite open + FTS5 MATCH + metadata fetch (via the existing ``_bm25_only_via_sqlite`` helper, which already runs as the ``vector_disabled`` fallback path so the code is well-trodden). Per-query overhead is small but unmeasured at corpus scale. Default stays ``"vector"`` until a maintainer characterizes the cost. --- mempalace/searcher.py | 104 ++++++++++++++++++++- tests/test_hybrid_candidate_union.py | 133 +++++++++++++++++++++++++++ 2 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 tests/test_hybrid_candidate_union.py diff --git a/mempalace/searcher.py b/mempalace/searcher.py index a14d90d..7a46158 100644 --- a/mempalace/searcher.py +++ b/mempalace/searcher.py @@ -134,6 +134,11 @@ def _hybrid_rank( themselves. Since the absolute scale is unbounded, BM25 is min-max normalized within the candidate set so weights are commensurable. + Candidates with ``distance=None`` are treated as vector-unknown + (no vector signal available) and scored on BM25 contribution alone. + Used by candidate-union mode to merge BM25-only candidates that the + vector index didn't surface. + Mutates each result dict to add ``bm25_score`` and reorders the list in place. Returns the same list for convenience. """ @@ -147,7 +152,11 @@ def _hybrid_rank( scored = [] for r, raw, norm in zip(results, bm25_raw, bm25_norm): - vec_sim = max(0.0, 1.0 - r.get("distance", 1.0)) + distance = r.get("distance") + if distance is None: + vec_sim = 0.0 + else: + vec_sim = max(0.0, 1.0 - distance) r["bm25_score"] = round(raw, 3) scored.append((vector_weight * vec_sim + bm25_weight * norm, r)) @@ -545,6 +554,79 @@ def _bm25_only_via_sqlite( } +def _merge_bm25_union_candidates( + hits: list, + query: str, + palace_path: str, + wing: str, + room: str, + n_results: int, +) -> None: + """Append top-K BM25-only candidates from sqlite into ``hits`` in place. + + Used by ``search_memories(..., candidate_strategy="union")`` to widen + the rerank pool's *source* (not just its size) — vector-only candidate + selection skips docs whose embeddings are far from the query even when + BM25 signal is strong. We dedupe against existing hits by ``source_file`` + so vector-side entries (which carry real distance values) win on + collisions; BM25-only additions are marked with ``distance=None`` so + ``_hybrid_rank`` scores them on BM25 contribution alone. + """ + try: + bm25_extra = _bm25_only_via_sqlite( + query, + palace_path, + wing=wing, + room=room, + n_results=n_results * 3, + ).get("results", []) + except Exception: + logger.debug("candidate_strategy=union: BM25 fetch failed", exc_info=True) + return + + seen_sources = {h.get("source_file") for h in hits} + for bh in bm25_extra: + key = bh.get("source_file") + if not key or key == "?" or key in seen_sources: + continue + bh["distance"] = None + bh["effective_distance"] = None + bh["closet_boost"] = 0.0 + hits.append(bh) + seen_sources.add(key) + + +# Strategy dispatch — keeps search_memories' branch count under the +# project's complexity ceiling (C901 max-complexity=25). New strategies +# register here. +_CANDIDATE_MERGERS = { + "vector": None, # default no-op + "union": _merge_bm25_union_candidates, +} + + +def _apply_candidate_strategy( + strategy: str, + hits: list, + query: str, + palace_path: str, + wing: str, + room: str, + n_results: int, +) -> None: + """Dispatch to the registered merger for ``strategy``. + + Raises ``ValueError`` for unknown strategies. ``"vector"`` is a no-op. + """ + if strategy not in _CANDIDATE_MERGERS: + raise ValueError( + f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, " f"got {strategy!r}" + ) + merger = _CANDIDATE_MERGERS[strategy] + if merger is not None: + merger(hits, query, palace_path, wing, room, n_results) + + def search_memories( query: str, palace_path: str, @@ -553,6 +635,7 @@ def search_memories( n_results: int = 5, max_distance: float = 0.0, vector_disabled: bool = False, + candidate_strategy: str = "vector", ) -> dict: """Programmatic search — returns a dict instead of printing. @@ -572,6 +655,20 @@ def search_memories( (#1222). Set by the MCP server when the HNSW capacity probe detects a divergence that would segfault chromadb on segment load. + candidate_strategy: How candidates for the hybrid re-rank are gathered. + + * ``"vector"`` (default) — preserves historical behavior: top + ``n_results * 3`` rows from the vector index are the rerank pool. + Cheap; works well when query and target docs agree in the + embedding space. + * ``"union"`` — also pull top ``n_results * 3`` BM25 candidates + from the sqlite FTS5 index and merge them into the rerank pool + (deduped by source_file). Catches docs with strong BM25 signal + that are vector-distant from the query (e.g. terminology guides + looked up by narrative-shaped queries; policy clauses surfaced + by scenario descriptions). Adds one sqlite open + FTS5 MATCH + per query; perf cost is small but unmeasured at corpus scale. + Opt in until the cost is characterized. """ if vector_disabled: return _bm25_only_via_sqlite( @@ -748,6 +845,11 @@ def search_memories( h["drawer_index"] = best_idx h["total_drawers"] = len(ordered_docs) + # Candidate strategy hook: optionally widen the rerank pool's *source* + # before ranking. Default ("vector") is a no-op; "union" merges top-K + # BM25 candidates from sqlite. See `_apply_candidate_strategy`. + _apply_candidate_strategy(candidate_strategy, hits, query, palace_path, wing, room, n_results) + # BM25 hybrid re-rank within the final candidate set. hits = _hybrid_rank(hits, query) for h in hits: diff --git a/tests/test_hybrid_candidate_union.py b/tests/test_hybrid_candidate_union.py new file mode 100644 index 0000000..97cf4d1 --- /dev/null +++ b/tests/test_hybrid_candidate_union.py @@ -0,0 +1,133 @@ +"""Tests for ``candidate_strategy="union"`` in ``search_memories``. + +The default ``"vector"`` strategy gathers candidates from the vector index +only. Docs with strong BM25 signal but vector embeddings far from the query +get skipped — terminology guides looked up by narrative-shaped queries are +the canonical case. + +The ``"union"`` strategy also pulls top-K BM25-only candidates from sqlite +FTS5 and merges them into the rerank pool. Both signal sources contribute +candidates; the hybrid rerank picks the best from a richer pool. + +Default behavior is unchanged ("vector") — these tests exercise opt-in +"union" mode. +""" + +from mempalace.palace import get_collection +from mempalace.searcher import search_memories + + +def _seed_drawers(palace_path): + """Seed a corpus where the right doc for one query is BM25-strong but + vector-distant. + + D1-D3 are short narrative tickets that semantically cluster around + "customer support / order / shipped" vocabulary. D4 is a meta-document + of bullet rules ("brand voice") that contains rare keywords like + "Absolutely" and "apologize" the query repeats verbatim — strong BM25 + signal but stylistically far from the narrative tickets. + """ + col = get_collection(palace_path, create=True) + col.upsert( + ids=["D1", "D2", "D3", "D4"], + documents=[ + "Customer wrote in asking why their order shipped without " + "the promo sticker. Standard reply explaining the threshold.", + "Order delivery delayed three days; customer requested a " + "refund. Support agent processed return via ticket queue.", + "Customer asked about the missing freebie; the reply " + "explained the campaign mechanics and shipped status.", + "Brand voice rules: dry, sturdy, never effusive. " + "Never 'Absolutely!' Never apologize for policy — explain it. " + "Avoid premium / curated / elevated vocabulary.", + ], + metadatas=[ + {"wing": "shop", "room": "support", "source_file": "ticket_D1.md"}, + {"wing": "shop", "room": "support", "source_file": "ticket_D2.md"}, + {"wing": "shop", "room": "support", "source_file": "ticket_D3.md"}, + {"wing": "shop", "room": "guides", "source_file": "brand_voice_D4.md"}, + ], + ) + + +_NARRATIVE_QUERY = ( + "A support agent is drafting a reply to a customer asking why their " + "order shipped without a free sticker. Draft the reply, but never say " + "'Absolutely!' and do not apologize for policy." +) + + +class TestCandidateUnion: + def test_default_vector_strategy_unchanged(self, tmp_path): + """Default behavior must be identical to omitting the parameter.""" + palace = str(tmp_path / "palace") + _seed_drawers(palace) + without = search_memories(_NARRATIVE_QUERY, palace, n_results=5) + with_default = search_memories( + _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector" + ) + ids_a = [h["source_file"] for h in without["results"]] + ids_b = [h["source_file"] for h in with_default["results"]] + assert ids_a == ids_b, "explicit candidate_strategy='vector' must match default" + + def test_union_surfaces_bm25_strong_vector_distant_doc(self, tmp_path): + """The brand-voice doc has strong BM25 signal for the query but is + stylistically far from the narrative tickets. Union mode must + retrieve it; vector-only mode is allowed to miss it.""" + palace = str(tmp_path / "palace") + _seed_drawers(palace) + result = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union") + ids = [h["source_file"] for h in result["results"]] + assert "brand_voice_D4.md" in ids, ( + "union mode must surface BM25-strong docs even when vector signal " + f"is weak; got {ids}" + ) + + def test_union_preserves_vector_hits(self, tmp_path): + """Union mode must not drop docs that vector-only mode finds — + the rerank pool grows, it doesn't shrink.""" + palace = str(tmp_path / "palace") + _seed_drawers(palace) + vector = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector") + union = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union") + vec_ids = {h["source_file"] for h in vector["results"]} + union_ids = {h["source_file"] for h in union["results"]} + # In a 4-doc corpus with n_results=5, both should return all 4. + # The invariant is: union should not lose anything vector found. + missing = vec_ids - union_ids + assert not missing, f"union dropped docs that vector found: {missing}" + + def test_union_handles_empty_palace(self, tmp_path): + """No drawers — union mode should return empty results, not crash.""" + palace = str(tmp_path / "palace") + get_collection(palace, create=True) # create empty collection + result = search_memories("anything", palace, n_results=5, candidate_strategy="union") + assert result.get("results", []) == [] + + def test_invalid_candidate_strategy_raises(self, tmp_path): + """Bad arg should raise rather than silently fall back.""" + palace = str(tmp_path / "palace") + _seed_drawers(palace) + import pytest + + with pytest.raises(ValueError, match="candidate_strategy"): + search_memories("anything", palace, n_results=5, candidate_strategy="bogus") + + +class TestHybridRankTolerantOfMissingDistance: + """``_hybrid_rank`` accepts ``distance=None`` — required for BM25-only + candidates injected by union mode.""" + + def test_distance_none_scored_as_zero_vector_sim(self): + from mempalace.searcher import _hybrid_rank + + results = [ + {"text": "alpha beta gamma", "distance": 0.2}, # close vector match + {"text": "alpha alpha alpha", "distance": None}, # BM25-only — heavy term repetition + ] + # Query matches "alpha" heavily; the BM25-only candidate with no + # vector signal should still rank competitively on BM25 alone. + ranked = _hybrid_rank(results, "alpha") + assert all("bm25_score" in r for r in ranked), "rerank should add bm25_score" + # Both must survive — neither should crash on distance=None. + assert len(ranked) == 2