From 6509071b8e93919bb1f21dd625457fbf3a59ed5e Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 00:47:24 -0300
Subject: [PATCH] =?UTF-8?q?feat(searcher):=20add=20candidate=5Fstrategy=3D?=
 =?UTF-8?q?"union"=20for=20vector=E2=88=AABM25=20reranking=20pool?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Default search behavior is unchanged. Opt-in candidate_strategy="union"
also pulls top-K BM25-only candidates from sqlite FTS5 and merges them
into the rerank pool, catching docs with strong BM25 signal that the
vector index didn't surface in the over-fetch window.

Motivation
----------
The current hybrid path gathers candidates from the vector index only
(n_results * 3 over-fetch), then BM25-reranks within them. When the
query embeds close to the wrong content semantically, the right doc
never enters the rerank pool — *no matter how wide the over-fetch*.
Tested on a ~6K-document mixed corpus (knowledge prose + short structured
records): at *30x* over-fetch (~5% of the corpus) the target doc still
didn't surface for narrative-shaped queries targeting terminology guides.
Wider over-fetch isn't the answer; widening the pool's *source* is.

Concrete failure mode: a narrative-shaped query embeds close to records
sharing the same operational vocabulary (other narrative entries in the
corpus). A terminology / style guide is BM25-strong for the query
(rare keywords the guide repeats) but vector-distant. Vector-only
candidates don't include it; BM25 never gets to rerank it. The hybrid
path produces 0.00 recall on a probe that pure BM25 alone scores 1.00 —
the hybrid is worse than its component on the same input.

Behavior change
---------------
* New parameter ``candidate_strategy: str = "vector"`` on ``search_memories``.
  - ``"vector"`` (default): historical behavior, no change.
  - ``"union"``: also fetch top ``n_results * 3`` candidates via the
    existing ``_bm25_only_via_sqlite`` helper, dedupe by source_file,
    merge into the rerank pool. BM25-only candidates carry
    ``distance=None`` so they're scored on BM25 contribution alone
    (vec_sim coerces to 0).
* ``_hybrid_rank`` now handles ``distance=None`` explicitly, scoring
  such candidates as vector-unknown (vec_sim=0) rather than treating
  it as max-distance via shim.
* New strategies register via ``_CANDIDATE_MERGERS``; dispatch is in
  ``_apply_candidate_strategy`` so ``search_memories`` stays under the
  C901 complexity ceiling.

Bench numbers (~6K-doc internal mixed corpus, recall@10, 5 probes spanning
policy-exception lookup, temporal-decay, style retrieval, set-difference,
and pattern-recognition):

                              baseline ("vector")   "union"
  policy-exception probe        0.00                  0.50    +0.50
  temporal-decay probe          0.17                  0.50    +0.33
  style-retrieval probe         0.00                  1.00    +1.00 (PASSES)
  set-difference probe          0.00–0.06             0.06–0.09  ~
  pattern-recog probe           0.64 (stable)         0.50–0.71  variance, typ. +0.07
  macro recall                  0.16–0.17             0.51–0.56  +0.34 to +0.40

The pattern-recog variance points at a related issue worth a separate PR:
``_hybrid_rank`` computes BM25 IDF over the candidate set. Adding new
candidates re-normalizes BM25 for *existing* candidates non-monotonically.
Stable corpus-wide BM25 would remove this. Out of scope here.

Tests
-----
``tests/test_hybrid_candidate_union.py`` (6 tests, all pass):
- default behavior unchanged (explicit ``"vector"`` matches default)
- ``"union"`` surfaces a BM25-strong vector-distant doc
- ``"union"`` doesn't drop docs ``"vector"`` would have found
- empty-palace handling
- invalid ``candidate_strategy`` raises
- ``_hybrid_rank`` tolerates ``distance=None``

Existing ``test_hybrid_search.py`` (5) and ``test_searcher.py`` (27) pass.

Performance note
----------------
Each ``"union"`` query adds one sqlite open + FTS5 MATCH + metadata
fetch (via the existing ``_bm25_only_via_sqlite`` helper, which already
runs as the ``vector_disabled`` fallback path so the code is
well-trodden). Per-query overhead is small but unmeasured at corpus
scale. Default stays ``"vector"`` until a maintainer characterizes the
cost.
---
 mempalace/searcher.py                | 104 ++++++++++++++++++++-
 tests/test_hybrid_candidate_union.py | 133 +++++++++++++++++++++++++++
 2 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_hybrid_candidate_union.py

diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index a14d90d..7a46158 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -134,6 +134,11 @@ def _hybrid_rank(
       themselves. Since the absolute scale is unbounded, BM25 is min-max
       normalized within the candidate set so weights are commensurable.
 
+    Candidates with ``distance=None`` are treated as vector-unknown
+    (no vector signal available) and scored on BM25 contribution alone.
+    Used by candidate-union mode to merge BM25-only candidates that the
+    vector index didn't surface.
+
     Mutates each result dict to add ``bm25_score`` and reorders the list
     in place. Returns the same list for convenience.
     """
@@ -147,7 +152,11 @@ def _hybrid_rank(
 
     scored = []
     for r, raw, norm in zip(results, bm25_raw, bm25_norm):
-        vec_sim = max(0.0, 1.0 - r.get("distance", 1.0))
+        distance = r.get("distance")
+        if distance is None:
+            vec_sim = 0.0
+        else:
+            vec_sim = max(0.0, 1.0 - distance)
         r["bm25_score"] = round(raw, 3)
         scored.append((vector_weight * vec_sim + bm25_weight * norm, r))
 
@@ -545,6 +554,79 @@ def _bm25_only_via_sqlite(
     }
 
 
+def _merge_bm25_union_candidates(
+    hits: list,
+    query: str,
+    palace_path: str,
+    wing: str,
+    room: str,
+    n_results: int,
+) -> None:
+    """Append top-K BM25-only candidates from sqlite into ``hits`` in place.
+
+    Used by ``search_memories(..., candidate_strategy="union")`` to widen
+    the rerank pool's *source* (not just its size) — vector-only candidate
+    selection skips docs whose embeddings are far from the query even when
+    BM25 signal is strong. We dedupe against existing hits by ``source_file``
+    so vector-side entries (which carry real distance values) win on
+    collisions; BM25-only additions are marked with ``distance=None`` so
+    ``_hybrid_rank`` scores them on BM25 contribution alone.
+    """
+    try:
+        bm25_extra = _bm25_only_via_sqlite(
+            query,
+            palace_path,
+            wing=wing,
+            room=room,
+            n_results=n_results * 3,
+        ).get("results", [])
+    except Exception:
+        logger.debug("candidate_strategy=union: BM25 fetch failed", exc_info=True)
+        return
+
+    seen_sources = {h.get("source_file") for h in hits}
+    for bh in bm25_extra:
+        key = bh.get("source_file")
+        if not key or key == "?" or key in seen_sources:
+            continue
+        bh["distance"] = None
+        bh["effective_distance"] = None
+        bh["closet_boost"] = 0.0
+        hits.append(bh)
+        seen_sources.add(key)
+
+
+# Strategy dispatch — keeps search_memories' branch count under the
+# project's complexity ceiling (C901 max-complexity=25). New strategies
+# register here.
+_CANDIDATE_MERGERS = {
+    "vector": None,  # default no-op
+    "union": _merge_bm25_union_candidates,
+}
+
+
+def _apply_candidate_strategy(
+    strategy: str,
+    hits: list,
+    query: str,
+    palace_path: str,
+    wing: str,
+    room: str,
+    n_results: int,
+) -> None:
+    """Dispatch to the registered merger for ``strategy``.
+
+    Raises ``ValueError`` for unknown strategies. ``"vector"`` is a no-op.
+    """
+    if strategy not in _CANDIDATE_MERGERS:
+        raise ValueError(
+            f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, " f"got {strategy!r}"
+        )
+    merger = _CANDIDATE_MERGERS[strategy]
+    if merger is not None:
+        merger(hits, query, palace_path, wing, room, n_results)
+
+
 def search_memories(
     query: str,
     palace_path: str,
@@ -553,6 +635,7 @@ def search_memories(
     n_results: int = 5,
     max_distance: float = 0.0,
     vector_disabled: bool = False,
+    candidate_strategy: str = "vector",
 ) -> dict:
     """Programmatic search — returns a dict instead of printing.
 
@@ -572,6 +655,20 @@ def search_memories(
             (#1222). Set by the MCP server when the HNSW capacity probe
             detects a divergence that would segfault chromadb on segment
             load.
+        candidate_strategy: How candidates for the hybrid re-rank are gathered.
+
+            * ``"vector"`` (default) — preserves historical behavior: top
+              ``n_results * 3`` rows from the vector index are the rerank pool.
+              Cheap; works well when query and target docs agree in the
+              embedding space.
+            * ``"union"`` — also pull top ``n_results * 3`` BM25 candidates
+              from the sqlite FTS5 index and merge them into the rerank pool
+              (deduped by source_file). Catches docs with strong BM25 signal
+              that are vector-distant from the query (e.g. terminology guides
+              looked up by narrative-shaped queries; policy clauses surfaced
+              by scenario descriptions). Adds one sqlite open + FTS5 MATCH
+              per query; perf cost is small but unmeasured at corpus scale.
+              Opt in until the cost is characterized.
     """
     if vector_disabled:
         return _bm25_only_via_sqlite(
@@ -748,6 +845,11 @@ def search_memories(
         h["drawer_index"] = best_idx
         h["total_drawers"] = len(ordered_docs)
 
+    # Candidate strategy hook: optionally widen the rerank pool's *source*
+    # before ranking. Default ("vector") is a no-op; "union" merges top-K
+    # BM25 candidates from sqlite. See `_apply_candidate_strategy`.
+    _apply_candidate_strategy(candidate_strategy, hits, query, palace_path, wing, room, n_results)
+
     # BM25 hybrid re-rank within the final candidate set.
     hits = _hybrid_rank(hits, query)
     for h in hits:
diff --git a/tests/test_hybrid_candidate_union.py b/tests/test_hybrid_candidate_union.py
new file mode 100644
index 0000000..97cf4d1
--- /dev/null
+++ b/tests/test_hybrid_candidate_union.py
@@ -0,0 +1,133 @@
+"""Tests for ``candidate_strategy="union"`` in ``search_memories``.
+
+The default ``"vector"`` strategy gathers candidates from the vector index
+only. Docs with strong BM25 signal but vector embeddings far from the query
+get skipped — terminology guides looked up by narrative-shaped queries are
+the canonical case.
+
+The ``"union"`` strategy also pulls top-K BM25-only candidates from sqlite
+FTS5 and merges them into the rerank pool. Both signal sources contribute
+candidates; the hybrid rerank picks the best from a richer pool.
+
+Default behavior is unchanged ("vector") — these tests exercise opt-in
+"union" mode.
+"""
+
+from mempalace.palace import get_collection
+from mempalace.searcher import search_memories
+
+
+def _seed_drawers(palace_path):
+    """Seed a corpus where the right doc for one query is BM25-strong but
+    vector-distant.
+
+    D1-D3 are short narrative tickets that semantically cluster around
+    "customer support / order / shipped" vocabulary. D4 is a meta-document
+    of bullet rules ("brand voice") that contains rare keywords like
+    "Absolutely" and "apologize" the query repeats verbatim — strong BM25
+    signal but stylistically far from the narrative tickets.
+    """
+    col = get_collection(palace_path, create=True)
+    col.upsert(
+        ids=["D1", "D2", "D3", "D4"],
+        documents=[
+            "Customer wrote in asking why their order shipped without "
+            "the promo sticker. Standard reply explaining the threshold.",
+            "Order delivery delayed three days; customer requested a "
+            "refund. Support agent processed return via ticket queue.",
+            "Customer asked about the missing freebie; the reply "
+            "explained the campaign mechanics and shipped status.",
+            "Brand voice rules: dry, sturdy, never effusive. "
+            "Never 'Absolutely!' Never apologize for policy — explain it. "
+            "Avoid premium / curated / elevated vocabulary.",
+        ],
+        metadatas=[
+            {"wing": "shop", "room": "support", "source_file": "ticket_D1.md"},
+            {"wing": "shop", "room": "support", "source_file": "ticket_D2.md"},
+            {"wing": "shop", "room": "support", "source_file": "ticket_D3.md"},
+            {"wing": "shop", "room": "guides", "source_file": "brand_voice_D4.md"},
+        ],
+    )
+
+
+_NARRATIVE_QUERY = (
+    "A support agent is drafting a reply to a customer asking why their "
+    "order shipped without a free sticker. Draft the reply, but never say "
+    "'Absolutely!' and do not apologize for policy."
+)
+
+
+class TestCandidateUnion:
+    def test_default_vector_strategy_unchanged(self, tmp_path):
+        """Default behavior must be identical to omitting the parameter."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        without = search_memories(_NARRATIVE_QUERY, palace, n_results=5)
+        with_default = search_memories(
+            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector"
+        )
+        ids_a = [h["source_file"] for h in without["results"]]
+        ids_b = [h["source_file"] for h in with_default["results"]]
+        assert ids_a == ids_b, "explicit candidate_strategy='vector' must match default"
+
+    def test_union_surfaces_bm25_strong_vector_distant_doc(self, tmp_path):
+        """The brand-voice doc has strong BM25 signal for the query but is
+        stylistically far from the narrative tickets. Union mode must
+        retrieve it; vector-only mode is allowed to miss it."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        result = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union")
+        ids = [h["source_file"] for h in result["results"]]
+        assert "brand_voice_D4.md" in ids, (
+            "union mode must surface BM25-strong docs even when vector signal "
+            f"is weak; got {ids}"
+        )
+
+    def test_union_preserves_vector_hits(self, tmp_path):
+        """Union mode must not drop docs that vector-only mode finds —
+        the rerank pool grows, it doesn't shrink."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        vector = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector")
+        union = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union")
+        vec_ids = {h["source_file"] for h in vector["results"]}
+        union_ids = {h["source_file"] for h in union["results"]}
+        # In a 4-doc corpus with n_results=5, both should return all 4.
+        # The invariant is: union should not lose anything vector found.
+        missing = vec_ids - union_ids
+        assert not missing, f"union dropped docs that vector found: {missing}"
+
+    def test_union_handles_empty_palace(self, tmp_path):
+        """No drawers — union mode should return empty results, not crash."""
+        palace = str(tmp_path / "palace")
+        get_collection(palace, create=True)  # create empty collection
+        result = search_memories("anything", palace, n_results=5, candidate_strategy="union")
+        assert result.get("results", []) == []
+
+    def test_invalid_candidate_strategy_raises(self, tmp_path):
+        """Bad arg should raise rather than silently fall back."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        import pytest
+
+        with pytest.raises(ValueError, match="candidate_strategy"):
+            search_memories("anything", palace, n_results=5, candidate_strategy="bogus")
+
+
+class TestHybridRankTolerantOfMissingDistance:
+    """``_hybrid_rank`` accepts ``distance=None`` — required for BM25-only
+    candidates injected by union mode."""
+
+    def test_distance_none_scored_as_zero_vector_sim(self):
+        from mempalace.searcher import _hybrid_rank
+
+        results = [
+            {"text": "alpha beta gamma", "distance": 0.2},  # close vector match
+            {"text": "alpha alpha alpha", "distance": None},  # BM25-only — heavy term repetition
+        ]
+        # Query matches "alpha" heavily; the BM25-only candidate with no
+        # vector signal should still rank competitively on BM25 alone.
+        ranked = _hybrid_rank(results, "alpha")
+        assert all("bm25_score" in r for r in ranked), "rerank should add bm25_score"
+        # Both must survive — neither should crash on distance=None.
+        assert len(ranked) == 2