Merge pull request #1306 from MemPalace/feat/hybrid-candidate-union

feat(searcher): candidate_strategy="union" — BM25 candidates joined with vector pool before hybrid rerank
2026-05-03 03:40:51 -03:00
parent 3e6f6480c0 3eb7980e55
commit 2ad379b547
2 changed files with 418 additions and 4 deletions
@@ -134,6 +134,11 @@ def _hybrid_rank(
      themselves. Since the absolute scale is unbounded, BM25 is min-max
      normalized within the candidate set so weights are commensurable.

+    Candidates with ``distance=None`` are treated as vector-unknown
+    (no vector signal available) and scored on BM25 contribution alone.
+    Used by candidate-union mode to merge BM25-only candidates that the
+    vector index didn't surface.
+
    Mutates each result dict to add ``bm25_score`` and reorders the list
    in place. Returns the same list for convenience.
    """
@@ -147,7 +152,11 @@ def _hybrid_rank(

    scored = []
    for r, raw, norm in zip(results, bm25_raw, bm25_norm):
-        vec_sim = max(0.0, 1.0 - r.get("distance", 1.0))
+        distance = r.get("distance")
+        if distance is None:
+            vec_sim = 0.0
+        else:
+            vec_sim = max(0.0, 1.0 - distance)
        r["bm25_score"] = round(raw, 3)
        scored.append((vector_weight * vec_sim + bm25_weight * norm, r))

@@ -372,6 +381,7 @@ def _bm25_only_via_sqlite(
    room: str = None,
    n_results: int = 5,
    max_candidates: int = 500,
+    _include_internal: bool = False,
 ) -> dict:
    """BM25-only search reading drawers directly from chroma.sqlite3.

@@ -509,17 +519,25 @@ def _bm25_only_via_sqlite(
            continue
        if room and meta.get("room") != room:
            continue
+        full_source = meta.get("source_file", "") or ""
        candidates.append(
            {
                "text": d["text"],
                "wing": meta.get("wing", "unknown"),
                "room": meta.get("room", "unknown"),
-                "source_file": Path(meta.get("source_file", "?") or "?").name,
+                "source_file": Path(full_source).name if full_source else "?",
                "created_at": meta.get("filed_at", "unknown"),
                # No vector distance available in BM25-only mode.
                "similarity": None,
                "distance": None,
                "matched_via": "bm25_sqlite",
+                # Internal: full path + chunk_index let callers (notably
+                # candidate_strategy="union") dedupe at chunk granularity
+                # rather than basename — two files in different directories
+                # may share a basename, and one source_file is split across
+                # multiple chunks. Stripped on return unless _include_internal.
+                "_source_file_full": full_source,
+                "_chunk_index": meta.get("chunk_index"),
            }
        )

@@ -534,6 +552,12 @@ def _bm25_only_via_sqlite(
    hits = candidates[:n_results]
    for h in hits:
        h.pop("_score", None)
+        # Strip internal fields by default so the public BM25-only fallback
+        # response stays clean. Callers that need chunk-precise dedup
+        # (notably the union-merge path) opt in via _include_internal.
+        if not _include_internal:
+            h.pop("_source_file_full", None)
+            h.pop("_chunk_index", None)

    return {
        "query": query,
@@ -545,6 +569,117 @@ def _bm25_only_via_sqlite(
    }


+def _merge_bm25_union_candidates(
+    hits: list,
+    query: str,
+    palace_path: str,
+    wing: str,
+    room: str,
+    n_results: int,
+    max_distance: float = 0.0,
+) -> None:
+    """Append top-K BM25-only candidates from sqlite into ``hits`` in place.
+
+    Used by ``search_memories(..., candidate_strategy="union")`` to widen
+    the rerank pool's *source* (not just its size) — vector-only candidate
+    selection skips docs whose embeddings are far from the query even when
+    BM25 signal is strong.
+
+    Dedup is chunk-precise: the key is ``(_source_file_full, _chunk_index)``
+    so two files sharing a basename in different directories don't collide,
+    and a vector hit on chunk N of a file doesn't block BM25 from
+    contributing chunk M of the same file. Falls back to ``source_file``
+    only when full-path/chunk metadata is absent.
+
+    BM25-only additions carry ``distance=None`` so ``_hybrid_rank`` scores
+    them on BM25 contribution alone.
+
+    When ``max_distance > 0.0`` (a strict vector-distance threshold is
+    set), BM25-only candidates are skipped entirely — they have no vector
+    distance to satisfy the threshold, and silently injecting them would
+    break the existing ``max_distance`` guarantee that hybrid results lie
+    within the requested vector-distance bound.
+    """
+    if max_distance > 0.0:
+        return
+
+    try:
+        bm25_extra = _bm25_only_via_sqlite(
+            query,
+            palace_path,
+            wing=wing,
+            room=room,
+            n_results=n_results * 3,
+            _include_internal=True,
+        ).get("results", [])
+    except Exception:
+        logger.debug("candidate_strategy=union: BM25 fetch failed", exc_info=True)
+        return
+
+    def _dedup_key(entry: dict):
+        full = entry.get("_source_file_full")
+        ci = entry.get("_chunk_index")
+        if full and ci is not None:
+            return (full, ci)
+        # Fall back to basename only when richer metadata is missing —
+        # avoids silently dropping candidates on legacy data while still
+        # giving chunk-precise dedup whenever the metadata is present.
+        return entry.get("source_file")
+
+    seen = {_dedup_key(h) for h in hits}
+    for bh in bm25_extra:
+        key = _dedup_key(bh)
+        if not key or key == "?" or key in seen:
+            continue
+        bh["distance"] = None
+        bh["effective_distance"] = None
+        bh["closet_boost"] = 0.0
+        hits.append(bh)
+        seen.add(key)
+
+
+# Strategy dispatch — keeps search_memories' branch count under the
+# project's complexity ceiling (C901 max-complexity=25). New strategies
+# register here.
+_CANDIDATE_MERGERS = {
+    "vector": None,  # default no-op
+    "union": _merge_bm25_union_candidates,
+}
+
+
+def _validate_candidate_strategy(strategy: str) -> None:
+    """Raise ``ValueError`` for unknown strategies.
+
+    Called eagerly at the top of ``search_memories`` so invalid values
+    fail consistently regardless of whether the call routes through the
+    vector path, the BM25-only fallback, or returns an early error dict.
+    """
+    if strategy not in _CANDIDATE_MERGERS:
+        raise ValueError(
+            f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, got {strategy!r}"
+        )
+
+
+def _apply_candidate_strategy(
+    strategy: str,
+    hits: list,
+    query: str,
+    palace_path: str,
+    wing: str,
+    room: str,
+    n_results: int,
+    max_distance: float = 0.0,
+) -> None:
+    """Dispatch to the registered merger for ``strategy``.
+
+    Strategy validity is assumed (``_validate_candidate_strategy`` runs
+    earlier); ``"vector"`` is a no-op.
+    """
+    merger = _CANDIDATE_MERGERS[strategy]
+    if merger is not None:
+        merger(hits, query, palace_path, wing, room, n_results, max_distance=max_distance)
+
+
 def search_memories(
    query: str,
    palace_path: str,
@@ -553,6 +688,7 @@ def search_memories(
    n_results: int = 5,
    max_distance: float = 0.0,
    vector_disabled: bool = False,
+    candidate_strategy: str = "vector",
 ) -> dict:
    """Programmatic search — returns a dict instead of printing.

@@ -572,7 +708,30 @@ def search_memories(
            (#1222). Set by the MCP server when the HNSW capacity probe
            detects a divergence that would segfault chromadb on segment
            load.
+        candidate_strategy: How candidates for the hybrid re-rank are gathered.
+
+            * ``"vector"`` (default) — preserves historical behavior: top
+              ``n_results * 3`` rows from the vector index are the rerank pool.
+              Cheap; works well when query and target docs agree in the
+              embedding space.
+            * ``"union"`` — also pull top ``n_results * 3`` BM25 candidates
+              from the sqlite FTS5 index and merge them into the rerank pool
+              (deduped per chunk, else by source_file). Catches strong-BM25 docs
+              that are vector-distant from the query (e.g. terminology guides
+              looked up by narrative-shaped queries; policy clauses surfaced
+              by scenario descriptions). Adds one sqlite open + FTS5 MATCH
+              per query; perf cost is small but unmeasured at corpus scale.
+              Opt in until the cost is characterized.
+
+              When ``max_distance > 0.0`` is also set, BM25-only candidates
+              are skipped — they have no vector distance and would silently
+              violate the requested distance threshold.
    """
+    # Validate the strategy eagerly so invalid values fail the same way
+    # regardless of whether the call routes through the vector path or
+    # the BM25-only fallback below.
+    _validate_candidate_strategy(candidate_strategy)
+
    if vector_disabled:
        return _bm25_only_via_sqlite(
            query,
@@ -748,8 +907,29 @@ def search_memories(
        h["drawer_index"] = best_idx
        h["total_drawers"] = len(ordered_docs)

-    # BM25 hybrid re-rank within the final candidate set.
-    hits = _hybrid_rank(hits, query)
+    # Candidate strategy hook: optionally widen the rerank pool's *source*
+    # before ranking. Default ("vector") is a no-op; "union" merges top-K
+    # BM25 candidates from sqlite. See `_apply_candidate_strategy`.
+    # ``max_distance`` is forwarded so union mode can refuse to inject
+    # BM25-only (distance=None) candidates that would silently bypass the
+    # caller's strict distance threshold.
+    _apply_candidate_strategy(
+        candidate_strategy,
+        hits,
+        query,
+        palace_path,
+        wing,
+        room,
+        n_results,
+        max_distance=max_distance,
+    )
+
+    # BM25 hybrid re-rank within the final candidate set, then trim back
+    # to the requested size. Without the trim, ``candidate_strategy="union"``
+    # would return up to 4× ``n_results`` (vector hits + BM25 union pool),
+    # breaking the existing ``search_memories`` size contract that the MCP
+    # ``limit`` parameter is built on.
+    hits = _hybrid_rank(hits, query)[:n_results]
    for h in hits:
        h.pop("_sort_key", None)
        h.pop("_source_file_full", None)
@@ -0,0 +1,234 @@
+"""Tests for ``candidate_strategy="union"`` in ``search_memories``.
+
+The default ``"vector"`` strategy gathers candidates from the vector index
+only. Docs with strong BM25 signal but vector embeddings far from the query
+get skipped — terminology guides looked up by narrative-shaped queries are
+the canonical case.
+
+The ``"union"`` strategy also pulls top-K BM25-only candidates from sqlite
+FTS5 and merges them into the rerank pool. Both signal sources contribute
+candidates; the hybrid rerank picks the best from a richer pool.
+
+Default behavior is unchanged ("vector") — these tests exercise opt-in
+"union" mode.
+"""
+
+from mempalace.palace import get_collection
+from mempalace.searcher import search_memories
+
+
+def _seed_drawers(palace_path):
+    """Seed a corpus where the right doc for one query is BM25-strong but
+    vector-distant.
+
+    D1-D3 are short narrative tickets that semantically cluster around
+    "customer support / order / shipped" vocabulary. D4 is a meta-document
+    of bullet rules ("brand voice") that contains rare keywords like
+    "Absolutely" and "apologize" the query repeats verbatim — strong BM25
+    signal but stylistically far from the narrative tickets.
+    """
+    col = get_collection(palace_path, create=True)
+    col.upsert(
+        ids=["D1", "D2", "D3", "D4"],
+        documents=[
+            "Customer wrote in asking why their order shipped without "
+            "the promo sticker. Standard reply explaining the threshold.",
+            "Order delivery delayed three days; customer requested a "
+            "refund. Support agent processed return via ticket queue.",
+            "Customer asked about the missing freebie; the reply "
+            "explained the campaign mechanics and shipped status.",
+            "Brand voice rules: dry, sturdy, never effusive. "
+            "Never 'Absolutely!' Never apologize for policy — explain it. "
+            "Avoid premium / curated / elevated vocabulary.",
+        ],
+        metadatas=[
+            {"wing": "shop", "room": "support", "source_file": "ticket_D1.md"},
+            {"wing": "shop", "room": "support", "source_file": "ticket_D2.md"},
+            {"wing": "shop", "room": "support", "source_file": "ticket_D3.md"},
+            {"wing": "shop", "room": "guides", "source_file": "brand_voice_D4.md"},
+        ],
+    )
+
+
+_NARRATIVE_QUERY = (
+    "A support agent is drafting a reply to a customer asking why their "
+    "order shipped without a free sticker. Draft the reply, but never say "
+    "'Absolutely!' and do not apologize for policy."
+)
+
+
+class TestCandidateUnion:
+    def test_default_vector_strategy_unchanged(self, tmp_path):
+        """Default behavior must be identical to omitting the parameter."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        without = search_memories(_NARRATIVE_QUERY, palace, n_results=5)
+        with_default = search_memories(
+            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector"
+        )
+        ids_a = [h["source_file"] for h in without["results"]]
+        ids_b = [h["source_file"] for h in with_default["results"]]
+        assert ids_a == ids_b, "explicit candidate_strategy='vector' must match default"
+
+    def test_union_surfaces_bm25_strong_vector_distant_doc(self, tmp_path):
+        """The brand-voice doc has strong BM25 signal for the query but is
+        stylistically far from the narrative tickets. Union mode must
+        retrieve it; vector-only mode is allowed to miss it."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        result = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union")
+        ids = [h["source_file"] for h in result["results"]]
+        assert "brand_voice_D4.md" in ids, (
+            "union mode must surface BM25-strong docs even when vector signal "
+            f"is weak; got {ids}"
+        )
+
+    def test_union_preserves_vector_hits(self, tmp_path):
+        """Union mode must not drop docs that vector-only mode finds —
+        the rerank pool grows, it doesn't shrink."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        vector = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector")
+        union = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union")
+        vec_ids = {h["source_file"] for h in vector["results"]}
+        union_ids = {h["source_file"] for h in union["results"]}
+        # In a 4-doc corpus with n_results=5, both should return all 4.
+        # The invariant is: union should not lose anything vector found.
+        missing = vec_ids - union_ids
+        assert not missing, f"union dropped docs that vector found: {missing}"
+
+    def test_union_handles_empty_palace(self, tmp_path):
+        """No drawers — union mode should return empty results, not crash."""
+        palace = str(tmp_path / "palace")
+        get_collection(palace, create=True)  # create empty collection
+        result = search_memories("anything", palace, n_results=5, candidate_strategy="union")
+        assert result.get("results", []) == []
+
+    def test_invalid_candidate_strategy_raises(self, tmp_path):
+        """Bad arg should raise rather than silently fall back."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        import pytest
+
+        with pytest.raises(ValueError, match="candidate_strategy"):
+            search_memories("anything", palace, n_results=5, candidate_strategy="bogus")
+
+    def test_invalid_strategy_raises_even_when_vector_disabled(self, tmp_path):
+        """Validation must happen before the ``vector_disabled`` early return —
+        invalid values must fail consistently regardless of routing."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        import pytest
+
+        with pytest.raises(ValueError, match="candidate_strategy"):
+            search_memories(
+                "anything",
+                palace,
+                n_results=5,
+                vector_disabled=True,
+                candidate_strategy="bogus",
+            )
+
+    def test_union_respects_n_results_limit(self, tmp_path):
+        """When the merged candidate set is larger than ``n_results``, the
+        result must be trimmed back to the requested size — the MCP
+        ``limit`` contract depends on this invariant."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        # 4-doc corpus, n_results=2 → the merged pool can exceed 2 candidates
+        # (cap is ~4× n_results); rerank reorders them, final list must be ≤ 2.
+        result = search_memories(_NARRATIVE_QUERY, palace, n_results=2, candidate_strategy="union")
+        assert (
+            len(result["results"]) <= 2
+        ), f"union must trim to n_results=2; got {len(result['results'])} results"
+
+    def test_union_skipped_when_max_distance_set(self, tmp_path):
+        """``max_distance`` is a vector-distance threshold; BM25-only
+        candidates have ``distance=None`` and cannot satisfy it. Union
+        must not silently inject them when a strict threshold is set,
+        otherwise the existing ``max_distance`` guarantee regresses."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        # Sanity: without max_distance, union surfaces the BM25-strong doc.
+        unfiltered = search_memories(
+            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union"
+        )
+        assert "brand_voice_D4.md" in {h["source_file"] for h in unfiltered["results"]}
+
+        # With a tight max_distance, union must NOT inject BM25-only hits —
+        # every returned hit must have a real (non-None) distance.
+        filtered = search_memories(
+            _NARRATIVE_QUERY,
+            palace,
+            n_results=5,
+            candidate_strategy="union",
+            max_distance=0.5,
+        )
+        for h in filtered["results"]:
+            assert h.get("distance") is not None, (
+                f"union under max_distance must not inject BM25-only "
+                f"(distance=None) candidates; offending hit: {h}"
+            )
+            assert h["distance"] <= 0.5, f"hit violates max_distance=0.5: distance={h['distance']}"
+
+    def test_union_dedup_is_chunk_precise_not_basename(self, tmp_path):
+        """Two files with the same basename in different directories must
+        not collide — union must dedup on full path (or chunk-level key),
+        not on basename alone. Otherwise a BM25-strong README from one
+        directory silently shadows a BM25-strong README from another.
+        """
+        palace = str(tmp_path / "palace")
+        col = get_collection(palace, create=True)
+        col.upsert(
+            ids=["A_README", "B_README", "narrative"],
+            documents=[
+                # Both README files share the basename README.md but live
+                # in different directories. Each contains distinctive
+                # terminology a query might surface via BM25.
+                "PROJECT ALPHA: configuration for the Frobnitz subsystem. "
+                "Set FROBNITZ_TIMEOUT=30 to enable widget rotation.",
+                "PROJECT BETA: configuration for the Wibble subsystem. "
+                "Set WIBBLE_THRESHOLD=0.5 to enable signal smoothing.",
+                "Engineers occasionally chat about how the legacy "
+                "subsystems all need their config knobs tweaked.",
+            ],
+            metadatas=[
+                {"wing": "code", "room": "docs", "source_file": "alpha/README.md"},
+                {"wing": "code", "room": "docs", "source_file": "beta/README.md"},
+                {"wing": "code", "room": "docs", "source_file": "chat.md"},
+            ],
+        )
+        # Query that hits BM25 for BOTH READMEs (distinct vocab from each).
+        # Vector-only might pick the chat doc as semantically "closest";
+        # union must surface both READMEs without basename collision.
+        result = search_memories(
+            "FROBNITZ_TIMEOUT WIBBLE_THRESHOLD configuration",
+            palace,
+            n_results=5,
+            candidate_strategy="union",
+        )
+        sources = [h["source_file"] for h in result["results"]]
+        readme_count = sum(1 for s in sources if s == "README.md")
+        assert readme_count >= 2, (
+            f"union must surface both README.md files from different dirs "
+            f"(basename collision would drop one); got sources={sources}"
+        )
+
+
+class TestHybridRankTolerantOfMissingDistance:
+    """``_hybrid_rank`` accepts ``distance=None`` — required for BM25-only
+    candidates injected by union mode."""
+
+    def test_distance_none_scored_as_zero_vector_sim(self):
+        from mempalace.searcher import _hybrid_rank
+
+        results = [
+            {"text": "alpha beta gamma", "distance": 0.2},  # close vector match
+            {"text": "alpha alpha alpha", "distance": None},  # BM25-only — heavy term repetition
+        ]
+        # Query matches "alpha" heavily; the BM25-only candidate with no
+        # vector signal should still rank competitively on BM25 alone.
+        ranked = _hybrid_rank(results, "alpha")
+        assert all("bm25_score" in r for r in ranked), "rerank should add bm25_score"
+        # Both must survive — neither should crash on distance=None.
+        assert len(ranked) == 2