Merge pull request #1306 from MemPalace/feat/hybrid-candidate-union
feat(searcher): candidate_strategy="union" — BM25 candidates joined with vector pool before hybrid rerank
This commit is contained in:
+184
-4
@@ -134,6 +134,11 @@ def _hybrid_rank(
|
|||||||
themselves. Since the absolute scale is unbounded, BM25 is min-max
|
themselves. Since the absolute scale is unbounded, BM25 is min-max
|
||||||
normalized within the candidate set so weights are commensurable.
|
normalized within the candidate set so weights are commensurable.
|
||||||
|
|
||||||
|
Candidates with ``distance=None`` are treated as vector-unknown
|
||||||
|
(no vector signal available) and scored on BM25 contribution alone.
|
||||||
|
Used by candidate-union mode to merge BM25-only candidates that the
|
||||||
|
vector index didn't surface.
|
||||||
|
|
||||||
Mutates each result dict to add ``bm25_score`` and reorders the list
|
Mutates each result dict to add ``bm25_score`` and reorders the list
|
||||||
in place. Returns the same list for convenience.
|
in place. Returns the same list for convenience.
|
||||||
"""
|
"""
|
||||||
@@ -147,7 +152,11 @@ def _hybrid_rank(
|
|||||||
|
|
||||||
scored = []
|
scored = []
|
||||||
for r, raw, norm in zip(results, bm25_raw, bm25_norm):
|
for r, raw, norm in zip(results, bm25_raw, bm25_norm):
|
||||||
vec_sim = max(0.0, 1.0 - r.get("distance", 1.0))
|
distance = r.get("distance")
|
||||||
|
if distance is None:
|
||||||
|
vec_sim = 0.0
|
||||||
|
else:
|
||||||
|
vec_sim = max(0.0, 1.0 - distance)
|
||||||
r["bm25_score"] = round(raw, 3)
|
r["bm25_score"] = round(raw, 3)
|
||||||
scored.append((vector_weight * vec_sim + bm25_weight * norm, r))
|
scored.append((vector_weight * vec_sim + bm25_weight * norm, r))
|
||||||
|
|
||||||
@@ -372,6 +381,7 @@ def _bm25_only_via_sqlite(
|
|||||||
room: str = None,
|
room: str = None,
|
||||||
n_results: int = 5,
|
n_results: int = 5,
|
||||||
max_candidates: int = 500,
|
max_candidates: int = 500,
|
||||||
|
_include_internal: bool = False,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""BM25-only search reading drawers directly from chroma.sqlite3.
|
"""BM25-only search reading drawers directly from chroma.sqlite3.
|
||||||
|
|
||||||
@@ -509,17 +519,25 @@ def _bm25_only_via_sqlite(
|
|||||||
continue
|
continue
|
||||||
if room and meta.get("room") != room:
|
if room and meta.get("room") != room:
|
||||||
continue
|
continue
|
||||||
|
full_source = meta.get("source_file", "") or ""
|
||||||
candidates.append(
|
candidates.append(
|
||||||
{
|
{
|
||||||
"text": d["text"],
|
"text": d["text"],
|
||||||
"wing": meta.get("wing", "unknown"),
|
"wing": meta.get("wing", "unknown"),
|
||||||
"room": meta.get("room", "unknown"),
|
"room": meta.get("room", "unknown"),
|
||||||
"source_file": Path(meta.get("source_file", "?") or "?").name,
|
"source_file": Path(full_source).name if full_source else "?",
|
||||||
"created_at": meta.get("filed_at", "unknown"),
|
"created_at": meta.get("filed_at", "unknown"),
|
||||||
# No vector distance available in BM25-only mode.
|
# No vector distance available in BM25-only mode.
|
||||||
"similarity": None,
|
"similarity": None,
|
||||||
"distance": None,
|
"distance": None,
|
||||||
"matched_via": "bm25_sqlite",
|
"matched_via": "bm25_sqlite",
|
||||||
|
# Internal: full path + chunk_index let callers (notably
|
||||||
|
# candidate_strategy="union") dedupe at chunk granularity
|
||||||
|
# rather than basename — two files in different directories
|
||||||
|
# may share a basename, and one source_file is split across
|
||||||
|
# multiple chunks. Stripped before this helper returns.
|
||||||
|
"_source_file_full": full_source,
|
||||||
|
"_chunk_index": meta.get("chunk_index"),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -534,6 +552,12 @@ def _bm25_only_via_sqlite(
|
|||||||
hits = candidates[:n_results]
|
hits = candidates[:n_results]
|
||||||
for h in hits:
|
for h in hits:
|
||||||
h.pop("_score", None)
|
h.pop("_score", None)
|
||||||
|
# Strip internal fields by default so the public BM25-only fallback
|
||||||
|
# response stays clean. Callers that need chunk-precise dedup
|
||||||
|
# (notably the union-merge path) opt in via _include_internal.
|
||||||
|
if not _include_internal:
|
||||||
|
h.pop("_source_file_full", None)
|
||||||
|
h.pop("_chunk_index", None)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"query": query,
|
"query": query,
|
||||||
@@ -545,6 +569,117 @@ def _bm25_only_via_sqlite(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_bm25_union_candidates(
    hits: list,
    query: str,
    palace_path: str,
    wing: str,
    room: str,
    n_results: int,
    max_distance: float = 0.0,
) -> None:
    """Extend ``hits`` in place with BM25-only candidates read from sqlite.

    This is the merger behind ``candidate_strategy="union"``: it widens
    where the rerank pool comes from, so documents with a strong BM25
    signal can be reranked even when the vector index did not surface
    them.

    Args:
        hits: Candidate dicts gathered so far; mutated in place.
        query: Search text forwarded to the BM25 lookup.
        palace_path: Palace location forwarded to the BM25 lookup.
        wing: Wing filter forwarded to the BM25 lookup.
        room: Room filter forwarded to the BM25 lookup.
        n_results: Requested result count; ``n_results * 3`` BM25
            candidates are requested.
        max_distance: When greater than ``0.0`` nothing is merged —
            BM25-only candidates have no vector distance, so they could
            never satisfy a strict distance threshold.

    Returns:
        None. New candidates are appended to ``hits``.

    Deduplication uses ``(_source_file_full, _chunk_index)`` when both
    are present, so same-named files in different directories and
    different chunks of one file stay distinct. Entries lacking either
    field are keyed by ``source_file`` instead. Appended candidates get
    ``distance=None``, ``effective_distance=None`` and
    ``closet_boost=0.0``.

    A failing BM25 lookup is logged at debug level and otherwise
    ignored, leaving ``hits`` untouched.
    """
    if max_distance > 0.0:
        return

    def _identity(candidate: dict):
        # Chunk-precise identity when the internal fields are available;
        # basename fallback keeps legacy rows (no full path / chunk index)
        # from being dropped outright.
        # NOTE(review): a hit keyed by basename never equals a hit keyed by
        # (path, chunk) — confirm vector hits carry both internal fields,
        # otherwise the same chunk could be admitted twice.
        path = candidate.get("_source_file_full")
        chunk = candidate.get("_chunk_index")
        if not path or chunk is None:
            return candidate.get("source_file")
        return (path, chunk)

    try:
        response = _bm25_only_via_sqlite(
            query,
            palace_path,
            wing=wing,
            room=room,
            n_results=n_results * 3,
            _include_internal=True,
        )
        extras = response.get("results", [])
    except Exception:
        # Best effort: union mode degrades to the vector-only pool.
        logger.debug("candidate_strategy=union: BM25 fetch failed", exc_info=True)
        return

    known = set(map(_identity, hits))
    for extra in extras:
        ident = _identity(extra)
        # Admit only identifiable candidates ("?" is the placeholder for
        # an unknown source) that are not already in the pool.
        if ident and ident != "?" and ident not in known:
            extra["distance"] = None
            extra["effective_distance"] = None
            extra["closet_boost"] = 0.0
            hits.append(extra)
            known.add(ident)
|
||||||
|
|
||||||
|
|
||||||
|
# Registry mapping each ``candidate_strategy`` name to its merger callable.
# Dispatching through this table (instead of branching inside
# ``search_memories``) keeps that function's branch count below the
# project's complexity ceiling (C901 max-complexity=25). Add new
# strategies by registering them here; ``None`` means "nothing to merge".
_CANDIDATE_MERGERS = {
    "vector": None,  # default: the vector pool is used as-is
    "union": _merge_bm25_union_candidates,
}
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_candidate_strategy(strategy: str) -> None:
    """Reject strategy names that are not registered in ``_CANDIDATE_MERGERS``.

    Meant to run at the very start of ``search_memories`` so a bad value
    fails identically whichever route the call takes afterwards (vector
    path, BM25-only fallback, or an early error dict).

    Args:
        strategy: The ``candidate_strategy`` value supplied by the caller.

    Raises:
        ValueError: If ``strategy`` is not a registered strategy name.
    """
    if strategy in _CANDIDATE_MERGERS:
        return
    allowed = tuple(_CANDIDATE_MERGERS)
    raise ValueError(f"candidate_strategy must be one of {allowed}, got {strategy!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_candidate_strategy(
    strategy: str,
    hits: list,
    query: str,
    palace_path: str,
    wing: str,
    room: str,
    n_results: int,
    max_distance: float = 0.0,
) -> None:
    """Run the merger registered for ``strategy`` against ``hits``.

    The strategy name is looked up directly in ``_CANDIDATE_MERGERS``;
    it is expected to have been checked by
    ``_validate_candidate_strategy`` beforehand, so an unknown name
    surfaces here as a ``KeyError``. Strategies registered as ``None``
    (such as ``"vector"``) leave ``hits`` unchanged.

    Args:
        strategy: Registered strategy name.
        hits: Candidate list handed to the merger, which may mutate it.
        query: Search text, forwarded to the merger.
        palace_path: Palace location, forwarded to the merger.
        wing: Wing filter, forwarded to the merger.
        room: Room filter, forwarded to the merger.
        n_results: Requested result count, forwarded to the merger.
        max_distance: Vector-distance threshold, forwarded by keyword.
    """
    merge = _CANDIDATE_MERGERS[strategy]
    if merge is None:
        return
    merge(hits, query, palace_path, wing, room, n_results, max_distance=max_distance)
|
||||||
|
|
||||||
|
|
||||||
def search_memories(
|
def search_memories(
|
||||||
query: str,
|
query: str,
|
||||||
palace_path: str,
|
palace_path: str,
|
||||||
@@ -553,6 +688,7 @@ def search_memories(
|
|||||||
n_results: int = 5,
|
n_results: int = 5,
|
||||||
max_distance: float = 0.0,
|
max_distance: float = 0.0,
|
||||||
vector_disabled: bool = False,
|
vector_disabled: bool = False,
|
||||||
|
candidate_strategy: str = "vector",
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Programmatic search — returns a dict instead of printing.
|
"""Programmatic search — returns a dict instead of printing.
|
||||||
|
|
||||||
@@ -572,7 +708,30 @@ def search_memories(
|
|||||||
(#1222). Set by the MCP server when the HNSW capacity probe
|
(#1222). Set by the MCP server when the HNSW capacity probe
|
||||||
detects a divergence that would segfault chromadb on segment
|
detects a divergence that would segfault chromadb on segment
|
||||||
load.
|
load.
|
||||||
|
candidate_strategy: How candidates for the hybrid re-rank are gathered.
|
||||||
|
|
||||||
|
* ``"vector"`` (default) — preserves historical behavior: top
|
||||||
|
``n_results * 3`` rows from the vector index are the rerank pool.
|
||||||
|
Cheap; works well when query and target docs agree in the
|
||||||
|
embedding space.
|
||||||
|
* ``"union"`` — also pull top ``n_results * 3`` BM25 candidates
|
||||||
|
from the sqlite FTS5 index and merge them into the rerank pool
|
||||||
|
(deduped per chunk on full source path + chunk index, falling back to source_file). Catches docs with strong BM25 signal
|
||||||
|
that are vector-distant from the query (e.g. terminology guides
|
||||||
|
looked up by narrative-shaped queries; policy clauses surfaced
|
||||||
|
by scenario descriptions). Adds one sqlite open + FTS5 MATCH
|
||||||
|
per query; perf cost is small but unmeasured at corpus scale.
|
||||||
|
Opt in until the cost is characterized.
|
||||||
|
|
||||||
|
When ``max_distance > 0.0`` is also set, BM25-only candidates
|
||||||
|
are skipped — they have no vector distance and would silently
|
||||||
|
violate the requested distance threshold.
|
||||||
"""
|
"""
|
||||||
|
# Validate the strategy eagerly so invalid values fail the same way
|
||||||
|
# regardless of whether the call routes through the vector path or
|
||||||
|
# the BM25-only fallback below.
|
||||||
|
_validate_candidate_strategy(candidate_strategy)
|
||||||
|
|
||||||
if vector_disabled:
|
if vector_disabled:
|
||||||
return _bm25_only_via_sqlite(
|
return _bm25_only_via_sqlite(
|
||||||
query,
|
query,
|
||||||
@@ -748,8 +907,29 @@ def search_memories(
|
|||||||
h["drawer_index"] = best_idx
|
h["drawer_index"] = best_idx
|
||||||
h["total_drawers"] = len(ordered_docs)
|
h["total_drawers"] = len(ordered_docs)
|
||||||
|
|
||||||
# BM25 hybrid re-rank within the final candidate set.
|
# Candidate strategy hook: optionally widen the rerank pool's *source*
|
||||||
hits = _hybrid_rank(hits, query)
|
# before ranking. Default ("vector") is a no-op; "union" merges top-K
|
||||||
|
# BM25 candidates from sqlite. See `_apply_candidate_strategy`.
|
||||||
|
# ``max_distance`` is forwarded so union mode can refuse to inject
|
||||||
|
# BM25-only (distance=None) candidates that would silently bypass the
|
||||||
|
# caller's strict distance threshold.
|
||||||
|
_apply_candidate_strategy(
|
||||||
|
candidate_strategy,
|
||||||
|
hits,
|
||||||
|
query,
|
||||||
|
palace_path,
|
||||||
|
wing,
|
||||||
|
room,
|
||||||
|
n_results,
|
||||||
|
max_distance=max_distance,
|
||||||
|
)
|
||||||
|
|
||||||
|
# BM25 hybrid re-rank within the final candidate set, then trim back
|
||||||
|
# to the requested size. Without the trim, ``candidate_strategy="union"``
|
||||||
|
# would return up to 4× ``n_results`` (vector hits + BM25 union pool),
|
||||||
|
# breaking the existing ``search_memories`` size contract that the MCP
|
||||||
|
# ``limit`` parameter is built on.
|
||||||
|
hits = _hybrid_rank(hits, query)[:n_results]
|
||||||
for h in hits:
|
for h in hits:
|
||||||
h.pop("_sort_key", None)
|
h.pop("_sort_key", None)
|
||||||
h.pop("_source_file_full", None)
|
h.pop("_source_file_full", None)
|
||||||
|
|||||||
@@ -0,0 +1,234 @@
|
|||||||
|
"""Tests for ``candidate_strategy="union"`` in ``search_memories``.
|
||||||
|
|
||||||
|
The default ``"vector"`` strategy gathers candidates from the vector index
|
||||||
|
only. Docs with strong BM25 signal but vector embeddings far from the query
|
||||||
|
get skipped — terminology guides looked up by narrative-shaped queries are
|
||||||
|
the canonical case.
|
||||||
|
|
||||||
|
The ``"union"`` strategy also pulls top-K BM25-only candidates from sqlite
|
||||||
|
FTS5 and merges them into the rerank pool. Both signal sources contribute
|
||||||
|
candidates; the hybrid rerank picks the best from a richer pool.
|
||||||
|
|
||||||
|
Default behavior is unchanged ("vector") — these tests exercise opt-in
|
||||||
|
"union" mode.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from mempalace.palace import get_collection
|
||||||
|
from mempalace.searcher import search_memories
|
||||||
|
|
||||||
|
|
||||||
|
def _seed_drawers(palace_path):
    """Populate a palace with four drawers, one of them BM25-strong only.

    D1-D3 are short narrative support tickets sharing "customer / order /
    shipped" vocabulary. D4 is a bullet-style "brand voice" guide holding
    rare keywords ("Absolutely", "apologize") that the narrative query
    repeats verbatim, so it is intended to match strongly on BM25 while
    reading very differently from the tickets.

    Args:
        palace_path: Location of the palace whose collection is created
            (if needed) and filled.
    """
    # (drawer id, room, source file, document text)
    drawers = (
        (
            "D1",
            "support",
            "ticket_D1.md",
            "Customer wrote in asking why their order shipped without "
            "the promo sticker. Standard reply explaining the threshold.",
        ),
        (
            "D2",
            "support",
            "ticket_D2.md",
            "Order delivery delayed three days; customer requested a "
            "refund. Support agent processed return via ticket queue.",
        ),
        (
            "D3",
            "support",
            "ticket_D3.md",
            "Customer asked about the missing freebie; the reply "
            "explained the campaign mechanics and shipped status.",
        ),
        (
            "D4",
            "guides",
            "brand_voice_D4.md",
            "Brand voice rules: dry, sturdy, never effusive. "
            "Never 'Absolutely!' Never apologize for policy — explain it. "
            "Avoid premium / curated / elevated vocabulary.",
        ),
    )
    collection = get_collection(palace_path, create=True)
    collection.upsert(
        ids=[drawer_id for drawer_id, _, _, _ in drawers],
        documents=[text for _, _, _, text in drawers],
        metadatas=[
            {"wing": "shop", "room": room_name, "source_file": source}
            for _, room_name, source, _ in drawers
        ],
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Narrative-shaped query: reads like the support tickets, yet repeats the
# rare keywords of the brand-voice guide ("Absolutely", "apologize").
_NARRATIVE_QUERY = "".join(
    (
        "A support agent is drafting a reply to a customer asking why their ",
        "order shipped without a free sticker. Draft the reply, but never say ",
        "'Absolutely!' and do not apologize for policy.",
    )
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCandidateUnion:
    """End-to-end checks for ``search_memories(candidate_strategy=...)``."""

    @staticmethod
    def _palace_dir(tmp_path):
        """Return the palace path (as ``str``) used inside ``tmp_path``."""
        return str(tmp_path / "palace")

    @staticmethod
    def _sources(response):
        """Return the ``source_file`` of every hit, in result order."""
        return [hit["source_file"] for hit in response["results"]]

    def test_default_vector_strategy_unchanged(self, tmp_path):
        """Passing ``candidate_strategy="vector"`` equals omitting it."""
        palace = self._palace_dir(tmp_path)
        _seed_drawers(palace)

        implicit = search_memories(_NARRATIVE_QUERY, palace, n_results=5)
        explicit = search_memories(
            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector"
        )

        ids_a = self._sources(implicit)
        ids_b = self._sources(explicit)
        assert ids_a == ids_b, "explicit candidate_strategy='vector' must match default"

    def test_union_surfaces_bm25_strong_vector_distant_doc(self, tmp_path):
        """Union mode must return the brand-voice guide.

        The guide matches the query strongly on BM25 but is written very
        differently from the narrative tickets; vector-only mode is
        allowed to miss it, union mode is not.
        """
        palace = self._palace_dir(tmp_path)
        _seed_drawers(palace)

        response = search_memories(
            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union"
        )

        ids = self._sources(response)
        assert "brand_voice_D4.md" in ids, (
            "union mode must surface BM25-strong docs even when vector signal "
            f"is weak; got {ids}"
        )

    def test_union_preserves_vector_hits(self, tmp_path):
        """Everything vector-only mode returns must survive in union mode."""
        palace = self._palace_dir(tmp_path)
        _seed_drawers(palace)

        vector = search_memories(
            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector"
        )
        union = search_memories(
            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union"
        )

        # The corpus (4 docs) is smaller than n_results=5, so nothing
        # should be trimmed; union may add documents but never lose any.
        missing = set(self._sources(vector)) - set(self._sources(union))
        assert not missing, f"union dropped docs that vector found: {missing}"

    def test_union_handles_empty_palace(self, tmp_path):
        """An empty collection yields empty results rather than an error."""
        palace = self._palace_dir(tmp_path)
        get_collection(palace, create=True)  # collection exists but holds nothing

        response = search_memories(
            "anything", palace, n_results=5, candidate_strategy="union"
        )

        assert response.get("results", []) == []

    def test_invalid_candidate_strategy_raises(self, tmp_path):
        """An unknown strategy raises instead of silently falling back."""
        import pytest

        palace = self._palace_dir(tmp_path)
        _seed_drawers(palace)

        with pytest.raises(ValueError, match="candidate_strategy"):
            search_memories("anything", palace, n_results=5, candidate_strategy="bogus")

    def test_invalid_strategy_raises_even_when_vector_disabled(self, tmp_path):
        """Validation precedes the ``vector_disabled`` early return.

        A bad strategy must fail the same way whichever route the call
        would otherwise take.
        """
        import pytest

        palace = self._palace_dir(tmp_path)
        _seed_drawers(palace)

        bad_call = {
            "n_results": 5,
            "vector_disabled": True,
            "candidate_strategy": "bogus",
        }
        with pytest.raises(ValueError, match="candidate_strategy"):
            search_memories("anything", palace, **bad_call)

    def test_union_respects_n_results_limit(self, tmp_path):
        """The merged pool is trimmed back to ``n_results``.

        The MCP ``limit`` contract relies on this size invariant.
        """
        palace = self._palace_dir(tmp_path)
        _seed_drawers(palace)

        # n_results=2 is below the 4-doc corpus size, so the merged pool
        # can hold more candidates than the cap allows to be returned.
        response = search_memories(
            _NARRATIVE_QUERY, palace, n_results=2, candidate_strategy="union"
        )

        returned = len(response["results"])
        assert returned <= 2, f"union must trim to n_results=2; got {len(response['results'])} results"

    def test_union_skipped_when_max_distance_set(self, tmp_path):
        """No BM25-only hit may be injected under a ``max_distance`` bound.

        ``max_distance`` constrains vector distance, and BM25-only
        candidates carry ``distance=None``; merging them would break the
        existing threshold guarantee.
        """
        palace = self._palace_dir(tmp_path)
        _seed_drawers(palace)

        # Baseline: with no threshold, union does return the BM25-strong doc.
        unfiltered = search_memories(
            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union"
        )
        assert "brand_voice_D4.md" in set(self._sources(unfiltered))

        # Under a tight threshold every hit needs a real distance within it.
        filtered = search_memories(
            _NARRATIVE_QUERY,
            palace,
            n_results=5,
            candidate_strategy="union",
            max_distance=0.5,
        )
        for h in filtered["results"]:
            assert h.get("distance") is not None, (
                f"union under max_distance must not inject BM25-only "
                f"(distance=None) candidates; offending hit: {h}"
            )
            assert h["distance"] <= 0.5, f"hit violates max_distance=0.5: distance={h['distance']}"

    def test_union_dedup_is_chunk_precise_not_basename(self, tmp_path):
        """Same-named files in different directories must both be returned.

        Deduplicating on basename alone would let one ``README.md``
        shadow the other.
        """
        palace = self._palace_dir(tmp_path)

        # Two READMEs share a basename but live in different directories,
        # each with its own distinctive vocabulary; the third doc is
        # narrative filler.
        corpus = (
            (
                "A_README",
                "alpha/README.md",
                "PROJECT ALPHA: configuration for the Frobnitz subsystem. "
                "Set FROBNITZ_TIMEOUT=30 to enable widget rotation.",
            ),
            (
                "B_README",
                "beta/README.md",
                "PROJECT BETA: configuration for the Wibble subsystem. "
                "Set WIBBLE_THRESHOLD=0.5 to enable signal smoothing.",
            ),
            (
                "narrative",
                "chat.md",
                "Engineers occasionally chat about how the legacy "
                "subsystems all need their config knobs tweaked.",
            ),
        )
        collection = get_collection(palace, create=True)
        collection.upsert(
            ids=[doc_id for doc_id, _, _ in corpus],
            documents=[text for _, _, text in corpus],
            metadatas=[
                {"wing": "code", "room": "docs", "source_file": source}
                for _, source, _ in corpus
            ],
        )

        # The query carries one rare term from each README.
        response = search_memories(
            "FROBNITZ_TIMEOUT WIBBLE_THRESHOLD configuration",
            palace,
            n_results=5,
            candidate_strategy="union",
        )

        sources = self._sources(response)
        readme_count = sources.count("README.md")
        assert readme_count >= 2, (
            f"union must surface both README.md files from different dirs "
            f"(basename collision would drop one); got sources={sources}"
        )
|
||||||
|
|
||||||
|
|
||||||
|
class TestHybridRankTolerantOfMissingDistance:
    """``_hybrid_rank`` must cope with candidates whose ``distance`` is ``None``.

    Union mode injects exactly such BM25-only candidates.
    """

    def test_distance_none_scored_as_zero_vector_sim(self):
        """Ranking a ``distance=None`` candidate neither crashes nor drops it."""
        from mempalace.searcher import _hybrid_rank

        vector_hit = {"text": "alpha beta gamma", "distance": 0.2}  # close vector match
        bm25_only_hit = {"text": "alpha alpha alpha", "distance": None}  # heavy term repetition

        # NOTE(review): only survival and the presence of ``bm25_score`` are
        # asserted; the relative order of the two hits is not checked.
        ranked = _hybrid_rank([vector_hit, bm25_only_hit], "alpha")

        assert all("bm25_score" in r for r in ranked), "rerank should add bm25_score"
        assert len(ranked) == 2
|
||||||
Reference in New Issue
Block a user