Files

235 lines
11 KiB
Python
Raw Permalink Normal View History

"""Tests for ``candidate_strategy="union"`` in ``search_memories``.
The default ``"vector"`` strategy gathers candidates from the vector index
only. Docs with strong BM25 signal but vector embeddings far from the query
get skipped — terminology guides looked up by narrative-shaped queries are
the canonical case.
The ``"union"`` strategy also pulls top-K BM25-only candidates from sqlite
FTS5 and merges them into the rerank pool. Both signal sources contribute
candidates; the hybrid rerank picks the best from a richer pool.
Default behavior is unchanged ("vector") — these tests exercise opt-in
"union" mode.
"""
import pytest

from mempalace.palace import get_collection
from mempalace.searcher import search_memories
def _seed_drawers(palace_path):
    """Fill the palace at ``palace_path`` with the four-drawer test corpus.

    Drawers D1-D3 are short narrative support tickets (room ``support``)
    sharing "customer / order / shipped" vocabulary. D4 is a bullet-style
    brand-voice guide (room ``guides``) containing the rare words
    "Absolutely" and "apologize". It is meant to be the document that
    matches a narrative query on keywords (BM25) while sitting far from it
    in embedding space.

    The collection is created if it does not already exist.
    """
    # One (id, text, metadata) record per drawer keeps each document next
    # to its own metadata instead of spreading it over three parallel lists.
    drawers = [
        (
            "D1",
            "Customer wrote in asking why their order shipped without "
            "the promo sticker. Standard reply explaining the threshold.",
            {"wing": "shop", "room": "support", "source_file": "ticket_D1.md"},
        ),
        (
            "D2",
            "Order delivery delayed three days; customer requested a "
            "refund. Support agent processed return via ticket queue.",
            {"wing": "shop", "room": "support", "source_file": "ticket_D2.md"},
        ),
        (
            "D3",
            "Customer asked about the missing freebie; the reply "
            "explained the campaign mechanics and shipped status.",
            {"wing": "shop", "room": "support", "source_file": "ticket_D3.md"},
        ),
        (
            "D4",
            "Brand voice rules: dry, sturdy, never effusive. "
            "Never 'Absolutely!' Never apologize for policy — explain it. "
            "Avoid premium / curated / elevated vocabulary.",
            {"wing": "shop", "room": "guides", "source_file": "brand_voice_D4.md"},
        ),
    ]

    collection = get_collection(palace_path, create=True)
    collection.upsert(
        ids=[drawer_id for drawer_id, _, _ in drawers],
        documents=[text for _, text, _ in drawers],
        metadatas=[meta for _, _, meta in drawers],
    )
# Narrative-shaped query shared by the union tests. It reads like the support
# tickets, yet repeats the brand-voice guide's rare keywords ("Absolutely",
# "apologize") verbatim.
_NARRATIVE_QUERY = (
    "A support agent is drafting a reply to a customer asking why their order "
    "shipped without a free sticker. "
    "Draft the reply, but never say 'Absolutely!' and do not apologize for policy."
)
class TestCandidateUnion:
    """Tests for the opt-in ``candidate_strategy="union"`` mode of ``search_memories``.

    Each test builds its own palace under pytest's ``tmp_path`` fixture, so
    no state is shared between cases.
    """

    def test_default_vector_strategy_unchanged(self, tmp_path):
        """Default behavior must be identical to omitting the parameter."""
        palace = str(tmp_path / "palace")
        _seed_drawers(palace)

        without = search_memories(_NARRATIVE_QUERY, palace, n_results=5)
        with_default = search_memories(
            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector"
        )

        # Lists, not sets: the ranking order must match too.
        ids_a = [h["source_file"] for h in without["results"]]
        ids_b = [h["source_file"] for h in with_default["results"]]
        assert ids_a == ids_b, "explicit candidate_strategy='vector' must match default"

    def test_union_surfaces_bm25_strong_vector_distant_doc(self, tmp_path):
        """The brand-voice doc has strong BM25 signal for the query but is
        stylistically far from the narrative tickets. Union mode must
        retrieve it; vector-only mode is allowed to miss it."""
        palace = str(tmp_path / "palace")
        _seed_drawers(palace)

        result = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union")

        ids = [h["source_file"] for h in result["results"]]
        assert "brand_voice_D4.md" in ids, (
            "union mode must surface BM25-strong docs even when vector signal "
            f"is weak; got {ids}"
        )

    def test_union_preserves_vector_hits(self, tmp_path):
        """Union mode must not drop docs that vector-only mode finds —
        the rerank pool grows, it doesn't shrink."""
        palace = str(tmp_path / "palace")
        _seed_drawers(palace)

        vector = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector")
        union = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union")

        vec_ids = {h["source_file"] for h in vector["results"]}
        union_ids = {h["source_file"] for h in union["results"]}
        # In a 4-doc corpus with n_results=5, both should return all 4.
        # The invariant is: union should not lose anything vector found.
        missing = vec_ids - union_ids
        assert not missing, f"union dropped docs that vector found: {missing}"

    def test_union_handles_empty_palace(self, tmp_path):
        """No drawers — union mode should return empty results, not crash."""
        palace = str(tmp_path / "palace")
        get_collection(palace, create=True)  # create empty collection

        result = search_memories("anything", palace, n_results=5, candidate_strategy="union")

        # ``.get`` tolerates a response without a "results" key.
        assert result.get("results", []) == []

    def test_invalid_candidate_strategy_raises(self, tmp_path):
        """Bad arg should raise rather than silently fall back."""
        palace = str(tmp_path / "palace")
        _seed_drawers(palace)

        with pytest.raises(ValueError, match="candidate_strategy"):
            search_memories("anything", palace, n_results=5, candidate_strategy="bogus")

    def test_invalid_strategy_raises_even_when_vector_disabled(self, tmp_path):
        """Validation must happen before the ``vector_disabled`` early return —
        invalid values must fail consistently regardless of routing."""
        palace = str(tmp_path / "palace")
        _seed_drawers(palace)

        with pytest.raises(ValueError, match="candidate_strategy"):
            search_memories(
                "anything",
                palace,
                n_results=5,
                vector_disabled=True,
                candidate_strategy="bogus",
            )

    def test_union_respects_n_results_limit(self, tmp_path):
        """When the merged candidate set is larger than ``n_results``, the
        result must be trimmed back to the requested size — the MCP
        ``limit`` contract depends on this invariant."""
        palace = str(tmp_path / "palace")
        _seed_drawers(palace)

        # 4-doc corpus, n_results=2 → union pool can grow to ~8 candidates,
        # rerank reorders them, but final list must respect the cap.
        result = search_memories(_NARRATIVE_QUERY, palace, n_results=2, candidate_strategy="union")

        assert (
            len(result["results"]) <= 2
        ), f"union must trim to n_results=2; got {len(result['results'])} results"

    def test_union_skipped_when_max_distance_set(self, tmp_path):
        """``max_distance`` is a vector-distance threshold; BM25-only
        candidates have ``distance=None`` and cannot satisfy it. Union
        must not silently inject them when a strict threshold is set,
        otherwise the existing ``max_distance`` guarantee regresses."""
        palace = str(tmp_path / "palace")
        _seed_drawers(palace)

        # Sanity: without max_distance, union surfaces the BM25-strong doc.
        unfiltered = search_memories(
            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union"
        )
        assert "brand_voice_D4.md" in {h["source_file"] for h in unfiltered["results"]}

        # With a tight max_distance, union must NOT inject BM25-only hits —
        # every returned hit must have a real (non-None) distance.
        filtered = search_memories(
            _NARRATIVE_QUERY,
            palace,
            n_results=5,
            candidate_strategy="union",
            max_distance=0.5,
        )
        for h in filtered["results"]:
            assert h.get("distance") is not None, (
                f"union under max_distance must not inject BM25-only "
                f"(distance=None) candidates; offending hit: {h}"
            )
            assert h["distance"] <= 0.5, f"hit violates max_distance=0.5: distance={h['distance']}"

    def test_union_dedup_is_chunk_precise_not_basename(self, tmp_path):
        """Two files with the same basename in different directories must
        not collide — union must dedup on full path (or chunk-level key),
        not on basename alone. Otherwise a BM25-strong README from one
        directory silently shadows a BM25-strong README from another.

        NOTE(review): the drawers are stored as ``alpha/README.md`` and
        ``beta/README.md`` but hits are matched against the bare
        ``README.md``, so this test presumes ``search_memories`` reports
        ``source_file`` as a basename — confirm against the searcher.
        """
        palace = str(tmp_path / "palace")
        col = get_collection(palace, create=True)
        col.upsert(
            ids=["A_README", "B_README", "narrative"],
            documents=[
                # Both README files share the basename README.md but live
                # in different directories. Each contains distinctive
                # terminology a query might surface via BM25.
                "PROJECT ALPHA: configuration for the Frobnitz subsystem. "
                "Set FROBNITZ_TIMEOUT=30 to enable widget rotation.",
                "PROJECT BETA: configuration for the Wibble subsystem. "
                "Set WIBBLE_THRESHOLD=0.5 to enable signal smoothing.",
                "Engineers occasionally chat about how the legacy "
                "subsystems all need their config knobs tweaked.",
            ],
            metadatas=[
                {"wing": "code", "room": "docs", "source_file": "alpha/README.md"},
                {"wing": "code", "room": "docs", "source_file": "beta/README.md"},
                {"wing": "code", "room": "docs", "source_file": "chat.md"},
            ],
        )

        # Query that hits BM25 for BOTH READMEs (distinct vocab from each).
        # Vector-only might pick the chat doc as semantically "closest";
        # union must surface both READMEs without basename collision.
        result = search_memories(
            "FROBNITZ_TIMEOUT WIBBLE_THRESHOLD configuration",
            palace,
            n_results=5,
            candidate_strategy="union",
        )

        sources = [h["source_file"] for h in result["results"]]
        readme_count = sources.count("README.md")
        assert readme_count >= 2, (
            f"union must surface both README.md files from different dirs "
            f"(basename collision would drop one); got sources={sources}"
        )
class TestHybridRankTolerantOfMissingDistance:
    """``_hybrid_rank`` has to cope with ``distance=None`` entries, which is
    how BM25-only candidates injected by union mode arrive."""

    def test_distance_none_scored_as_zero_vector_sim(self):
        """Rerank one vector hit plus one distance-less hit without crashing.

        Checks only that every returned entry carries a ``bm25_score`` and
        that neither candidate is dropped; the relative order is not
        asserted.
        """
        from mempalace.searcher import _hybrid_rank

        # Close vector match that mentions the query term once.
        vector_hit = {"text": "alpha beta gamma", "distance": 0.2}
        # BM25-only candidate: no vector distance, heavy term repetition.
        bm25_only_hit = {"text": "alpha alpha alpha", "distance": None}

        # The query matches "alpha" heavily, so the distance-less candidate
        # has only its BM25 signal to compete with.
        reranked = _hybrid_rank([vector_hit, bm25_only_hit], "alpha")

        assert all("bm25_score" in hit for hit in reranked), "rerank should add bm25_score"
        # Both candidates must come back; distance=None must not drop one.
        assert len(reranked) == 2