From 133dfbfb41d5285c722b5a9bc3bcf6ef6e71b8b7 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 24 Apr 2026 18:50:28 -0300 Subject: [PATCH] fix(search): BM25 hybrid rerank, legacy-metric warning, invariant tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three tightly-coupled search-quality fixes for v3.3.3: 1. CLI `mempalace search` now routes through the same `_hybrid_rank` the MCP path already used. Drawers whose text contains every query term but embed as file-tree noise (directory listings, diffs, log fragments) were scoring cosine distance >= 1.0 — the display formula `max(0, 1 - dist)` then floored every result to `Match: 0.0`, with no way for the user to tell a lexical match from a total miss. BM25 catches these cleanly; the display surfaces both `cosine=` and `bm25=` so users see which component is firing. 2. Legacy-palace distance-metric warning. Palaces created before `hnsw:space=cosine` was consistently set silently use ChromaDB's default L2 metric, which breaks the cosine-similarity formula (L2 distances routinely exceed 1.0 on normalized 384-dim vectors). The search path now detects this at query time and prints a one-line notice pointing at `mempalace repair`. Only fires for legacy palaces; new palaces already set cosine correctly. 3. Invariant tests pinning `hnsw:space=cosine` on every collection- creation path — legacy `get_or_create_collection`, legacy `create_collection`, RFC 001 `get_collection(create=True)`, the public `palace.get_collection`, and a round-trip through reopen. Locks down the correctness that new-user palaces already have so a future refactor can't silently regress it. Also adds a `metadata` property to `ChromaCollection` so callers can read the underlying hnsw:space without reaching into `_collection`. Tests: - New regression: simulate three candidates at distance 1.5 (cosine=0), one containing query terms — must rank first with non-zero bm25. - New: legacy metric (empty or non-cosine) produces stderr warning. - New: correctly-configured palace produces no warning. - New: all five creation paths pin cosine metadata. All existing tests still pass. --- CHANGELOG.md | 5 ++ mempalace/backends/chroma.py | 12 ++++ mempalace/searcher.py | 65 ++++++++++++++++-- tests/test_collection_metric_invariant.py | 82 +++++++++++++++++++++++ tests/test_searcher.py | 79 ++++++++++++++++++++++ 5 files changed, 238 insertions(+), 5 deletions(-) create mode 100644 tests/test_collection_metric_invariant.py diff --git a/CHANGELOG.md b/CHANGELOG.md index fa7c90f..c17442b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), ## [3.3.4] — unreleased +### Bug Fixes + +- **CLI `mempalace search` retrieval quality.** The CLI was using pure ChromaDB cosine distance with no BM25 rerank, so drawers containing every query term but embedding as noise (directory listings, diff output, shell logs) scored `Match: 0.0` alongside genuinely irrelevant results with no way to tell them apart. Wired the CLI through the same `_hybrid_rank` the `mempalace_search` MCP tool already used, and surfaced both `cosine=` and `bm25=` scores in the output so users see which component of the match is firing. MCP search was unaffected; this fixes the human-facing CLI parity gap. +- **Legacy-palace distance-metric warning.** CLI search now detects palaces created before `hnsw:space=cosine` was consistently set and prints a one-line notice pointing at `mempalace repair`. Without the warning such palaces silently used L2 distance, under which the similarity display floored every result to `Match: 0.0`. New palaces mined today already set cosine correctly and now have invariant tests pinning that behavior so future refactors can't silently regress it. (#1179) + ### Added - **Cross-wing topic tunnels.** When two wings have confirmed `TOPIC` labels in common (the LLM-refine bucket from `mempalace init --llm`), the miner now drops a symmetric tunnel between them at mine time so the palace graph reflects shared themes (frameworks, vendors, recurring concepts). Tunnels are routed through the existing `create_tunnel` storage so they share dedup and persistence with explicit tunnels. Topic tunnels are stored under a synthetic `topic:` room and tagged with `kind: "topic"` on the stored dict — this keeps them distinct from literal folder-derived rooms of the same name (a wing with both an `Angular` folder room and an `Angular` topic tunnel no longer collides at `follow_tunnels` read time) and gives LLMs scanning `list_tunnels` a visible discriminator. Threshold is configurable via `MEMPALACE_TOPIC_TUNNEL_MIN_COUNT` env var or `topic_tunnel_min_count` in `~/.mempalace/config.json` (default `1`). Manifest-dependency overlap and per-topic allow/deny lists remain out of scope. (#1180) diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 8ba2279..14ae9cd 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -368,6 +368,18 @@ class ChromaCollection(BaseCollection): def count(self): return self._collection.count() + @property + def metadata(self) -> dict: + """Pass-through to the underlying ChromaDB collection's metadata. + + Used by the searcher to detect legacy palaces that were created + without ``hnsw:space=cosine`` and therefore silently use L2 + distance, which breaks cosine-based similarity interpretation. + Returns ``{}`` when metadata is absent so callers can do a plain + ``.get("hnsw:space")`` without None-checks. + """ + return self._collection.metadata or {} + # --------------------------------------------------------------------------- # Backend diff --git a/mempalace/searcher.py b/mempalace/searcher.py index 5208b13..c4a1d87 100644 --- a/mempalace/searcher.py +++ b/mempalace/searcher.py @@ -236,6 +236,42 @@ def _expand_with_neighbors(drawers_col, matched_doc: str, matched_meta: dict, ra } +def _warn_if_legacy_metric(col) -> None: + """Print a one-line notice if the palace was created without + ``hnsw:space=cosine``. + + ChromaDB's default is L2 (Euclidean), under which cosine-based + similarity interpretation falls apart — distances routinely exceed + 1.0 and the display ``max(0, 1 - dist)`` floors every result to 0. + Legacy palaces (mined before this metadata was consistently set) + need ``mempalace repair`` to rebuild with the correct metric. + + The warning fires only for palaces that clearly have the wrong + metric; palaces with no metadata table at all (empty dict) also + fall under this check since that is the signal of a pre-metadata + palace. + """ + try: + meta = getattr(col, "metadata", None) + except Exception: + return + if not isinstance(meta, dict): + return + space = meta.get("hnsw:space") + if space == "cosine": + return + # Either missing or set to something else — both are suspect. + import sys as _sys + + detail = f"hnsw:space={space!r}" if space else "no hnsw:space metadata" + print( + f"\n NOTICE: this palace was created without cosine distance ({detail}).\n" + " Semantic similarity scores will not be meaningful.\n" + " Run `mempalace repair` to rebuild the index with the correct metric.", + file=_sys.stderr, + ) + + def search(query: str, palace_path: str, wing: str = None, room: str = None, n_results: int = 5): """ Search the palace. Returns verbatim drawer content. @@ -248,6 +284,10 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r print(" Run: mempalace init then mempalace mine ") raise SearchError(f"No palace found at {palace_path}") + # Alert the user if this palace predates hnsw:space=cosine being set on + # creation — their similarity scores will be junk until they run repair. + _warn_if_legacy_metric(col) + where = build_where_filter(wing, room) try: @@ -273,6 +313,20 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r print(f'\n No results found for: "{query}"') return + # Pure-cosine retrieval on the CLI path was missing lexical matches: + # a drawer whose text contains every query term can still score distance + # >= 1.0 against the natural-language query when the drawer is a + # mechanical artifact (directory listing, diff, log fragment) that + # embeds as file-tree noise rather than as prose about its subject. + # The MCP tool path already hybridizes BM25 with vector sim via + # `_hybrid_rank`; do the same here so CLI results match what agents + # see via `mempalace_search`. + hits = [ + {"text": doc, "distance": float(dist), "metadata": meta or {}} + for doc, meta, dist in zip(docs, metas, dists) + ] + hits = _hybrid_rank(hits, query) + print(f"\n{'=' * 60}") print(f' Results for: "{query}"') if wing: @@ -281,19 +335,20 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r print(f" Room: {room}") print(f"{'=' * 60}\n") - for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists), 1): - similarity = round(max(0.0, 1 - dist), 3) - meta = meta or {} + for i, hit in enumerate(hits, 1): + vec_sim = round(max(0.0, 1 - hit["distance"]), 3) + bm25 = hit.get("bm25_score", 0.0) + meta = hit["metadata"] source = Path(meta.get("source_file", "?")).name wing_name = meta.get("wing", "?") room_name = meta.get("room", "?") print(f" [{i}] {wing_name} / {room_name}") print(f" Source: {source}") - print(f" Match: {similarity}") + print(f" Match: cosine={vec_sim} bm25={bm25}") print() # Print the verbatim text, indented - for line in doc.strip().split("\n"): + for line in hit["text"].strip().split("\n"): print(f" {line}") print() print(f" {'─' * 56}") diff --git a/tests/test_collection_metric_invariant.py b/tests/test_collection_metric_invariant.py new file mode 100644 index 0000000..851711e --- /dev/null +++ b/tests/test_collection_metric_invariant.py @@ -0,0 +1,82 @@ +"""Invariant tests: every ChromaDB collection-creation path must set +``hnsw:space=cosine``. + +Reason: ChromaDB's default HNSW distance is L2 (Euclidean). Under L2, +the searcher's ``max(0, 1 - distance)`` similarity formula systematically +floors to 0 because L2 distances on normalized 384-dim vectors routinely +exceed 1.0 — users then see flat ``Match: 0.0`` across every result and +have no signal that their palace is broken. + +This test file locks the invariant so a future refactor that drops the +``metadata={"hnsw:space": "cosine"}`` parameter from any creation path +gets caught at test time rather than silently degrading search quality. +""" + +import tempfile + +from mempalace.backends.chroma import ChromaBackend +from mempalace.palace import get_collection + + +EXPECTED_METRIC = "cosine" + + +def _assert_cosine(col, where: str) -> None: + meta = col.metadata if hasattr(col, "metadata") else col._collection.metadata + assert isinstance(meta, dict), f"{where}: expected metadata dict, got {meta!r}" + assert meta.get("hnsw:space") == EXPECTED_METRIC, ( + f"{where}: expected hnsw:space={EXPECTED_METRIC!r}, got {meta!r}. " + "A collection without cosine metric will silently break the " + "similarity formula used by the searcher." + ) + + +def test_legacy_get_or_create_collection_sets_cosine(tmp_path): + backend = ChromaBackend() + col = backend.get_or_create_collection(str(tmp_path), "mempalace_drawers") + _assert_cosine(col, "legacy get_or_create_collection") + + +def test_legacy_create_collection_sets_cosine(tmp_path): + backend = ChromaBackend() + col = backend.create_collection(str(tmp_path), "mempalace_drawers") + _assert_cosine(col, "legacy create_collection") + + +def test_new_get_collection_with_create_sets_cosine(tmp_path): + """RFC 001 typed surface — ``get_collection(..., create=True)`` is the + path the miner + init flow take. Must also set cosine.""" + backend = ChromaBackend() + col = backend.get_collection(str(tmp_path), "mempalace_drawers", create=True) + _assert_cosine(col, "get_collection(create=True)") + + +def test_palace_module_get_collection_sets_cosine(tmp_path): + """The public ``mempalace.palace.get_collection`` is what most callers + use. Must produce cosine palaces.""" + col = get_collection(str(tmp_path), "mempalace_drawers", create=True) + _assert_cosine(col, "palace.get_collection(create=True)") + + +def test_reopening_cosine_palace_preserves_metric(tmp_path): + """Opening a previously-created cosine palace (create=False) must + still expose the cosine metadata — catches any regression where + reopening drops or overwrites metadata.""" + backend = ChromaBackend() + backend.create_collection(str(tmp_path), "mempalace_drawers") + # Fresh backend simulates a process restart + backend2 = ChromaBackend() + col = backend2.get_collection(str(tmp_path), "mempalace_drawers", create=False) + _assert_cosine(col, "re-opened palace") + + +def test_fresh_palace_via_full_stack_gets_cosine(): + """End-to-end: build a palace with the public API the way a new user + would, confirm the resulting collection uses cosine distance.""" + with tempfile.TemporaryDirectory() as tmp: + col = get_collection(tmp, "mempalace_drawers", create=True) + _assert_cosine(col, "full-stack new palace") + + # And the closets collection too + closets = get_collection(tmp, "mempalace_closets", create=True) + _assert_cosine(closets, "full-stack new closets") diff --git a/tests/test_searcher.py b/tests/test_searcher.py index 3ccfb2d..65191c3 100644 --- a/tests/test_searcher.py +++ b/tests/test_searcher.py @@ -173,6 +173,85 @@ class TestSearchCLI: # Should have output with at least one result block assert "[1]" in captured.out + def test_search_applies_bm25_hybrid_rerank(self, capsys): + """CLI search must call the same hybrid rerank that the MCP path uses. + + Regression for a bug where the CLI only consulted ChromaDB cosine + distance: a drawer whose body contained every query term still + scored zero similarity if its embedding happened to be far from + the query (e.g. the drawer was a shell-output fragment that + embeds as "file tree noise"). Hybrid rerank fixes this by + combining BM25 with cosine — lexical matches rise above pure + vector noise. + + Simulates: three candidates, all with distance >= 1.0 (cosine = 0); + candidate 2 contains every query term. After the fix, candidate 2 + should rank first and display a non-zero bm25 score. + """ + mock_col = MagicMock() + mock_col.metadata = {"hnsw:space": "cosine"} + mock_col.query.return_value = { + "documents": [ + [ + "unrelated directory listing -rw-rw-r-- file.txt", + "foo bar baz is a multi-word phrase", + "another unrelated chunk about colors", + ] + ], + "metadatas": [ + [ + {"source_file": "a.md", "wing": "w", "room": "r"}, + {"source_file": "b.md", "wing": "w", "room": "r"}, + {"source_file": "c.md", "wing": "w", "room": "r"}, + ] + ], + "distances": [[1.5, 1.5, 1.5]], + } + with patch("mempalace.searcher.get_collection", return_value=mock_col): + search("foo bar baz", "/fake/path") + captured = capsys.readouterr() + first_block, _, _ = captured.out.partition("[2]") + # Lexical match must rank first + assert ( + "b.md" in first_block + ), f"expected lexical match 'b.md' at rank 1, got:\n{captured.out}" + # Non-zero bm25 reported + assert "bm25=" in first_block + assert "bm25=0.0" not in first_block + # Cosine still reported for transparency + assert "cosine=" in first_block + + def test_search_warns_when_palace_uses_wrong_distance_metric(self, capsys): + """Legacy palaces created without `hnsw:space=cosine` silently + use L2, which breaks similarity interpretation. CLI must warn + the user and point them at `mempalace repair` rather than + pretending the `Match` scores are meaningful.""" + mock_col = MagicMock() + mock_col.metadata = {} # legacy: no hnsw:space set + mock_col.query.return_value = { + "documents": [["some drawer content"]], + "metadatas": [[{"source_file": "a.md", "wing": "w", "room": "r"}]], + "distances": [[1.2]], + } + with patch("mempalace.searcher.get_collection", return_value=mock_col): + search("anything", "/fake/path") + captured = capsys.readouterr() + assert "mempalace repair" in captured.err + assert "cosine" in captured.err.lower() + + def test_search_does_not_warn_when_palace_is_correctly_configured(self, capsys): + mock_col = MagicMock() + mock_col.metadata = {"hnsw:space": "cosine"} + mock_col.query.return_value = { + "documents": [["some drawer content"]], + "metadatas": [[{"source_file": "a.md", "wing": "w", "room": "r"}]], + "distances": [[0.3]], + } + with patch("mempalace.searcher.get_collection", return_value=mock_col): + search("anything", "/fake/path") + captured = capsys.readouterr() + assert "mempalace repair" not in captured.err + def test_search_handles_none_metadata_without_crash(self, palace_path, capsys): """ChromaDB can return `None` entries in the metadatas list when a drawer has no metadata. The CLI print path must not crash on them