Merge pull request #1179 from MemPalace/fix/search-metric-quality

fix(search): CLI hybrid rerank, legacy-metric warning, invariant tests (3.3.4)
This commit is contained in:
Igor Lins e Silva
2026-04-25 00:57:25 -03:00
committed by GitHub
5 changed files with 243 additions and 5 deletions
+5
View File
@@ -8,6 +8,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
## [3.3.4] — unreleased ## [3.3.4] — unreleased
### Bug Fixes
- **CLI `mempalace search` retrieval quality.** The CLI was using pure ChromaDB cosine distance with no BM25 rerank, so drawers containing every query term but embedding as noise (directory listings, diff output, shell logs) scored `Match: 0.0` alongside genuinely irrelevant results with no way to tell them apart. Wired the CLI through the same `_hybrid_rank` the `mempalace_search` MCP tool already used, and surfaced both `cosine=` and `bm25=` scores in the output so users see which component of the match is firing. MCP search was unaffected; this fixes the human-facing CLI parity gap.
- **Legacy-palace distance-metric warning.** CLI search now detects palaces created before `hnsw:space=cosine` was consistently set and prints a one-line notice pointing at `mempalace repair`. Without the warning such palaces silently used L2 distance, under which the similarity display floored every result to `Match: 0.0`. New palaces mined today already set cosine correctly and now have invariant tests pinning that behavior so future refactors can't silently regress it. (#1179)
### Added ### Added
- **Cross-wing topic tunnels.** When two wings have confirmed `TOPIC` labels in common (the LLM-refine bucket from `mempalace init --llm`), the miner now drops a symmetric tunnel between them at mine time so the palace graph reflects shared themes (frameworks, vendors, recurring concepts). Tunnels are routed through the existing `create_tunnel` storage so they share dedup and persistence with explicit tunnels. Topic tunnels are stored under a synthetic `topic:<name>` room and tagged with `kind: "topic"` on the stored dict — this keeps them distinct from literal folder-derived rooms of the same name (a wing with both an `Angular` folder room and an `Angular` topic tunnel no longer collides at `follow_tunnels` read time) and gives LLMs scanning `list_tunnels` a visible discriminator. Threshold is configurable via `MEMPALACE_TOPIC_TUNNEL_MIN_COUNT` env var or `topic_tunnel_min_count` in `~/.mempalace/config.json` (default `1`). Manifest-dependency overlap and per-topic allow/deny lists remain out of scope. (#1180) - **Cross-wing topic tunnels.** When two wings have confirmed `TOPIC` labels in common (the LLM-refine bucket from `mempalace init --llm`), the miner now drops a symmetric tunnel between them at mine time so the palace graph reflects shared themes (frameworks, vendors, recurring concepts). Tunnels are routed through the existing `create_tunnel` storage so they share dedup and persistence with explicit tunnels. Topic tunnels are stored under a synthetic `topic:<name>` room and tagged with `kind: "topic"` on the stored dict — this keeps them distinct from literal folder-derived rooms of the same name (a wing with both an `Angular` folder room and an `Angular` topic tunnel no longer collides at `follow_tunnels` read time) and gives LLMs scanning `list_tunnels` a visible discriminator. Threshold is configurable via `MEMPALACE_TOPIC_TUNNEL_MIN_COUNT` env var or `topic_tunnel_min_count` in `~/.mempalace/config.json` (default `1`). Manifest-dependency overlap and per-topic allow/deny lists remain out of scope. (#1180)
+12
View File
@@ -368,6 +368,18 @@ class ChromaCollection(BaseCollection):
def count(self): def count(self):
return self._collection.count() return self._collection.count()
@property
def metadata(self) -> dict:
"""Pass-through to the underlying ChromaDB collection's metadata.
Used by the searcher to detect legacy palaces that were created
without ``hnsw:space=cosine`` and therefore silently use L2
distance, which breaks cosine-based similarity interpretation.
Returns ``{}`` when metadata is absent so callers can do a plain
``.get("hnsw:space")`` without None-checks.
"""
return self._collection.metadata or {}
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Backend # Backend
+60 -5
View File
@@ -236,6 +236,42 @@ def _expand_with_neighbors(drawers_col, matched_doc: str, matched_meta: dict, ra
} }
def _warn_if_legacy_metric(col) -> None:
"""Print a one-line notice if the palace was created without
``hnsw:space=cosine``.
ChromaDB's default is L2 (Euclidean), under which cosine-based
similarity interpretation falls apart — distances routinely exceed
1.0 and the display ``max(0, 1 - dist)`` floors every result to 0.
Legacy palaces (mined before this metadata was consistently set)
need ``mempalace repair`` to rebuild with the correct metric.
The warning fires only for palaces that clearly have the wrong
metric; palaces with no metadata table at all (empty dict) also
fall under this check since that is the signal of a pre-metadata
palace.
"""
try:
meta = getattr(col, "metadata", None)
except Exception:
return
if not isinstance(meta, dict):
return
space = meta.get("hnsw:space")
if space == "cosine":
return
# Either missing or set to something else — both are suspect.
import sys as _sys
detail = f"hnsw:space={space!r}" if space else "no hnsw:space metadata"
print(
f"\n NOTICE: this palace was created without cosine distance ({detail}).\n"
" Semantic similarity scores will not be meaningful.\n"
" Run `mempalace repair` to rebuild the index with the correct metric.",
file=_sys.stderr,
)
def search(query: str, palace_path: str, wing: str = None, room: str = None, n_results: int = 5): def search(query: str, palace_path: str, wing: str = None, room: str = None, n_results: int = 5):
""" """
Search the palace. Returns verbatim drawer content. Search the palace. Returns verbatim drawer content.
@@ -248,6 +284,10 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
print(" Run: mempalace init <dir> then mempalace mine <dir>") print(" Run: mempalace init <dir> then mempalace mine <dir>")
raise SearchError(f"No palace found at {palace_path}") raise SearchError(f"No palace found at {palace_path}")
# Alert the user if this palace predates hnsw:space=cosine being set on
# creation — their similarity scores will be junk until they run repair.
_warn_if_legacy_metric(col)
where = build_where_filter(wing, room) where = build_where_filter(wing, room)
try: try:
@@ -273,6 +313,20 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
print(f'\n No results found for: "{query}"') print(f'\n No results found for: "{query}"')
return return
# Pure-cosine retrieval on the CLI path was missing lexical matches:
# a drawer whose text contains every query term can still score distance
# >= 1.0 against the natural-language query when the drawer is a
# mechanical artifact (directory listing, diff, log fragment) that
# embeds as file-tree noise rather than as prose about its subject.
# The MCP tool path already hybridizes BM25 with vector sim via
# `_hybrid_rank`; do the same here so CLI results match what agents
# see via `mempalace_search`.
hits = [
{"text": doc, "distance": float(dist), "metadata": meta or {}}
for doc, meta, dist in zip(docs, metas, dists)
]
hits = _hybrid_rank(hits, query)
print(f"\n{'=' * 60}") print(f"\n{'=' * 60}")
print(f' Results for: "{query}"') print(f' Results for: "{query}"')
if wing: if wing:
@@ -281,19 +335,20 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
print(f" Room: {room}") print(f" Room: {room}")
print(f"{'=' * 60}\n") print(f"{'=' * 60}\n")
for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists), 1): for i, hit in enumerate(hits, 1):
similarity = round(max(0.0, 1 - dist), 3) vec_sim = round(max(0.0, 1 - hit["distance"]), 3)
meta = meta or {} bm25 = hit.get("bm25_score", 0.0)
meta = hit["metadata"]
source = Path(meta.get("source_file", "?")).name source = Path(meta.get("source_file", "?")).name
wing_name = meta.get("wing", "?") wing_name = meta.get("wing", "?")
room_name = meta.get("room", "?") room_name = meta.get("room", "?")
print(f" [{i}] {wing_name} / {room_name}") print(f" [{i}] {wing_name} / {room_name}")
print(f" Source: {source}") print(f" Source: {source}")
print(f" Match: {similarity}") print(f" Match: cosine={vec_sim} bm25={bm25}")
print() print()
# Print the verbatim text, indented # Print the verbatim text, indented
for line in doc.strip().split("\n"): for line in hit["text"].strip().split("\n"):
print(f" {line}") print(f" {line}")
print() print()
print(f" {'' * 56}") print(f" {'' * 56}")
+87
View File
@@ -0,0 +1,87 @@
"""Invariant tests: every ChromaDB collection-creation path must set
``hnsw:space=cosine``.
Reason: ChromaDB's default HNSW distance is L2 (Euclidean). Under L2,
the searcher's ``max(0, 1 - distance)`` similarity formula systematically
floors to 0 because L2 distances on normalized 384-dim vectors routinely
exceed 1.0 — users then see flat ``Match: 0.0`` across every result and
have no signal that their palace is broken.
This test file locks the invariant so a future refactor that drops the
``metadata={"hnsw:space": "cosine"}`` parameter from any creation path
gets caught at test time rather than silently degrading search quality.
"""
from mempalace.backends.chroma import ChromaBackend
from mempalace.palace import get_collection
EXPECTED_METRIC = "cosine"
def _assert_cosine(col, where: str) -> None:
meta = col.metadata if hasattr(col, "metadata") else col._collection.metadata
assert isinstance(meta, dict), f"{where}: expected metadata dict, got {meta!r}"
assert meta.get("hnsw:space") == EXPECTED_METRIC, (
f"{where}: expected hnsw:space={EXPECTED_METRIC!r}, got {meta!r}. "
"A collection without cosine metric will silently break the "
"similarity formula used by the searcher."
)
def test_legacy_get_or_create_collection_sets_cosine(tmp_path):
backend = ChromaBackend()
col = backend.get_or_create_collection(str(tmp_path), "mempalace_drawers")
_assert_cosine(col, "legacy get_or_create_collection")
def test_legacy_create_collection_sets_cosine(tmp_path):
backend = ChromaBackend()
col = backend.create_collection(str(tmp_path), "mempalace_drawers")
_assert_cosine(col, "legacy create_collection")
def test_new_get_collection_with_create_sets_cosine(tmp_path):
"""RFC 001 typed surface — ``get_collection(..., create=True)`` is the
path the miner + init flow take. Must also set cosine."""
backend = ChromaBackend()
col = backend.get_collection(str(tmp_path), "mempalace_drawers", create=True)
_assert_cosine(col, "get_collection(create=True)")
def test_palace_module_get_collection_sets_cosine(tmp_path):
"""The public ``mempalace.palace.get_collection`` is what most callers
use. Must produce cosine palaces."""
col = get_collection(str(tmp_path), "mempalace_drawers", create=True)
_assert_cosine(col, "palace.get_collection(create=True)")
def test_reopening_cosine_palace_preserves_metric(tmp_path):
"""Opening a previously-created cosine palace (create=False) must
still expose the cosine metadata — catches any regression where
reopening drops or overwrites metadata."""
backend = ChromaBackend()
backend.create_collection(str(tmp_path), "mempalace_drawers")
# Fresh backend simulates a process restart
backend2 = ChromaBackend()
col = backend2.get_collection(str(tmp_path), "mempalace_drawers", create=False)
_assert_cosine(col, "re-opened palace")
def test_fresh_palace_via_full_stack_gets_cosine(tmp_path):
"""End-to-end: build a palace with the public API the way a new user
would, confirm the resulting collection uses cosine distance.
Uses the ``tmp_path`` fixture rather than ``tempfile.TemporaryDirectory``
so ChromaDB's persistent SQLite file handles aren't asked to release
during the test body — pytest cleans the path at session end, by which
point the process is exiting and Windows' file-lock contention is
moot. Matches the cleanup strategy used by the rest of this file and
the project's 80% Windows coverage note in CLAUDE.md.
"""
col = get_collection(str(tmp_path), "mempalace_drawers", create=True)
_assert_cosine(col, "full-stack new palace")
# And the closets collection too
closets = get_collection(str(tmp_path), "mempalace_closets", create=True)
_assert_cosine(closets, "full-stack new closets")
+79
View File
@@ -173,6 +173,85 @@ class TestSearchCLI:
# Should have output with at least one result block # Should have output with at least one result block
assert "[1]" in captured.out assert "[1]" in captured.out
def test_search_applies_bm25_hybrid_rerank(self, capsys):
"""CLI search must call the same hybrid rerank that the MCP path uses.
Regression for a bug where the CLI only consulted ChromaDB cosine
distance: a drawer whose body contained every query term still
scored zero similarity if its embedding happened to be far from
the query (e.g. the drawer was a shell-output fragment that
embeds as "file tree noise"). Hybrid rerank fixes this by
combining BM25 with cosine — lexical matches rise above pure
vector noise.
Simulates: three candidates, all with distance >= 1.0 (cosine = 0);
candidate 2 contains every query term. After the fix, candidate 2
should rank first and display a non-zero bm25 score.
"""
mock_col = MagicMock()
mock_col.metadata = {"hnsw:space": "cosine"}
mock_col.query.return_value = {
"documents": [
[
"unrelated directory listing -rw-rw-r-- file.txt",
"foo bar baz is a multi-word phrase",
"another unrelated chunk about colors",
]
],
"metadatas": [
[
{"source_file": "a.md", "wing": "w", "room": "r"},
{"source_file": "b.md", "wing": "w", "room": "r"},
{"source_file": "c.md", "wing": "w", "room": "r"},
]
],
"distances": [[1.5, 1.5, 1.5]],
}
with patch("mempalace.searcher.get_collection", return_value=mock_col):
search("foo bar baz", "/fake/path")
captured = capsys.readouterr()
first_block, _, _ = captured.out.partition("[2]")
# Lexical match must rank first
assert (
"b.md" in first_block
), f"expected lexical match 'b.md' at rank 1, got:\n{captured.out}"
# Non-zero bm25 reported
assert "bm25=" in first_block
assert "bm25=0.0" not in first_block
# Cosine still reported for transparency
assert "cosine=" in first_block
def test_search_warns_when_palace_uses_wrong_distance_metric(self, capsys):
"""Legacy palaces created without `hnsw:space=cosine` silently
use L2, which breaks similarity interpretation. CLI must warn
the user and point them at `mempalace repair` rather than
pretending the `Match` scores are meaningful."""
mock_col = MagicMock()
mock_col.metadata = {} # legacy: no hnsw:space set
mock_col.query.return_value = {
"documents": [["some drawer content"]],
"metadatas": [[{"source_file": "a.md", "wing": "w", "room": "r"}]],
"distances": [[1.2]],
}
with patch("mempalace.searcher.get_collection", return_value=mock_col):
search("anything", "/fake/path")
captured = capsys.readouterr()
assert "mempalace repair" in captured.err
assert "cosine" in captured.err.lower()
def test_search_does_not_warn_when_palace_is_correctly_configured(self, capsys):
mock_col = MagicMock()
mock_col.metadata = {"hnsw:space": "cosine"}
mock_col.query.return_value = {
"documents": [["some drawer content"]],
"metadatas": [[{"source_file": "a.md", "wing": "w", "room": "r"}]],
"distances": [[0.3]],
}
with patch("mempalace.searcher.get_collection", return_value=mock_col):
search("anything", "/fake/path")
captured = capsys.readouterr()
assert "mempalace repair" not in captured.err
def test_search_handles_none_metadata_without_crash(self, palace_path, capsys): def test_search_handles_none_metadata_without_crash(self, palace_path, capsys):
"""ChromaDB can return `None` entries in the metadatas list when a """ChromaDB can return `None` entries in the metadatas list when a
drawer has no metadata. The CLI print path must not crash on them drawer has no metadata. The CLI print path must not crash on them