From 0e32b9643c3d4ceca36b6f610ddbcbbf58ad8dd7 Mon Sep 17 00:00:00 2001 From: Mika Cohen Date: Thu, 30 Apr 2026 21:47:08 -0600 Subject: [PATCH] fix: avoid false hnsw divergence fallback --- mempalace/backends/chroma.py | 27 +++--- mempalace/searcher.py | 43 ++++++++-- tests/test_hnsw_capacity.py | 160 ++++++++++++++++++++++++++++++++++- 3 files changed, 204 insertions(+), 26 deletions(-) diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 646969b..7ce8fd8 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -491,22 +491,17 @@ def hnsw_capacity_status(palace_path: str, collection_name: str = "mempalace_dra divergence_floor = max(_HNSW_DIVERGENCE_FALLBACK_FLOOR, 2 * sync_threshold) if hnsw_count is None: - # No pickle yet — segment hasn't persisted metadata. Could be - # fresh-but-unflushed (normal) or interrupted-mid-flush (bad). - # We can't distinguish without the pickle, so only flag - # divergence when sqlite holds clearly more than two flush - # windows worth — same threshold as the with-pickle path. - if sqlite_count > divergence_floor: - out["status"] = "diverged" - out["diverged"] = True - out["divergence"] = sqlite_count - out["message"] = ( - f"sqlite holds {sqlite_count:,} embeddings but the HNSW segment " - "has never flushed metadata — vector search will return nothing " - "until the segment is rebuilt. Run `mempalace repair`." - ) - else: - out["message"] = "HNSW segment metadata not yet flushed; skipping" + # No pickle yet, so this probe cannot measure HNSW capacity. + # Chroma 1.5.x can have binary HNSW files without a flushed + # metadata pickle; absence of the pickle alone is not proof that + # vector search is unusable or dangerous. Keep the status unknown + # so MCP does not globally disable vectors on an inconclusive + # signal. Corrupt/invalid metadata, when present, is handled by + # quarantine_invalid_hnsw_metadata before Chroma opens. + out["message"] = ( + "HNSW capacity unavailable: metadata has not been flushed; " + "leaving vector search enabled" + ) return out divergence = sqlite_count - hnsw_count diff --git a/mempalace/searcher.py b/mempalace/searcher.py index a14d90d..4ff0f23 100644 --- a/mempalace/searcher.py +++ b/mempalace/searcher.py @@ -396,6 +396,31 @@ def _bm25_only_via_sqlite( "hint": "Run: mempalace init && mempalace mine ", } + def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]: + clauses = [] + params = [] + for key, value in (("wing", wing), ("room", room)): + if not value: + continue + clauses.append( + f""" + AND EXISTS ( + SELECT 1 + FROM embedding_metadata mf + WHERE mf.id = {row_id_expr} + AND mf.key = ? + AND COALESCE( + mf.string_value, + CAST(mf.int_value AS TEXT), + CAST(mf.float_value AS TEXT), + CAST(mf.bool_value AS TEXT) + ) = ? + ) + """ + ) + params.extend([key, value]) + return "".join(clauses), params + try: conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) except sqlite3.Error as e: @@ -408,15 +433,17 @@ def _bm25_only_via_sqlite( candidate_ids: list[int] = [] if tokens: fts_query = " OR ".join(tokens) + filter_sql, filter_params = _metadata_filter_sql("embedding_fulltext_search.rowid") try: rows = conn.execute( - """ + f""" SELECT rowid FROM embedding_fulltext_search WHERE embedding_fulltext_search MATCH ? + {filter_sql} LIMIT ? """, - (fts_query, max_candidates), + (fts_query, *filter_params, max_candidates), ).fetchall() candidate_ids = [r[0] for r in rows] except sqlite3.Error: @@ -434,17 +461,19 @@ def _bm25_only_via_sqlite( # fall back to ordering by primary-key id and finally to an # empty result rather than letting search raise. try: + filter_sql, filter_params = _metadata_filter_sql("e.id") rows = conn.execute( - """ + f""" SELECT e.id FROM embeddings e JOIN segments s ON e.segment_id = s.id JOIN collections c ON s.collection = c.id WHERE c.name = 'mempalace_drawers' + {filter_sql} ORDER BY e.created_at DESC LIMIT ? """, - (max_candidates,), + (*filter_params, max_candidates), ).fetchall() candidate_ids = [r[0] for r in rows] except sqlite3.Error: @@ -453,17 +482,19 @@ def _bm25_only_via_sqlite( exc_info=True, ) try: + filter_sql, filter_params = _metadata_filter_sql("e.id") rows = conn.execute( - """ + f""" SELECT e.id FROM embeddings e JOIN segments s ON e.segment_id = s.id JOIN collections c ON s.collection = c.id WHERE c.name = 'mempalace_drawers' + {filter_sql} ORDER BY e.id DESC LIMIT ? """, - (max_candidates,), + (*filter_params, max_candidates), ).fetchall() candidate_ids = [r[0] for r in rows] except sqlite3.Error: diff --git a/tests/test_hnsw_capacity.py b/tests/test_hnsw_capacity.py index 512fc9c..912def8 100644 --- a/tests/test_hnsw_capacity.py +++ b/tests/test_hnsw_capacity.py @@ -238,14 +238,39 @@ def test_capacity_status_tolerates_flush_lag(tmp_path): assert info["status"] == "ok" -def test_capacity_status_flags_unflushed_with_large_sqlite(tmp_path): - """No pickle + many sqlite rows is its own divergence signal.""" +def test_capacity_status_does_not_flag_unflushed_with_large_sqlite(tmp_path): + """No pickle + many sqlite rows is inconclusive, not divergence.""" seg = "seg-noflush" _seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg) info = hnsw_capacity_status(str(tmp_path), COLLECTION) - assert info["diverged"] is True + assert info["diverged"] is False + assert info["status"] == "unknown" + assert info["divergence"] is None assert info["hnsw_count"] is None - assert "never flushed" in info["message"] + assert "capacity unavailable" in info["message"] + assert "leaving vector search enabled" in info["message"] + + +def test_mcp_probe_does_not_disable_vectors_for_unflushed_metadata(tmp_path, monkeypatch): + """The MCP preflight must not route all searches to BM25 on this signal.""" + from mempalace import mcp_server + + seg = "seg-mcp-noflush" + _seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg) + + class _Cfg: + palace_path = str(tmp_path) + + monkeypatch.setattr(mcp_server, "_config", _Cfg()) + monkeypatch.setattr(mcp_server, "_vector_disabled", True) + monkeypatch.setattr(mcp_server, "_vector_disabled_reason", "old divergence") + + mcp_server._refresh_vector_disabled_flag() + + assert mcp_server._vector_disabled is False + assert mcp_server._vector_disabled_reason == "" + assert mcp_server._vector_capacity_status["status"] == "unknown" + assert "leaving vector search enabled" in mcp_server._vector_capacity_status["message"] def test_capacity_status_quiet_for_empty_palace(tmp_path): @@ -372,6 +397,17 @@ def _seed_drawers(palace: str, segment_id: str, drawers: list[tuple[str, dict, s conn.close() +def _set_drawer_created_at(palace: str, timestamps: dict[int, str]) -> None: + db_path = os.path.join(palace, "chroma.sqlite3") + conn = sqlite3.connect(db_path) + try: + for emb_id, created_at in timestamps.items(): + conn.execute("UPDATE embeddings SET created_at = ? WHERE id = ?", (created_at, emb_id)) + conn.commit() + finally: + conn.close() + + @pytest.fixture def palace_with_drawers(tmp_path): seg = "seg-bm25" @@ -417,6 +453,122 @@ def test_bm25_fallback_filters_by_wing(palace_with_drawers): assert all(r["wing"] == "design" for r in out["results"]) +def test_bm25_fallback_applies_wing_before_fts_candidate_limit(tmp_path): + seg = "seg-bm25-fts-limit" + _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg) + _seed_drawers( + str(tmp_path), + seg, + [ + ( + "shared token outside target wing", + {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"}, + "d-1", + ), + ( + "shared token inside target wing", + {"wing": "project", "room": "diary", "source_file": "/x/project.md"}, + "d-2", + ), + ], + ) + + out = _bm25_only_via_sqlite("shared token", str(tmp_path), wing="project", max_candidates=1) + + assert out["total_before_filter"] == 1 + assert len(out["results"]) == 1 + assert out["results"][0]["wing"] == "project" + + +def test_bm25_fallback_applies_room_before_fts_candidate_limit(tmp_path): + seg = "seg-bm25-room-limit" + _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg) + _seed_drawers( + str(tmp_path), + seg, + [ + ( + "shared token wrong room", + {"wing": "project", "room": "scratch", "source_file": "/x/scratch.md"}, + "d-1", + ), + ( + "shared token right room", + {"wing": "project", "room": "diary", "source_file": "/x/diary.md"}, + "d-2", + ), + ], + ) + + out = _bm25_only_via_sqlite( + "shared token", + str(tmp_path), + wing="project", + room="diary", + max_candidates=1, + ) + + assert out["total_before_filter"] == 1 + assert len(out["results"]) == 1 + assert out["results"][0]["wing"] == "project" + assert out["results"][0]["room"] == "diary" + + +def test_bm25_fallback_applies_wing_before_recency_candidate_limit(tmp_path): + seg = "seg-bm25-recency-limit" + _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg) + _seed_drawers( + str(tmp_path), + seg, + [ + ( + "target drawer for short query", + {"wing": "project", "room": "diary", "source_file": "/x/project.md"}, + "d-1", + ), + ( + "newer drawer outside target wing", + {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"}, + "d-2", + ), + ], + ) + _set_drawer_created_at( + str(tmp_path), + { + 1: "2026-01-01 00:00:00", + 2: "2026-02-01 00:00:00", + }, + ) + + out = _bm25_only_via_sqlite("a", str(tmp_path), wing="project", max_candidates=1) + + assert out["total_before_filter"] == 1 + assert len(out["results"]) == 1 + assert out["results"][0]["wing"] == "project" + + +def test_bm25_fallback_returns_empty_when_filtered_wing_has_no_candidates(tmp_path): + seg = "seg-bm25-empty-filter" + _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg) + _seed_drawers( + str(tmp_path), + seg, + [ + ( + "shared token outside target wing", + {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"}, + "d-1", + ), + ], + ) + + out = _bm25_only_via_sqlite("shared token", str(tmp_path), wing="project", max_candidates=1) + + assert out["total_before_filter"] == 0 + assert out["results"] == [] + + def test_bm25_fallback_no_palace(tmp_path): out = _bm25_only_via_sqlite("anything", str(tmp_path)) assert "error" in out