fix(repair): detect HNSW capacity divergence and fall back to BM25 (#1222)

When chromadb's HNSW segment freezes at a stale max_elements while sqlite keeps accumulating embeddings, the next chromadb open segfaults the MCP server on every tool call. Adds a pure-filesystem capacity probe (zero chromadb interaction), a `mempalace repair-status` read-only health check, and a BM25-only sqlite fallback so the palace stays reachable even when vector search is unavailable. * `hnsw_capacity_status` reads sqlite + index_metadata.pickle directly via a tight-allowlist unpickler — no hnswlib import, no segment load. * MCP server runs the probe at startup and after every reconnect; sets `_vector_disabled` and routes search to the sqlite FTS5 + BM25 path. * `tool_status` and `tool_reconnect` surface the fallback state. * Threshold tuned for chromadb 1.5.x async-flush lag (2× sync_threshold).
2026-04-26 19:54:00 -03:00
parent 899a5ec4c6
commit 0d349c3d86
6 changed files with 988 additions and 7 deletions
@@ -57,7 +57,12 @@ from .config import (  # noqa: E402
    sanitize_content,
 )
 from .version import __version__  # noqa: E402
-from .backends.chroma import ChromaBackend, ChromaCollection, _pin_hnsw_threads  # noqa: E402
+from .backends.chroma import (  # noqa: E402
+    ChromaBackend,
+    ChromaCollection,
+    _pin_hnsw_threads,
+    hnsw_capacity_status,
+)
 from .query_sanitizer import sanitize_query  # noqa: E402
 from .searcher import search_memories  # noqa: E402
 from .palace_graph import (  # noqa: E402
@@ -108,6 +113,52 @@ _collection_cache = None
 _palace_db_inode = 0  # inode of chroma.sqlite3 at cache time
 _palace_db_mtime = 0.0  # mtime of chroma.sqlite3 at cache time

+# ── Vector-search disabled flag (#1222) ──────────────────────────────────
+# Set when ``hnsw_capacity_status`` reports a divergence between sqlite
+# and the HNSW segment large enough that chromadb would segfault on
+# segment load. While this is set, vector-shaped tools (``search``,
+# ``check_duplicate``) route to the sqlite-only BM25 fallback in
+# :func:`mempalace.searcher._bm25_only_via_sqlite`. Cleared after a
+# successful repair via :func:`tool_reconnect` (which re-runs the probe).
+_vector_disabled = False
+_vector_disabled_reason = ""
+_vector_capacity_status: dict | None = None
+
+
+def _refresh_vector_disabled_flag() -> None:
+    """Re-run the HNSW capacity probe and update the module-level flag.
+
+    Called from :func:`_get_client` whenever the client cache is rebuilt
+    (first open or palace replacement). Cheap — pure sqlite + pickle
+    read, no chromadb interaction. Never raises: a probe that crashes
+    would defeat the point.
+    """
+    global _vector_disabled, _vector_disabled_reason, _vector_capacity_status
+    try:
+        info = hnsw_capacity_status(_config.palace_path, "mempalace_drawers")
+    except Exception:
+        logger.debug("HNSW capacity probe raised", exc_info=True)
+        return
+    _vector_capacity_status = info
+    if info.get("diverged"):
+        if not _vector_disabled:
+            logger.warning(
+                "HNSW capacity divergence detected (%s) — routing search to "
+                "BM25-only sqlite fallback. Run `mempalace repair rebuild` to restore "
+                "vector search.",
+                info.get("message", "unknown"),
+            )
+        _vector_disabled = True
+        _vector_disabled_reason = info.get("message", "")
+    else:
+        if _vector_disabled:
+            logger.info(
+                "HNSW capacity within tolerance (%s) — vector search re-enabled",
+                info.get("message", ""),
+            )
+        _vector_disabled = False
+        _vector_disabled_reason = ""
+

 # ==================== WRITE-AHEAD LOG ====================
 # Every write operation is logged to a JSONL file before execution.
@@ -202,6 +253,11 @@ def _get_client():
    mtime_changed = current_mtime != 0.0 and abs(current_mtime - _palace_db_mtime) > 0.01

    if _client_cache is None or inode_changed or mtime_changed:
+        # Run the HNSW capacity probe BEFORE chromadb opens the segment —
+        # if the index is severely undersized, segment load can segfault
+        # the whole MCP server (#1222). The probe is pure sqlite +
+        # metadata-pickle read; never touches the HNSW binary files.
+        _refresh_vector_disabled_flag()
        _client_cache = ChromaBackend.make_client(_config.palace_path)
        _collection_cache = None
        _metadata_cache = None
@@ -322,6 +378,17 @@ def tool_status():
        "protocol": PALACE_PROTOCOL,
        "aaak_dialect": AAAK_SPEC,
    }
+    if _vector_disabled:
+        # Surface the #1222 fallback state so the AI knows search results
+        # are BM25-only and recommends the operator run repair.
+        result["vector_disabled"] = True
+        result["vector_disabled_reason"] = _vector_disabled_reason
+        if _vector_capacity_status:
+            result["hnsw_capacity"] = {
+                "sqlite_count": _vector_capacity_status.get("sqlite_count"),
+                "hnsw_count": _vector_capacity_status.get("hnsw_count"),
+                "divergence": _vector_capacity_status.get("divergence"),
+            }
    try:
        all_meta = _get_cached_metadata(col)
        for m in all_meta:
@@ -456,6 +523,9 @@ def tool_search(
    dist = (1.0 - min_similarity) if min_similarity is not None else max_distance
    # Mitigate system prompt contamination (Issue #333)
    sanitized = sanitize_query(query)
+    # Ensure the capacity probe has been run at least once before we
+    # decide which path to take — _get_client triggers it on first open.
+    _get_client()
    result = search_memories(
        sanitized["clean_query"],
        palace_path=_config.palace_path,
@@ -463,7 +533,11 @@ def tool_search(
        room=room,
        n_results=limit,
        max_distance=dist,
+        vector_disabled=_vector_disabled,
    )
+    if _vector_disabled:
+        result["vector_disabled"] = True
+        result["vector_disabled_reason"] = _vector_disabled_reason
    # Attach sanitizer metadata for transparency
    if sanitized["was_sanitized"]:
        result["query_sanitized"] = True
@@ -482,6 +556,21 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
    col = _get_collection()
    if not col:
        return _no_palace()
+    if _vector_disabled:
+        # Without a usable HNSW we can't compute cosine similarity for
+        # near-duplicate detection. Report the limitation rather than
+        # silently returning "not a duplicate" — false negatives here
+        # would let the AI re-file content the palace already holds.
+        return {
+            "is_duplicate": False,
+            "matches": [],
+            "vector_disabled": True,
+            "vector_disabled_reason": _vector_disabled_reason,
+            "hint": (
+                "duplicate detection requires vector search; run "
+                "`mempalace repair rebuild` to restore"
+            ),
+        }
    try:
        results = col.query(
            query_texts=[content],
@@ -1150,10 +1239,22 @@ def tool_reconnect():
    Use after external scripts or CLI commands modify the palace database
    directly, which can leave the in-memory HNSW index stale.
    """
-    global _collection_cache, _palace_db_inode, _palace_db_mtime
+    global \
+        _client_cache, \
+        _collection_cache, \
+        _palace_db_inode, \
+        _palace_db_mtime, \
+        _vector_disabled, \
+        _vector_disabled_reason
+    _client_cache = None
    _collection_cache = None
    _palace_db_inode = 0
    _palace_db_mtime = 0.0
+    # Force probe re-run on next _get_client by clearing the flag now;
+    # _refresh_vector_disabled_flag will re-set it if the divergence
+    # still applies after the reconnect.
+    _vector_disabled = False
+    _vector_disabled_reason = ""
    try:
        col = _get_collection()
        if col is None:
@@ -1161,8 +1262,15 @@ def tool_reconnect():
                "success": False,
                "message": "No palace found after reconnect",
                "drawers": 0,
+                "vector_disabled": _vector_disabled,
            }
-        return {"success": True, "message": "Reconnected to palace", "drawers": col.count()}
+        return {
+            "success": True,
+            "message": "Reconnected to palace",
+            "drawers": col.count(),
+            "vector_disabled": _vector_disabled,
+            "vector_disabled_reason": _vector_disabled_reason,
+        }
    except Exception as e:
        return {"success": False, "error": str(e)}

@@ -1726,6 +1834,10 @@ def _restore_stdout():
 def main():
    _restore_stdout()
    logger.info("MemPalace MCP Server starting...")
+    # Pre-flight: probe HNSW capacity before any tool call so the warning
+    # is visible at startup rather than on first use (#1222). Pure
+    # filesystem read; never opens a chromadb client.
+    _refresh_vector_disabled_flag()
    while True:
        try:
            line = sys.stdin.readline()