fix(repair): detect HNSW capacity divergence and fall back to BM25 (#1222)

When chromadb's HNSW segment freezes at a stale max_elements while
sqlite keeps accumulating embeddings, the next chromadb open segfaults
the MCP server on every tool call. Adds a pure-filesystem capacity probe
(zero chromadb interaction), a `mempalace repair-status` read-only
health check, and a BM25-only sqlite fallback so the palace stays
reachable even when vector search is unavailable.

* `hnsw_capacity_status` reads sqlite + index_metadata.pickle directly
  via a tight-allowlist unpickler — no hnswlib import, no segment load.
* MCP server runs the probe at startup and after every reconnect; sets
  `_vector_disabled` and routes search to the sqlite FTS5 + BM25 path.
* `tool_status` and `tool_reconnect` surface the fallback state.
* Threshold tuned for chromadb 1.5.x async-flush lag (2× sync_threshold).
This commit is contained in:
Igor Lins e Silva
2026-04-26 19:54:00 -03:00
parent 899a5ec4c6
commit 0d349c3d86
6 changed files with 988 additions and 7 deletions
+61 -4
View File
@@ -6,8 +6,9 @@ When ChromaDB's HNSW index accumulates duplicate entries (from repeated
add() calls with the same ID), link_lists.bin can grow unbounded —
terabytes on large palaces — eventually causing segfaults.
This module provides three operations:
This module provides four operations:
status — compare sqlite vs HNSW element counts (read-only health check)
scan — find every corrupt/unfetchable ID in the palace
prune — delete only the corrupt IDs (surgical)
rebuild — extract all drawers, delete the collection, recreate with
@@ -17,6 +18,7 @@ The rebuild backs up ONLY chroma.sqlite3 (the source of truth), not the
full palace directory — so it works even when link_lists.bin is bloated.
Usage (standalone):
python -m mempalace.repair status
python -m mempalace.repair scan [--wing X]
python -m mempalace.repair prune --confirm
python -m mempalace.repair rebuild
@@ -32,7 +34,7 @@ import os
import shutil
import time
from .backends.chroma import ChromaBackend
from .backends.chroma import ChromaBackend, hnsw_capacity_status
COLLECTION_NAME = "mempalace_drawers"
@@ -431,9 +433,62 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
print(f"\n{'=' * 55}\n")
def status(palace_path=None) -> dict:
"""Read-only health check: compare sqlite vs HNSW element counts.
Catches the #1222 failure mode where chromadb's HNSW segment freezes
at a stale ``max_elements`` while sqlite keeps accumulating rows.
Once the divergence is large enough, every tool call segfaults when
chromadb tries to load the undersized HNSW. Running ``mempalace
repair status`` *before* opening the segment lets the operator
discover the problem without crashing the MCP server.
The check itself never opens a chromadb client and never imports
hnswlib — it reads ``chroma.sqlite3`` and ``index_metadata.pickle``
directly via :func:`mempalace.backends.chroma.hnsw_capacity_status`.
Returns the capacity-status dict (also printed). Returns a dict with
``status="unknown"`` when no palace exists at the given path.
"""
palace_path = palace_path or _get_palace_path()
print(f"\n{'=' * 55}")
print(" MemPalace Repair — Status")
print(f"{'=' * 55}\n")
print(f" Palace: {palace_path}")
if not os.path.isdir(palace_path):
print(" No palace found.\n")
return {"status": "unknown", "message": "no palace at path"}
drawers = hnsw_capacity_status(palace_path, "mempalace_drawers")
closets = hnsw_capacity_status(palace_path, "mempalace_closets")
for label, info in (("drawers", drawers), ("closets", closets)):
print(f"\n [{label}]")
if info["sqlite_count"] is None:
print(" sqlite count: (unreadable)")
else:
print(f" sqlite count: {info['sqlite_count']:,}")
if info["hnsw_count"] is None:
print(" hnsw count: (no flushed metadata yet)")
else:
print(f" hnsw count: {info['hnsw_count']:,}")
if info["divergence"] is not None:
print(f" divergence: {info['divergence']:,}")
marker = "DIVERGED" if info["diverged"] else info["status"].upper()
print(f" status: {marker}")
if info["message"]:
print(f" note: {info['message']}")
if drawers["diverged"] or closets["diverged"]:
print("\n Recommended: run `mempalace repair rebuild` to rebuild the index.")
print()
return {"drawers": drawers, "closets": closets}
if __name__ == "__main__":
p = argparse.ArgumentParser(description="MemPalace repair tools")
p.add_argument("command", choices=["scan", "prune", "rebuild"])
p.add_argument("command", choices=["status", "scan", "prune", "rebuild"])
p.add_argument("--palace", default=None, help="Palace directory path")
p.add_argument("--wing", default=None, help="Scan only this wing")
p.add_argument("--confirm", action="store_true", help="Actually delete corrupt IDs")
@@ -441,7 +496,9 @@ if __name__ == "__main__":
path = os.path.expanduser(args.palace) if args.palace else None
if args.command == "scan":
if args.command == "status":
status(palace_path=path)
elif args.command == "scan":
scan_palace(palace_path=path, only_wing=args.wing)
elif args.command == "prune":
prune_corrupt(palace_path=path, confirm=args.confirm)