diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 13976a2..ad7748f 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -211,6 +211,272 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis return moved +def _vector_segment_id(palace_path: str, collection_name: str) -> Optional[str]: + """Return the VECTOR segment UUID for ``collection_name`` or ``None``. + + Reads ``chroma.sqlite3`` directly so we never have to load a segment + that may segfault on open (#1222 is exactly this case). + """ + db_path = os.path.join(palace_path, "chroma.sqlite3") + if not os.path.isfile(db_path): + return None + try: + conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) + try: + row = conn.execute( + """ + SELECT s.id + FROM segments s + JOIN collections c ON s.collection = c.id + WHERE c.name = ? AND s.scope = 'VECTOR' + LIMIT 1 + """, + (collection_name,), + ).fetchone() + return row[0] if row else None + finally: + conn.close() + except sqlite3.Error: + return None + + +class _PersistentDataStub: + """Minimal stand-in for chromadb's ``PersistentData`` during safe unpickling. + + Accepts any constructor args so pickle's REDUCE opcode succeeds, + captures ``__setstate__`` into ``__dict__``. Only used by + :func:`_hnsw_element_count` — never persisted, never re-pickled. + """ + + def __init__(self, *args, **kwargs): + # Some chromadb versions pickle PersistentData by passing init args + # positionally via REDUCE. We don't care about reconstructing the + # object faithfully — we only need the id_to_label dict — so swallow + # all positional args and re-expose the relevant attributes via + # __setstate__ or __dict__ population further down. + pass + + def __setstate__(self, state): + if isinstance(state, dict): + self.__dict__.update(state) + elif isinstance(state, tuple) and len(state) == 2 and isinstance(state[1], dict): + # (slot_state, dict_state) two-tuple form — only the dict part has + # the named attributes we care about. + self.__dict__.update(state[1]) + + +class _SafePersistentDataUnpickler: + """Whitelist-only unpickler for ``index_metadata.pickle``. + + Allows only ``PersistentData`` from chromadb's HNSW module; everything + else raises ``UnpicklingError``. Standard container types (dict, list, + tuple, str, int, float) are handled by the pickle machinery itself + and don't need allowlisting via ``find_class`` — only constructed + classes do. + + This is the same trust model chromadb uses (it pickles its own files), + but with a tight class allowlist so a tampered file can't instantiate + arbitrary classes during deserialization. + """ + + _ALLOWED = frozenset( + { + ( + "chromadb.segment.impl.vector.local_persistent_hnsw", + "PersistentData", + ), + } + ) + + @classmethod + def load(cls, path: str): + import pickle + + class _Restricted(pickle.Unpickler): + def find_class(self, module: str, name: str): + if (module, name) in cls._ALLOWED: + return _PersistentDataStub + raise pickle.UnpicklingError(f"disallowed class: {module}.{name}") + + with open(path, "rb") as f: + return _Restricted(f).load() + + +def _hnsw_element_count(palace_path: str, segment_id: str) -> Optional[int]: + """Return the element count chromadb thinks the HNSW segment holds. + + Reads ``index_metadata.pickle`` via a tight-allowlist unpickler and + counts ``id_to_label`` entries. This is the count chromadb consults + when sizing/loading the HNSW index on next open — distinct from + hnswlib's internal ``cur_element_count`` in the binary files. For + #1222's divergence check this is the number that matters, because it + is what gets compared against ``count() * resize_factor`` when + chromadb decides whether to resize HNSW on load. + + Uses :class:`_SafePersistentDataUnpickler` rather than chromadb's own + ``PersistentData.load_from_file`` so the probe works even when + ``hnswlib`` is not installed (chromadb's persistent_hnsw module + imports hnswlib at module load — a probe that requires hnswlib would + refuse to run in environments where the segfault risk is moot + anyway). The allowlisted unpickler is also the safer default: the + pickle file is owned by the same user, but a tighter trust boundary + costs us nothing. + + Returns ``None`` when the file is absent (fresh / never-flushed + segment) or the unpickle fails. Callers treat ``None`` as "unknown". + """ + pickle_path = os.path.join(palace_path, segment_id, "index_metadata.pickle") + if not os.path.isfile(pickle_path): + return None + try: + pd = _SafePersistentDataUnpickler.load(pickle_path) + # ChromaDB serializes PersistentData differently across versions: + # 1.5.x writes a plain dict via ``__reduce_ex__``; older versions + # pickled the class instance and rely on ``__setstate__`` to + # populate ``__dict__``. Handle both shapes. + if isinstance(pd, dict): + id_to_label = pd.get("id_to_label") + else: + id_to_label = getattr(pd, "id_to_label", None) + if isinstance(id_to_label, dict): + return len(id_to_label) + return None + except Exception: + logger.debug("_hnsw_element_count failed for %s", pickle_path, exc_info=True) + return None + + +# Divergence threshold: chromadb's HNSW flushes asynchronously, so HNSW +# typically lags sqlite by up to ``sync_threshold`` (default 1000) records +# under active write load — that's the *brute-force batch* that hasn't +# been compacted into HNSW yet, plus the un-persisted tail beyond the +# last sync. Two synchronization windows worth (2 × sync_threshold = 2000) +# is a safe steady-state ceiling; anything past that is real divergence, +# not flush-lag. +# +# The #1222 case was 176 613 missing out of 192 997 (91% gone) — orders +# of magnitude past 2000. A typical post-mine palace shows a few hundred +# to ~1000 missing, well under threshold. +_HNSW_DIVERGENCE_ABSOLUTE = 2000 +_HNSW_DIVERGENCE_FRACTION = 0.10 + + +def hnsw_capacity_status(palace_path: str, collection_name: str = "mempalace_drawers") -> dict: + """Compare sqlite embedding count against HNSW element count. + + The #1222 failure mode: ``max_elements`` froze at 16 384 while sqlite + accumulated 192 997 embeddings. Every subsequent tool call segfaulted + when chromadb tried to load the undersized HNSW. This probe runs + *before* anything touches the segment so we can warn (or fall back to + BM25) instead of crashing. + + Returns a dict with: + + * ``segment_id`` — VECTOR segment UUID, or ``None`` if no palace + * ``sqlite_count`` — embeddings present in chroma.sqlite3 + * ``hnsw_count`` — elements chromadb's pickle knows about + * ``divergence`` — ``sqlite_count - hnsw_count`` when both known + * ``diverged`` — True when divergence exceeds the threshold + * ``status`` — ``"ok"`` | ``"diverged"`` | ``"unknown"`` + * ``message`` — human-readable summary + + Never raises — a probe that throws would defeat the point. + """ + out: dict[str, Any] = { + "segment_id": None, + "sqlite_count": None, + "hnsw_count": None, + "divergence": None, + "diverged": False, + "status": "unknown", + "message": "", + } + + try: + seg_id = _vector_segment_id(palace_path, collection_name) + out["segment_id"] = seg_id + + sqlite_count = _sqlite_embedding_count(palace_path, collection_name) + out["sqlite_count"] = sqlite_count + + if seg_id is None or sqlite_count is None: + out["message"] = "palace state unreadable; skipping HNSW capacity check" + return out + + hnsw_count = _hnsw_element_count(palace_path, seg_id) + out["hnsw_count"] = hnsw_count + + if hnsw_count is None: + # No pickle yet — segment hasn't persisted metadata. Could be + # fresh-but-unflushed (normal) or interrupted-mid-flush (bad). + # We can't distinguish without the pickle, so only flag + # divergence when sqlite holds clearly more than two flush + # windows worth — same threshold as the with-pickle path. + if sqlite_count > _HNSW_DIVERGENCE_ABSOLUTE: + out["status"] = "diverged" + out["diverged"] = True + out["divergence"] = sqlite_count + out["message"] = ( + f"sqlite holds {sqlite_count:,} embeddings but the HNSW segment " + "has never flushed metadata — vector search will return nothing " + "until the segment is rebuilt. Run `mempalace repair`." + ) + else: + out["message"] = "HNSW segment metadata not yet flushed; skipping" + return out + + divergence = sqlite_count - hnsw_count + out["divergence"] = divergence + threshold = max(_HNSW_DIVERGENCE_ABSOLUTE, int(sqlite_count * _HNSW_DIVERGENCE_FRACTION)) + if divergence > threshold: + out["status"] = "diverged" + out["diverged"] = True + pct = 100.0 * divergence / max(sqlite_count, 1) + out["message"] = ( + f"HNSW index holds {hnsw_count:,} elements but sqlite has " + f"{sqlite_count:,} embeddings — {divergence:,} drawers ({pct:.0f}%) " + "are invisible to vector search. Run `mempalace repair` to rebuild." + ) + else: + out["status"] = "ok" + out["message"] = ( + f"HNSW {hnsw_count:,} / sqlite {sqlite_count:,} (within flush-lag tolerance)" + ) + except Exception: + logger.debug("hnsw_capacity_status failed", exc_info=True) + out["message"] = "HNSW capacity probe raised; skipping" + return out + + +def _sqlite_embedding_count(palace_path: str, collection_name: str) -> Optional[int]: + """Count rows in chroma.sqlite3.embeddings for ``collection_name``. + + Mirrors :func:`mempalace.repair.sqlite_drawer_count` but kept in this + module so the backend probe doesn't pull in the repair CLI module. + """ + db_path = os.path.join(palace_path, "chroma.sqlite3") + if not os.path.isfile(db_path): + return None + try: + conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) + try: + row = conn.execute( + """ + SELECT COUNT(*) + FROM embeddings e + JOIN segments s ON e.segment_id = s.id + JOIN collections c ON s.collection = c.id + WHERE c.name = ? + """, + (collection_name,), + ).fetchone() + return int(row[0]) if row and row[0] is not None else None + finally: + conn.close() + except sqlite3.Error: + return None + + def _pin_hnsw_threads(collection) -> None: """Best-effort retrofit: pin ``hnsw:num_threads=1`` on an existing collection. diff --git a/mempalace/cli.py b/mempalace/cli.py index 51e3109..80ac9b0 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -607,6 +607,14 @@ def cmd_status(args): status(palace_path=palace_path) +def cmd_repair_status(args): + """Read-only HNSW capacity health check (#1222).""" + from .repair import status as repair_status + + palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path + repair_status(palace_path=palace_path) + + def cmd_repair(args): """Rebuild palace vector index from SQLite metadata.""" import shutil @@ -1125,6 +1133,12 @@ def main(): ), ) + # repair-status — read-only HNSW capacity health check (#1222) + sub.add_parser( + "repair-status", + help="Compare sqlite vs HNSW element counts (read-only; never opens a chromadb client)", + ) + # mcp sub.add_parser( "mcp", @@ -1181,6 +1195,7 @@ def main(): "compress": cmd_compress, "wake-up": cmd_wakeup, "repair": cmd_repair, + "repair-status": cmd_repair_status, "migrate": cmd_migrate, "status": cmd_status, } diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py index 9c87708..e016202 100644 --- a/mempalace/mcp_server.py +++ b/mempalace/mcp_server.py @@ -57,7 +57,12 @@ from .config import ( # noqa: E402 sanitize_content, ) from .version import __version__ # noqa: E402 -from .backends.chroma import ChromaBackend, ChromaCollection, _pin_hnsw_threads # noqa: E402 +from .backends.chroma import ( # noqa: E402 + ChromaBackend, + ChromaCollection, + _pin_hnsw_threads, + hnsw_capacity_status, +) from .query_sanitizer import sanitize_query # noqa: E402 from .searcher import search_memories # noqa: E402 from .palace_graph import ( # noqa: E402 @@ -108,6 +113,52 @@ _collection_cache = None _palace_db_inode = 0 # inode of chroma.sqlite3 at cache time _palace_db_mtime = 0.0 # mtime of chroma.sqlite3 at cache time +# ── Vector-search disabled flag (#1222) ────────────────────────────────── +# Set when ``hnsw_capacity_status`` reports a divergence between sqlite +# and the HNSW segment large enough that chromadb would segfault on +# segment load. While this is set, vector-shaped tools (``search``, +# ``check_duplicate``) route to the sqlite-only BM25 fallback in +# :func:`mempalace.searcher._bm25_only_via_sqlite`. Cleared after a +# successful repair via :func:`tool_reconnect` (which re-runs the probe). +_vector_disabled = False +_vector_disabled_reason = "" +_vector_capacity_status: dict | None = None + + +def _refresh_vector_disabled_flag() -> None: + """Re-run the HNSW capacity probe and update the module-level flag. + + Called from :func:`_get_client` whenever the client cache is rebuilt + (first open or palace replacement). Cheap — pure sqlite + pickle + read, no chromadb interaction. Never raises: a probe that crashes + would defeat the point. + """ + global _vector_disabled, _vector_disabled_reason, _vector_capacity_status + try: + info = hnsw_capacity_status(_config.palace_path, "mempalace_drawers") + except Exception: + logger.debug("HNSW capacity probe raised", exc_info=True) + return + _vector_capacity_status = info + if info.get("diverged"): + if not _vector_disabled: + logger.warning( + "HNSW capacity divergence detected (%s) — routing search to " + "BM25-only sqlite fallback. Run `mempalace repair rebuild` to restore " + "vector search.", + info.get("message", "unknown"), + ) + _vector_disabled = True + _vector_disabled_reason = info.get("message", "") + else: + if _vector_disabled: + logger.info( + "HNSW capacity within tolerance (%s) — vector search re-enabled", + info.get("message", ""), + ) + _vector_disabled = False + _vector_disabled_reason = "" + # ==================== WRITE-AHEAD LOG ==================== # Every write operation is logged to a JSONL file before execution. @@ -202,6 +253,11 @@ def _get_client(): mtime_changed = current_mtime != 0.0 and abs(current_mtime - _palace_db_mtime) > 0.01 if _client_cache is None or inode_changed or mtime_changed: + # Run the HNSW capacity probe BEFORE chromadb opens the segment — + # if the index is severely undersized, segment load can segfault + # the whole MCP server (#1222). The probe is pure sqlite + + # metadata-pickle read; never touches the HNSW binary files. + _refresh_vector_disabled_flag() _client_cache = ChromaBackend.make_client(_config.palace_path) _collection_cache = None _metadata_cache = None @@ -322,6 +378,17 @@ def tool_status(): "protocol": PALACE_PROTOCOL, "aaak_dialect": AAAK_SPEC, } + if _vector_disabled: + # Surface the #1222 fallback state so the AI knows search results + # are BM25-only and recommends the operator run repair. + result["vector_disabled"] = True + result["vector_disabled_reason"] = _vector_disabled_reason + if _vector_capacity_status: + result["hnsw_capacity"] = { + "sqlite_count": _vector_capacity_status.get("sqlite_count"), + "hnsw_count": _vector_capacity_status.get("hnsw_count"), + "divergence": _vector_capacity_status.get("divergence"), + } try: all_meta = _get_cached_metadata(col) for m in all_meta: @@ -456,6 +523,9 @@ def tool_search( dist = (1.0 - min_similarity) if min_similarity is not None else max_distance # Mitigate system prompt contamination (Issue #333) sanitized = sanitize_query(query) + # Ensure the capacity probe has been run at least once before we + # decide which path to take — _get_client triggers it on first open. + _get_client() result = search_memories( sanitized["clean_query"], palace_path=_config.palace_path, @@ -463,7 +533,11 @@ def tool_search( room=room, n_results=limit, max_distance=dist, + vector_disabled=_vector_disabled, ) + if _vector_disabled: + result["vector_disabled"] = True + result["vector_disabled_reason"] = _vector_disabled_reason # Attach sanitizer metadata for transparency if sanitized["was_sanitized"]: result["query_sanitized"] = True @@ -482,6 +556,21 @@ def tool_check_duplicate(content: str, threshold: float = 0.9): col = _get_collection() if not col: return _no_palace() + if _vector_disabled: + # Without a usable HNSW we can't compute cosine similarity for + # near-duplicate detection. Report the limitation rather than + # silently returning "not a duplicate" — false negatives here + # would let the AI re-file content the palace already holds. + return { + "is_duplicate": False, + "matches": [], + "vector_disabled": True, + "vector_disabled_reason": _vector_disabled_reason, + "hint": ( + "duplicate detection requires vector search; run " + "`mempalace repair rebuild` to restore" + ), + } try: results = col.query( query_texts=[content], @@ -1150,10 +1239,22 @@ def tool_reconnect(): Use after external scripts or CLI commands modify the palace database directly, which can leave the in-memory HNSW index stale. """ - global _collection_cache, _palace_db_inode, _palace_db_mtime + global \ + _client_cache, \ + _collection_cache, \ + _palace_db_inode, \ + _palace_db_mtime, \ + _vector_disabled, \ + _vector_disabled_reason + _client_cache = None _collection_cache = None _palace_db_inode = 0 _palace_db_mtime = 0.0 + # Force probe re-run on next _get_client by clearing the flag now; + # _refresh_vector_disabled_flag will re-set it if the divergence + # still applies after the reconnect. + _vector_disabled = False + _vector_disabled_reason = "" try: col = _get_collection() if col is None: @@ -1161,8 +1262,15 @@ def tool_reconnect(): "success": False, "message": "No palace found after reconnect", "drawers": 0, + "vector_disabled": _vector_disabled, } - return {"success": True, "message": "Reconnected to palace", "drawers": col.count()} + return { + "success": True, + "message": "Reconnected to palace", + "drawers": col.count(), + "vector_disabled": _vector_disabled, + "vector_disabled_reason": _vector_disabled_reason, + } except Exception as e: return {"success": False, "error": str(e)} @@ -1726,6 +1834,10 @@ def _restore_stdout(): def main(): _restore_stdout() logger.info("MemPalace MCP Server starting...") + # Pre-flight: probe HNSW capacity before any tool call so the warning + # is visible at startup rather than on first use (#1222). Pure + # filesystem read; never opens a chromadb client. + _refresh_vector_disabled_flag() while True: try: line = sys.stdin.readline() diff --git a/mempalace/repair.py b/mempalace/repair.py index a75ce35..4af32df 100644 --- a/mempalace/repair.py +++ b/mempalace/repair.py @@ -6,8 +6,9 @@ When ChromaDB's HNSW index accumulates duplicate entries (from repeated add() calls with the same ID), link_lists.bin can grow unbounded — terabytes on large palaces — eventually causing segfaults. -This module provides three operations: +This module provides four operations: + status — compare sqlite vs HNSW element counts (read-only health check) scan — find every corrupt/unfetchable ID in the palace prune — delete only the corrupt IDs (surgical) rebuild — extract all drawers, delete the collection, recreate with @@ -17,6 +18,7 @@ The rebuild backs up ONLY chroma.sqlite3 (the source of truth), not the full palace directory — so it works even when link_lists.bin is bloated. Usage (standalone): + python -m mempalace.repair status python -m mempalace.repair scan [--wing X] python -m mempalace.repair prune --confirm python -m mempalace.repair rebuild @@ -32,7 +34,7 @@ import os import shutil import time -from .backends.chroma import ChromaBackend +from .backends.chroma import ChromaBackend, hnsw_capacity_status COLLECTION_NAME = "mempalace_drawers" @@ -431,9 +433,62 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False): print(f"\n{'=' * 55}\n") +def status(palace_path=None) -> dict: + """Read-only health check: compare sqlite vs HNSW element counts. + + Catches the #1222 failure mode where chromadb's HNSW segment freezes + at a stale ``max_elements`` while sqlite keeps accumulating rows. + Once the divergence is large enough, every tool call segfaults when + chromadb tries to load the undersized HNSW. Running ``mempalace + repair status`` *before* opening the segment lets the operator + discover the problem without crashing the MCP server. + + The check itself never opens a chromadb client and never imports + hnswlib — it reads ``chroma.sqlite3`` and ``index_metadata.pickle`` + directly via :func:`mempalace.backends.chroma.hnsw_capacity_status`. + + Returns the capacity-status dict (also printed). Returns a dict with + ``status="unknown"`` when no palace exists at the given path. + """ + palace_path = palace_path or _get_palace_path() + print(f"\n{'=' * 55}") + print(" MemPalace Repair — Status") + print(f"{'=' * 55}\n") + print(f" Palace: {palace_path}") + + if not os.path.isdir(palace_path): + print(" No palace found.\n") + return {"status": "unknown", "message": "no palace at path"} + + drawers = hnsw_capacity_status(palace_path, "mempalace_drawers") + closets = hnsw_capacity_status(palace_path, "mempalace_closets") + + for label, info in (("drawers", drawers), ("closets", closets)): + print(f"\n [{label}]") + if info["sqlite_count"] is None: + print(" sqlite count: (unreadable)") + else: + print(f" sqlite count: {info['sqlite_count']:,}") + if info["hnsw_count"] is None: + print(" hnsw count: (no flushed metadata yet)") + else: + print(f" hnsw count: {info['hnsw_count']:,}") + if info["divergence"] is not None: + print(f" divergence: {info['divergence']:,}") + marker = "DIVERGED" if info["diverged"] else info["status"].upper() + print(f" status: {marker}") + if info["message"]: + print(f" note: {info['message']}") + + if drawers["diverged"] or closets["diverged"]: + print("\n Recommended: run `mempalace repair rebuild` to rebuild the index.") + print() + return {"drawers": drawers, "closets": closets} + + if __name__ == "__main__": p = argparse.ArgumentParser(description="MemPalace repair tools") - p.add_argument("command", choices=["scan", "prune", "rebuild"]) + p.add_argument("command", choices=["status", "scan", "prune", "rebuild"]) p.add_argument("--palace", default=None, help="Palace directory path") p.add_argument("--wing", default=None, help="Scan only this wing") p.add_argument("--confirm", action="store_true", help="Actually delete corrupt IDs") @@ -441,7 +496,9 @@ if __name__ == "__main__": path = os.path.expanduser(args.palace) if args.palace else None - if args.command == "scan": + if args.command == "status": + status(palace_path=path) + elif args.command == "scan": scan_palace(palace_path=path, only_wing=args.wing) elif args.command == "prune": prune_corrupt(palace_path=path, confirm=args.confirm) diff --git a/mempalace/searcher.py b/mempalace/searcher.py index c2fcdb4..bee8509 100644 --- a/mempalace/searcher.py +++ b/mempalace/searcher.py @@ -11,7 +11,9 @@ hide drawers the direct path would have found. import logging import math +import os import re +import sqlite3 from pathlib import Path from .palace import get_closets_collection, get_collection @@ -363,6 +365,158 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r print() +def _bm25_only_via_sqlite( + query: str, + palace_path: str, + wing: str = None, + room: str = None, + n_results: int = 5, + max_candidates: int = 500, +) -> dict: + """BM25-only search reading drawers directly from chroma.sqlite3. + + Used when HNSW is diverged or unloadable (#1222). Bypasses chromadb's + Python client entirely so a corrupt vector segment can't segfault the + MCP server. Routes through chromadb's own FTS5 trigram index + (``embedding_fulltext_search``) for candidate selection, then re-ranks + with the same Okapi-BM25 used in :func:`_hybrid_rank` so the result + shape matches the vector path. + + The query is split into ≥3-char trigram-tokens and OR-joined for the + FTS5 MATCH — chromadb writes the index with ``tokenize='trigram'``, + so single-character tokens never match. When no usable token survives + (e.g. "is a"), candidate selection falls back to the most-recent + ``max_candidates`` rows so we still return *something* rather than + nothing. + """ + db_path = os.path.join(palace_path, "chroma.sqlite3") + if not os.path.isfile(db_path): + return { + "error": "No palace found", + "hint": "Run: mempalace init