diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 835ce72..afd8083 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -1,5 +1,6 @@ """ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation).""" +import datetime as _dt import logging import os import sqlite3 @@ -48,6 +49,88 @@ def _validate_where(where: Optional[dict]) -> None: stack.extend(x for x in v if isinstance(x, dict)) +def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]: + """Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3. + + When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk + HNSW segment is significantly older than ``chroma.sqlite3``, the Rust + graph-walk can dereference dangling neighbor pointers for entries that + exist in the metadata segment but not in the HNSW index, and segfault + in a background thread on the next ``count()`` or ``query(...)`` call. + + This is the same failure mode reported at #823 (semantic search stale + after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on + ``count()`` with chromadb 1.5.5), and acknowledged as by-design at + chroma-core/chroma#2594. On one fork palace (135K drawers), the drift + caused a 65–85% crash rate on fresh-process opens; fresh-process + crash rate dropped to 0% after the segment dir was renamed out of the + way and ChromaDB rebuilt lazily. + + Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer + than the segment's ``data_level0.bin``, the segment is considered + suspect and renamed to ``.drift-``. ChromaDB reopens + cleanly without it and writes fresh index files on next use. The + original directory is renamed, not deleted, so recovery remains + possible if the heuristic misfires. + + The default threshold (1h) is deliberately conservative — ChromaDB's + HNSW flush cadence means legitimate drift is normally on the order of + seconds to minutes. A segment that is more than an hour out of date is + almost certainly in a "crashed mid-write" state. + + Args: + palace_path: path to the palace directory containing ``chroma.sqlite3`` + stale_seconds: minimum mtime gap to treat a segment as stale + + Returns: + List of paths that were quarantined (empty if nothing drifted). + """ + db_path = os.path.join(palace_path, "chroma.sqlite3") + if not os.path.isfile(db_path): + return [] + try: + sqlite_mtime = os.path.getmtime(db_path) + except OSError: + return [] + + moved: list[str] = [] + try: + entries = os.listdir(palace_path) + except OSError: + return [] + + for name in entries: + if "-" not in name or name.startswith(".") or ".drift-" in name: + continue + seg_dir = os.path.join(palace_path, name) + if not os.path.isdir(seg_dir): + continue + hnsw_bin = os.path.join(seg_dir, "data_level0.bin") + if not os.path.isfile(hnsw_bin): + continue + try: + hnsw_mtime = os.path.getmtime(hnsw_bin) + except OSError: + continue + if sqlite_mtime - hnsw_mtime < stale_seconds: + continue + stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S") + target = f"{seg_dir}.drift-{stamp}" + try: + os.rename(seg_dir, target) + moved.append(target) + logger.warning( + "Quarantined stale HNSW segment %s " + "(sqlite %.0fs newer than HNSW); renamed to %s", + seg_dir, + sqlite_mtime - hnsw_mtime, + target, + ) + except OSError: + logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir) + return moved + + def _fix_blob_seq_ids(palace_path: str) -> None: """Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER. diff --git a/tests/test_backends.py b/tests/test_backends.py index 2ca2d5a..b3f009a 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -1,3 +1,4 @@ +import os import sqlite3 import chromadb @@ -11,7 +12,12 @@ from mempalace.backends import ( available_backends, get_backend, ) -from mempalace.backends.chroma import ChromaBackend, ChromaCollection, _fix_blob_seq_ids +from mempalace.backends.chroma import ( + ChromaBackend, + ChromaCollection, + _fix_blob_seq_ids, + quarantine_stale_hnsw, +) class _FakeCollection: @@ -372,3 +378,68 @@ def test_fix_blob_seq_ids_noop_without_blobs(tmp_path): def test_fix_blob_seq_ids_noop_without_database(tmp_path): """No error when palace has no chroma.sqlite3.""" _fix_blob_seq_ids(str(tmp_path)) # should not raise + + +# ── quarantine_stale_hnsw ───────────────────────────────────────────────── + + +def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime): + """Helper: build a palace dir with one HNSW segment + sqlite at given mtimes.""" + palace = tmp_path / "palace" + palace.mkdir() + (palace / "chroma.sqlite3").write_text("") + seg = palace / "abcd-1234-5678" + seg.mkdir() + (seg / "data_level0.bin").write_text("") + os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime)) + os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime)) + return palace, seg + + +def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path): + """Segment whose data_level0.bin is 2h older than sqlite gets renamed.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert len(moved) == 1 + assert ".drift-" in moved[0] + assert not seg.exists() + # the renamed directory still exists and contains the original file + renamed = list(palace.iterdir()) + drift_dirs = [p for p in renamed if ".drift-" in p.name] + assert len(drift_dirs) == 1 + assert (drift_dirs[0] / "data_level0.bin").exists() + + +def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path): + """Segment with recent mtime vs sqlite is not touched.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert moved == [] + assert seg.exists() + + +def test_quarantine_stale_hnsw_no_palace(tmp_path): + """Missing palace path or chroma.sqlite3: return [] without raising.""" + assert quarantine_stale_hnsw(str(tmp_path / "missing")) == [] + empty = tmp_path / "empty" + empty.mkdir() + assert quarantine_stale_hnsw(str(empty)) == [] + + +def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path): + """Directories already named with ``.drift-`` suffix are never re-renamed.""" + now = 1_700_000_000.0 + palace = tmp_path / "palace" + palace.mkdir() + (palace / "chroma.sqlite3").write_text("") + os.utime(palace / "chroma.sqlite3", (now, now)) + drift = palace / "abcd-1234.drift-20260101-000000" + drift.mkdir() + (drift / "data_level0.bin").write_text("") + os.utime(drift / "data_level0.bin", (now - 99999, now - 99999)) + + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert moved == [] + assert drift.exists()