feat(backends): quarantine_stale_hnsw — recover from HNSW/sqlite drift

Add a helper that renames HNSW segment directories whose
`data_level0.bin` is significantly older than `chroma.sqlite3`. Drift
between the on-disk HNSW graph and the live embeddings table is the
root cause of a segfault class where the Rust graph-walk dereferences
dangling neighbor pointers for entries in the metadata segment that no
longer exist in the HNSW index, crashing in a background thread on
`count()` or `query()`.

Issue #823 describes the same drift as a silent-staleness symptom
(semantic search returns stale results after `add_drawer` because
`data_level0.bin` lags the sqlite metadata under the default
`sync_threshold=1000`). Under heavier load or after an interrupted
write, the same drift can escalate from "silent stale results" to
"SIGSEGV on next open," which is the failure mode observed at
neo-cortex-mcp#2 (chromadb 1.5.5, Python 3.12) and acknowledged at
chroma-core/chroma#2594.

On one 135K-drawer palace where `index_metadata.pickle` claimed 137,813
elements against 135,464 rows in sqlite (2,349-entry drift), fresh
Python processes crashed in `col.count()` 17/20 times; after renaming
the segment dir out of the way and letting ChromaDB rebuild lazily, the
same 20-run check went to 0 crashes.

The recovery path #823 suggests (export / recreate / reimport) is heavy
— it re-embeds every drawer. This helper is lighter: rename the segment
dir so ChromaDB reopens without it, and the indexer rebuilds lazily on
the next write. The original directory is renamed (not deleted) so the
operator can recover if the heuristic misfires.

If `chroma.sqlite3` is more than `stale_seconds` (default 3600) newer
than the segment's `data_level0.bin`, the segment is considered
suspect. One hour is deliberately conservative — normal HNSW flush
cadence is seconds to minutes, so an hour of drift implies a crashed
mid-write, not routine lag.

- Additive: exposes `quarantine_stale_hnsw(palace_path, stale_seconds)`
  as a helper. Not wired into `_client()` / startup on this PR — the
  goal is to land the primitive first so operators and higher layers
  can opt in. A follow-up could call it automatically on palace open
  behind an env var or config flag.
- Closes #823 by giving operators a first-class recovery path without
  having to install `chromadb-ops` or re-mine.

Four new tests in `tests/test_backends.py`:
- renames drifted segment, preserves original files under `.drift-TS` suffix
- leaves fresh segments alone
- no-op on missing palace path / missing `chroma.sqlite3`
- skips already-quarantined (`.drift-` suffixed) directories

`pytest tests/test_backends.py` → 11 passed. `ruff check` / `ruff format
--check` — clean.
This commit is contained in:
jp
2026-04-18 10:03:22 -07:00
parent 109d7f267c
commit 0c38deaab5
2 changed files with 155 additions and 1 deletions
+83
View File
@@ -1,5 +1,6 @@
"""ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation)."""
import datetime as _dt
import logging
import os
import sqlite3
@@ -48,6 +49,88 @@ def _validate_where(where: Optional[dict]) -> None:
stack.extend(x for x in v if isinstance(x, dict))
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]:
"""Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3.
When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk
HNSW segment is significantly older than ``chroma.sqlite3``, the Rust
graph-walk can dereference dangling neighbor pointers for entries that
exist in the metadata segment but not in the HNSW index, and segfault
in a background thread on the next ``count()`` or ``query(...)`` call.
This is the same failure mode reported at #823 (semantic search stale
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
``count()`` with chromadb 1.5.5), and acknowledged as by-design at
chroma-core/chroma#2594. On one fork palace (135K drawers), the drift
caused a 6585% crash rate on fresh-process opens; fresh-process
crash rate dropped to 0% after the segment dir was renamed out of the
way and ChromaDB rebuilt lazily.
Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer
than the segment's ``data_level0.bin``, the segment is considered
suspect and renamed to ``<uuid>.drift-<timestamp>``. ChromaDB reopens
cleanly without it and writes fresh index files on next use. The
original directory is renamed, not deleted, so recovery remains
possible if the heuristic misfires.
The default threshold (1h) is deliberately conservative — ChromaDB's
HNSW flush cadence means legitimate drift is normally on the order of
seconds to minutes. A segment that is more than an hour out of date is
almost certainly in a "crashed mid-write" state.
Args:
palace_path: path to the palace directory containing ``chroma.sqlite3``
stale_seconds: minimum mtime gap to treat a segment as stale
Returns:
List of paths that were quarantined (empty if nothing drifted).
"""
db_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path):
return []
try:
sqlite_mtime = os.path.getmtime(db_path)
except OSError:
return []
moved: list[str] = []
try:
entries = os.listdir(palace_path)
except OSError:
return []
for name in entries:
if "-" not in name or name.startswith(".") or ".drift-" in name:
continue
seg_dir = os.path.join(palace_path, name)
if not os.path.isdir(seg_dir):
continue
hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
if not os.path.isfile(hnsw_bin):
continue
try:
hnsw_mtime = os.path.getmtime(hnsw_bin)
except OSError:
continue
if sqlite_mtime - hnsw_mtime < stale_seconds:
continue
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
target = f"{seg_dir}.drift-{stamp}"
try:
os.rename(seg_dir, target)
moved.append(target)
logger.warning(
"Quarantined stale HNSW segment %s "
"(sqlite %.0fs newer than HNSW); renamed to %s",
seg_dir,
sqlite_mtime - hnsw_mtime,
target,
)
except OSError:
logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir)
return moved
def _fix_blob_seq_ids(palace_path: str) -> None:
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
+72 -1
View File
@@ -1,3 +1,4 @@
import os
import sqlite3
import chromadb
@@ -11,7 +12,12 @@ from mempalace.backends import (
available_backends,
get_backend,
)
from mempalace.backends.chroma import ChromaBackend, ChromaCollection, _fix_blob_seq_ids
from mempalace.backends.chroma import (
ChromaBackend,
ChromaCollection,
_fix_blob_seq_ids,
quarantine_stale_hnsw,
)
class _FakeCollection:
@@ -372,3 +378,68 @@ def test_fix_blob_seq_ids_noop_without_blobs(tmp_path):
def test_fix_blob_seq_ids_noop_without_database(tmp_path):
"""No error when palace has no chroma.sqlite3."""
_fix_blob_seq_ids(str(tmp_path)) # should not raise
# ── quarantine_stale_hnsw ─────────────────────────────────────────────────
def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime):
"""Helper: build a palace dir with one HNSW segment + sqlite at given mtimes."""
palace = tmp_path / "palace"
palace.mkdir()
(palace / "chroma.sqlite3").write_text("")
seg = palace / "abcd-1234-5678"
seg.mkdir()
(seg / "data_level0.bin").write_text("")
os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime))
os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime))
return palace, seg
def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path):
"""Segment whose data_level0.bin is 2h older than sqlite gets renamed."""
now = 1_700_000_000.0
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now)
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
assert len(moved) == 1
assert ".drift-" in moved[0]
assert not seg.exists()
# the renamed directory still exists and contains the original file
renamed = list(palace.iterdir())
drift_dirs = [p for p in renamed if ".drift-" in p.name]
assert len(drift_dirs) == 1
assert (drift_dirs[0] / "data_level0.bin").exists()
def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path):
"""Segment with recent mtime vs sqlite is not touched."""
now = 1_700_000_000.0
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now)
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
assert moved == []
assert seg.exists()
def test_quarantine_stale_hnsw_no_palace(tmp_path):
"""Missing palace path or chroma.sqlite3: return [] without raising."""
assert quarantine_stale_hnsw(str(tmp_path / "missing")) == []
empty = tmp_path / "empty"
empty.mkdir()
assert quarantine_stale_hnsw(str(empty)) == []
def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
"""Directories already named with ``.drift-`` suffix are never re-renamed."""
now = 1_700_000_000.0
palace = tmp_path / "palace"
palace.mkdir()
(palace / "chroma.sqlite3").write_text("")
os.utime(palace / "chroma.sqlite3", (now, now))
drift = palace / "abcd-1234.drift-20260101-000000"
drift.mkdir()
(drift / "data_level0.bin").write_text("")
os.utime(drift / "data_level0.bin", (now - 99999, now - 99999))
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
assert moved == []
assert drift.exists()