Merge pull request #1000 from jphein/fix/quarantine-stale-hnsw
feat(backends): quarantine_stale_hnsw — recover from HNSW/sqlite drift (closes #823)
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
"""ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation)."""
|
||||
|
||||
import datetime as _dt
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
@@ -48,6 +49,88 @@ def _validate_where(where: Optional[dict]) -> None:
|
||||
stack.extend(x for x in v if isinstance(x, dict))
|
||||
|
||||
|
||||
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]:
|
||||
"""Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3.
|
||||
|
||||
When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk
|
||||
HNSW segment is significantly older than ``chroma.sqlite3``, the Rust
|
||||
graph-walk can dereference dangling neighbor pointers for entries that
|
||||
exist in the metadata segment but not in the HNSW index, and segfault
|
||||
in a background thread on the next ``count()`` or ``query(...)`` call.
|
||||
|
||||
This is the same failure mode reported at #823 (semantic search stale
|
||||
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
|
||||
``count()`` with chromadb 1.5.5), and acknowledged as by-design at
|
||||
chroma-core/chroma#2594. On one fork palace (135K drawers), the drift
|
||||
caused a 65–85% crash rate on fresh-process opens; fresh-process
|
||||
crash rate dropped to 0% after the segment dir was renamed out of the
|
||||
way and ChromaDB rebuilt lazily.
|
||||
|
||||
Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer
|
||||
than the segment's ``data_level0.bin``, the segment is considered
|
||||
suspect and renamed to ``<uuid>.drift-<timestamp>``. ChromaDB reopens
|
||||
cleanly without it and writes fresh index files on next use. The
|
||||
original directory is renamed, not deleted, so recovery remains
|
||||
possible if the heuristic misfires.
|
||||
|
||||
The default threshold (1h) is deliberately conservative — ChromaDB's
|
||||
HNSW flush cadence means legitimate drift is normally on the order of
|
||||
seconds to minutes. A segment that is more than an hour out of date is
|
||||
almost certainly in a "crashed mid-write" state.
|
||||
|
||||
Args:
|
||||
palace_path: path to the palace directory containing ``chroma.sqlite3``
|
||||
stale_seconds: minimum mtime gap to treat a segment as stale
|
||||
|
||||
Returns:
|
||||
List of paths that were quarantined (empty if nothing drifted).
|
||||
"""
|
||||
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||
if not os.path.isfile(db_path):
|
||||
return []
|
||||
try:
|
||||
sqlite_mtime = os.path.getmtime(db_path)
|
||||
except OSError:
|
||||
return []
|
||||
|
||||
moved: list[str] = []
|
||||
try:
|
||||
entries = os.listdir(palace_path)
|
||||
except OSError:
|
||||
return []
|
||||
|
||||
for name in entries:
|
||||
if "-" not in name or name.startswith(".") or ".drift-" in name:
|
||||
continue
|
||||
seg_dir = os.path.join(palace_path, name)
|
||||
if not os.path.isdir(seg_dir):
|
||||
continue
|
||||
hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
|
||||
if not os.path.isfile(hnsw_bin):
|
||||
continue
|
||||
try:
|
||||
hnsw_mtime = os.path.getmtime(hnsw_bin)
|
||||
except OSError:
|
||||
continue
|
||||
if sqlite_mtime - hnsw_mtime < stale_seconds:
|
||||
continue
|
||||
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
target = f"{seg_dir}.drift-{stamp}"
|
||||
try:
|
||||
os.rename(seg_dir, target)
|
||||
moved.append(target)
|
||||
logger.warning(
|
||||
"Quarantined stale HNSW segment %s "
|
||||
"(sqlite %.0fs newer than HNSW); renamed to %s",
|
||||
seg_dir,
|
||||
sqlite_mtime - hnsw_mtime,
|
||||
target,
|
||||
)
|
||||
except OSError:
|
||||
logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir)
|
||||
return moved
|
||||
|
||||
|
||||
def _fix_blob_seq_ids(palace_path: str) -> None:
|
||||
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
|
||||
|
||||
|
||||
+72
-1
@@ -1,3 +1,4 @@
|
||||
import os
|
||||
import sqlite3
|
||||
|
||||
import chromadb
|
||||
@@ -11,7 +12,12 @@ from mempalace.backends import (
|
||||
available_backends,
|
||||
get_backend,
|
||||
)
|
||||
from mempalace.backends.chroma import ChromaBackend, ChromaCollection, _fix_blob_seq_ids
|
||||
from mempalace.backends.chroma import (
|
||||
ChromaBackend,
|
||||
ChromaCollection,
|
||||
_fix_blob_seq_ids,
|
||||
quarantine_stale_hnsw,
|
||||
)
|
||||
|
||||
|
||||
class _FakeCollection:
|
||||
@@ -372,3 +378,68 @@ def test_fix_blob_seq_ids_noop_without_blobs(tmp_path):
|
||||
def test_fix_blob_seq_ids_noop_without_database(tmp_path):
|
||||
"""No error when palace has no chroma.sqlite3."""
|
||||
_fix_blob_seq_ids(str(tmp_path)) # should not raise
|
||||
|
||||
|
||||
# ── quarantine_stale_hnsw ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime):
|
||||
"""Helper: build a palace dir with one HNSW segment + sqlite at given mtimes."""
|
||||
palace = tmp_path / "palace"
|
||||
palace.mkdir()
|
||||
(palace / "chroma.sqlite3").write_text("")
|
||||
seg = palace / "abcd-1234-5678"
|
||||
seg.mkdir()
|
||||
(seg / "data_level0.bin").write_text("")
|
||||
os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime))
|
||||
os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime))
|
||||
return palace, seg
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path):
|
||||
"""Segment whose data_level0.bin is 2h older than sqlite gets renamed."""
|
||||
now = 1_700_000_000.0
|
||||
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now)
|
||||
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||
assert len(moved) == 1
|
||||
assert ".drift-" in moved[0]
|
||||
assert not seg.exists()
|
||||
# the renamed directory still exists and contains the original file
|
||||
renamed = list(palace.iterdir())
|
||||
drift_dirs = [p for p in renamed if ".drift-" in p.name]
|
||||
assert len(drift_dirs) == 1
|
||||
assert (drift_dirs[0] / "data_level0.bin").exists()
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path):
|
||||
"""Segment with recent mtime vs sqlite is not touched."""
|
||||
now = 1_700_000_000.0
|
||||
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now)
|
||||
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||
assert moved == []
|
||||
assert seg.exists()
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_no_palace(tmp_path):
|
||||
"""Missing palace path or chroma.sqlite3: return [] without raising."""
|
||||
assert quarantine_stale_hnsw(str(tmp_path / "missing")) == []
|
||||
empty = tmp_path / "empty"
|
||||
empty.mkdir()
|
||||
assert quarantine_stale_hnsw(str(empty)) == []
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
|
||||
"""Directories already named with ``.drift-`` suffix are never re-renamed."""
|
||||
now = 1_700_000_000.0
|
||||
palace = tmp_path / "palace"
|
||||
palace.mkdir()
|
||||
(palace / "chroma.sqlite3").write_text("")
|
||||
os.utime(palace / "chroma.sqlite3", (now, now))
|
||||
drift = palace / "abcd-1234.drift-20260101-000000"
|
||||
drift.mkdir()
|
||||
(drift / "data_level0.bin").write_text("")
|
||||
os.utime(drift / "data_level0.bin", (now - 99999, now - 99999))
|
||||
|
||||
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||
assert moved == []
|
||||
assert drift.exists()
|
||||
|
||||
Reference in New Issue
Block a user