feat(backends): quarantine_stale_hnsw — recover from HNSW/sqlite drift
Add a helper that renames HNSW segment directories whose `data_level0.bin` is significantly older than `chroma.sqlite3`. Drift between the on-disk HNSW graph and the live embeddings table is the root cause of a segfault class where the Rust graph-walk dereferences dangling neighbor pointers for entries in the metadata segment that no longer exist in the HNSW index, crashing in a background thread on `count()` or `query()`. Issue #823 describes the same drift as a silent-staleness symptom (semantic search returns stale results after `add_drawer` because `data_level0.bin` lags the sqlite metadata under the default `sync_threshold=1000`). Under heavier load or after an interrupted write, the same drift can escalate from "silent stale results" to "SIGSEGV on next open," which is the failure mode observed at neo-cortex-mcp#2 (chromadb 1.5.5, Python 3.12) and acknowledged at chroma-core/chroma#2594. On one 135K-drawer palace where `index_metadata.pickle` claimed 137,813 elements against 135,464 rows in sqlite (2,349-entry drift), fresh Python processes crashed in `col.count()` 17/20 times; after renaming the segment dir out of the way and letting ChromaDB rebuild lazily, the same 20-run check went to 0 crashes. The recovery path #823 suggests (export / recreate / reimport) is heavy — it re-embeds every drawer. This helper is lighter: rename the segment dir so ChromaDB reopens without it, and the indexer rebuilds lazily on the next write. The original directory is renamed (not deleted) so the operator can recover if the heuristic misfires. If `chroma.sqlite3` is more than `stale_seconds` (default 3600) newer than the segment's `data_level0.bin`, the segment is considered suspect. One hour is deliberately conservative — normal HNSW flush cadence is seconds to minutes, so an hour of drift implies a crash mid-write, not routine lag. - Additive: exposes `quarantine_stale_hnsw(palace_path, stale_seconds)` as a helper.
Not wired into `_client()` / startup in this PR — the goal is to land the primitive first so operators and higher layers can opt in. A follow-up could call it automatically on palace open behind an env var or config flag. - Closes #823 by giving operators a first-class recovery path without having to install `chromadb-ops` or re-mine. Four new tests in `tests/test_backends.py`: - renames drifted segment, preserves original files under `.drift-TS` suffix - leaves fresh segments alone - no-op on missing palace path / missing `chroma.sqlite3` - skips already-quarantined (`.drift-` suffixed) directories `pytest tests/test_backends.py` → 11 passed. `ruff check` / `ruff format --check` — clean.
This commit is contained in:
+72
-1
@@ -1,3 +1,4 @@
|
||||
import os
|
||||
import sqlite3
|
||||
|
||||
import chromadb
|
||||
@@ -11,7 +12,12 @@ from mempalace.backends import (
|
||||
available_backends,
|
||||
get_backend,
|
||||
)
|
||||
from mempalace.backends.chroma import ChromaBackend, ChromaCollection, _fix_blob_seq_ids
|
||||
from mempalace.backends.chroma import (
|
||||
ChromaBackend,
|
||||
ChromaCollection,
|
||||
_fix_blob_seq_ids,
|
||||
quarantine_stale_hnsw,
|
||||
)
|
||||
|
||||
|
||||
class _FakeCollection:
|
||||
@@ -372,3 +378,68 @@ def test_fix_blob_seq_ids_noop_without_blobs(tmp_path):
|
||||
def test_fix_blob_seq_ids_noop_without_database(tmp_path):
    """A palace directory with no chroma.sqlite3 is accepted without raising."""
    palace_path = str(tmp_path)
    # The call itself is the assertion: any exception fails the test.
    _fix_blob_seq_ids(palace_path)
|
||||
|
||||
|
||||
# ── quarantine_stale_hnsw ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime):
    """Build ``tmp_path/palace`` holding an empty sqlite file and one HNSW segment.

    The segment directory is ``abcd-1234-5678`` and contains an empty
    ``data_level0.bin``. Both files get their access and modification times
    set explicitly, so callers control the apparent drift between them.

    Args:
        tmp_path: Existing directory (a ``pathlib.Path``) to build the palace in.
        hnsw_mtime: Timestamp applied to the segment's ``data_level0.bin``.
        sqlite_mtime: Timestamp applied to ``chroma.sqlite3``.

    Returns:
        A ``(palace_dir, segment_dir)`` tuple of paths.
    """
    palace_dir = tmp_path / "palace"
    segment_dir = palace_dir / "abcd-1234-5678"
    sqlite_file = palace_dir / "chroma.sqlite3"
    hnsw_file = segment_dir / "data_level0.bin"

    palace_dir.mkdir()
    sqlite_file.write_text("")
    segment_dir.mkdir()
    hnsw_file.write_text("")

    # Stamp the files only after everything exists so later writes cannot
    # disturb the timestamps the caller asked for.
    for target, stamp in ((hnsw_file, hnsw_mtime), (sqlite_file, sqlite_mtime)):
        os.utime(target, (stamp, stamp))

    return palace_dir, segment_dir
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path):
    """A segment whose data_level0.bin trails sqlite by 2h is renamed with a ``.drift-`` marker."""
    sqlite_mtime = 1_700_000_000.0
    palace, seg = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=sqlite_mtime - 7200,
        sqlite_mtime=sqlite_mtime,
    )

    moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)

    assert len(moved) == 1
    assert ".drift-" in moved[0]
    assert not seg.exists()

    # The segment must survive under its new name with the original file inside.
    drift_dirs = [entry for entry in palace.iterdir() if ".drift-" in entry.name]
    assert len(drift_dirs) == 1
    assert (drift_dirs[0] / "data_level0.bin").exists()
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path):
    """A segment only 10s behind sqlite, with a 3600s threshold, is left in place."""
    sqlite_mtime = 1_700_000_000.0
    palace, seg = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=sqlite_mtime - 10,
        sqlite_mtime=sqlite_mtime,
    )

    quarantined = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)

    assert quarantined == []
    assert seg.exists()
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_no_palace(tmp_path):
    """Returns [] without raising for a missing palace dir or one lacking chroma.sqlite3."""
    # Case 1: the palace path does not exist at all.
    nonexistent_palace = tmp_path / "missing"
    assert quarantine_stale_hnsw(str(nonexistent_palace)) == []

    # Case 2: the directory exists but holds no chroma.sqlite3.
    palace_without_db = tmp_path / "empty"
    palace_without_db.mkdir()
    assert quarantine_stale_hnsw(str(palace_without_db)) == []
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
    """A directory whose name already carries ``.drift-`` is not renamed again."""
    sqlite_mtime = 1_700_000_000.0
    stale_mtime = sqlite_mtime - 99999

    palace = tmp_path / "palace"
    sqlite_file = palace / "chroma.sqlite3"
    drift = palace / "abcd-1234.drift-20260101-000000"
    hnsw_file = drift / "data_level0.bin"

    palace.mkdir()
    sqlite_file.write_text("")
    os.utime(sqlite_file, (sqlite_mtime, sqlite_mtime))

    # The quarantined segment is far older than the threshold; only its
    # ``.drift-`` name distinguishes it from a segment that should be renamed.
    drift.mkdir()
    hnsw_file.write_text("")
    os.utime(hnsw_file, (stale_mtime, stale_mtime))

    moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)

    assert moved == []
    assert drift.exists()
|
||||
|
||||
Reference in New Issue
Block a user