Merge pull request #1173 from jphein/fix/quarantine-on-make-client

fix: call quarantine_stale_hnsw() in make_client(); lower threshold to 5min
This commit is contained in:
Igor Lins e Silva
2026-04-26 18:22:14 -03:00
committed by GitHub
3 changed files with 281 additions and 34 deletions
+135 -26
View File
@@ -49,41 +49,105 @@ def _validate_where(where: Optional[dict]) -> None:
stack.extend(x for x in v if isinstance(x, dict)) stack.extend(x for x in v if isinstance(x, dict))
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]: def _segment_appears_healthy(seg_dir: str) -> bool:
"""Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3. """Return True if a chromadb HNSW segment dir looks intact.
When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk Sniff-tests the chromadb-written segment metadata file
HNSW segment is significantly older than ``chroma.sqlite3``, the Rust (``index_metadata.pickle``) for its expected format bytes without
graph-walk can dereference dangling neighbor pointers for entries that parsing it. ChromaDB writes that file after a successful HNSW flush;
exist in the metadata segment but not in the HNSW index, and segfault a complete write starts with byte ``0x80`` and ends with byte
in a background thread on the next ``count()`` or ``query(...)`` call. ``0x2e`` (the protocol/terminator byte sequence chromadb serializes
with). If both bytes are present and the file is non-trivially sized,
chromadb will load the segment cleanly even when its on-disk mtime
trails ``chroma.sqlite3`` — which is the *steady state* under
chromadb 1.5.x's async batched flush, not corruption.
This is the same failure mode reported at #823 (semantic search stale A missing metadata file is treated as "fresh / never-flushed" and
considered healthy. Renaming an empty dir orphans nothing, and a
real corruption case manifests as a present-but-malformed file or a
chromadb load error caught downstream by palace-daemon's
``_auto_repair`` retry path.
Deliberately format-sniffs only; never deserializes. Deserialization
can execute arbitrary code, and the byte-sniff is sufficient to
distinguish a complete write from truncation, zero-fill, or
partial-flush corruption.
Assumes pickle protocol >= 2 (``0x80`` PROTO marker). Matches what
chromadb writes today; if a future chromadb version emits protocol
0/1 segments, this check would start returning False on healthy
files and quarantine_stale_hnsw would conservatively rename them
out of the way (lazy rebuild on next open recovers).
"""
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
if not os.path.isfile(meta_path):
# No metadata file yet — segment hasn't flushed (fresh / empty).
# Renaming would orphan nothing; consider healthy.
return True
try:
size = os.path.getsize(meta_path)
# A real chromadb metadata file is at least tens of bytes; a
# smaller-than-floor file is almost certainly truncated.
if size < 16:
return False
with open(meta_path, "rb") as f:
head = f.read(2)
f.seek(-1, 2) # last byte
tail = f.read(1)
except OSError:
return False
return len(head) == 2 and head[0] == 0x80 and tail == b"\x2e"
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
"""Rename HNSW segment dirs that are both stale-by-mtime AND fail an
integrity sniff-test.
Catches the segfault failure mode from #823 (semantic search stale
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
``count()`` with chromadb 1.5.5), and acknowledged as by-design at ``count()`` with chromadb 1.5.5), and acknowledged as by-design at
chroma-core/chroma#2594. On one fork palace (135K drawers), the drift chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
caused a 6585% crash rate on fresh-process opens; fresh-process rebuild lazily on next open instead of segfaulting.
crash rate dropped to 0% after the segment dir was renamed out of the
way and ChromaDB rebuilt lazily.
Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer Two-stage check:
than the segment's ``data_level0.bin``, the segment is considered
suspect and renamed to ``<uuid>.drift-<timestamp>``. ChromaDB reopens
cleanly without it and writes fresh index files on next use. The
original directory is renamed, not deleted, so recovery remains
possible if the heuristic misfires.
The default threshold (1h) is deliberately conservative — ChromaDB's 1. **mtime gate.** If ``chroma.sqlite3`` is less than
HNSW flush cadence means legitimate drift is normally on the order of ``stale_seconds`` newer than the segment's ``data_level0.bin``,
seconds to minutes. A segment that is more than an hour out of date is skip — chromadb is in normal write-path territory.
almost certainly in a "crashed mid-write" state.
2. **Integrity gate** (``_segment_appears_healthy``). Even when the
mtime gap exceeds the threshold, a segment whose
``index_metadata.pickle`` passes a format sniff-test is healthy:
chromadb 1.5.x flushes HNSW state asynchronously and a clean
shutdown does NOT force-flush, so the on-disk HNSW is *always*
somewhat older than ``chroma.sqlite3``. Production observation
(2026-04-26 disks daemon): three of three segments quarantined
on every cold start, with 538-557s gaps, leaving the 151K-drawer
palace with vector_ranked=0 until rebuild. Renaming a healthy
segment based on mtime alone destroys a valid index — chromadb
creates an empty replacement, orphaning every drawer in sqlite
from vector recall until the operator runs ``mempalace repair
--mode rebuild`` (15+ min on a 151K palace).
Only segments that pass stage 1 (suspiciously stale) AND fail stage
2 (metadata file truncated, zero-filled, or absent-with-data) are
renamed to ``<uuid>.drift-<timestamp>``. The original directory is
renamed, not deleted, so recovery remains possible if the heuristic
misfires.
The default threshold (5 min) is advisory under daemon-strict; the
integrity gate is what actually distinguishes corruption from flush
lag. The threshold still matters for the cross-machine replication
case (#823), where it bounds how stale a Syncthing-replicated
segment can be before we look harder at it.
Args: Args:
palace_path: path to the palace directory containing ``chroma.sqlite3`` palace_path: path to the palace directory containing ``chroma.sqlite3``
stale_seconds: minimum mtime gap to treat a segment as stale stale_seconds: minimum mtime gap to *consider* a segment for quarantine
Returns: Returns:
List of paths that were quarantined (empty if nothing drifted). List of paths that were quarantined (empty if nothing actually
looked corrupt).
""" """
db_path = os.path.join(palace_path, "chroma.sqlite3") db_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path): if not os.path.isfile(db_path):
@@ -114,19 +178,35 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li
continue continue
if sqlite_mtime - hnsw_mtime < stale_seconds: if sqlite_mtime - hnsw_mtime < stale_seconds:
continue continue
# Stage 2: integrity gate. mtime drift is necessary but not
# sufficient — chromadb's async flush makes drift the steady-
# state condition. A healthy segment metadata file proves
# chromadb can open the segment without segfault; don't
# quarantine a healthy index.
if _segment_appears_healthy(seg_dir):
logger.info(
"HNSW mtime gap %.0fs on %s exceeds threshold but segment "
"metadata file is intact — flush-lag, not corruption. "
"Leaving in place.",
sqlite_mtime - hnsw_mtime,
seg_dir,
)
continue
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S") stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
target = f"{seg_dir}.drift-{stamp}" target = f"{seg_dir}.drift-{stamp}"
try: try:
os.rename(seg_dir, target) os.rename(seg_dir, target)
moved.append(target) moved.append(target)
logger.warning( logger.warning(
"Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s", "Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
seg_dir, seg_dir,
sqlite_mtime - hnsw_mtime, sqlite_mtime - hnsw_mtime,
target, target,
) )
except OSError: except OSError:
logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir) logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
return moved return moved
@@ -535,6 +615,28 @@ class ChromaBackend(BaseBackend):
# Public static helpers (legacy; prefer :meth:`get_collection`) # Public static helpers (legacy; prefer :meth:`get_collection`)
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Per-process record of palaces that have already had quarantine_stale_hnsw
# invoked at least once. The proactive drift check is a *cold-start*
# protection — it catches HNSW segments that arrived stale relative to
# ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore,
# crashed-mid-write). Once a long-running process has opened the palace
# cleanly, re-firing on every reconnect is a *runtime thrash*: the
# daemon's own writes bump sqlite mtime but HNSW flushes batch on
# chromadb's internal cadence, so the mtime gap naturally exceeds the
# threshold under steady write load even though nothing is corrupt.
# Real runtime drift is still handled — palace-daemon's ``_auto_repair``
# calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
# which bypasses this gate.
#
# Thread-safety: this set is mutated without a lock. Two concurrent
# ``make_client()`` calls for the same palace can both pass the
# membership check and both invoke ``quarantine_stale_hnsw``. That's
# safe because the function is idempotent (mtime check + timestamped
# rename of distinct directories), so the worst-case race produces
# one redundant rename attempt that no-ops. Idempotency is the
# safety property; locking would add cost without correctness gain.
_quarantined_paths: set[str] = set()
@staticmethod @staticmethod
def make_client(palace_path: str): def make_client(palace_path: str):
"""Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first). """Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first).
@@ -542,8 +644,15 @@ class ChromaBackend(BaseBackend):
Deprecated-ish: exposed for legacy long-lived callers that manage their Deprecated-ish: exposed for legacy long-lived callers that manage their
own client cache. New code should obtain a collection through own client cache. New code should obtain a collection through
:meth:`get_collection` which manages caching internally. :meth:`get_collection` which manages caching internally.
Quarantines stale HNSW segments **once per palace per process**. See
:attr:`_quarantined_paths` for the rationale (cold-start protection
vs. runtime thrash on steady-write daemons).
""" """
_fix_blob_seq_ids(palace_path) _fix_blob_seq_ids(palace_path)
if palace_path not in ChromaBackend._quarantined_paths:
quarantine_stale_hnsw(palace_path)
ChromaBackend._quarantined_paths.add(palace_path)
return chromadb.PersistentClient(path=palace_path) return chromadb.PersistentClient(path=palace_path)
@staticmethod @staticmethod
+8
View File
@@ -46,6 +46,14 @@ def _reset_mcp_cache():
mcp_server._collection_cache = None mcp_server._collection_cache = None
except (ImportError, AttributeError): except (ImportError, AttributeError):
pass pass
try:
# Reset the per-process quarantine gate so tests don't leak
# state through ChromaBackend._quarantined_paths.
from mempalace.backends.chroma import ChromaBackend
ChromaBackend._quarantined_paths.clear()
except (ImportError, AttributeError):
pass
_clear_cache() _clear_cache()
yield yield
+138 -8
View File
@@ -1,5 +1,6 @@
import os import os
import sqlite3 import sqlite3
from pathlib import Path
import chromadb import chromadb
import pytest import pytest
@@ -384,36 +385,102 @@ def test_fix_blob_seq_ids_noop_without_database(tmp_path):
# ── quarantine_stale_hnsw ───────────────────────────────────────────────── # ── quarantine_stale_hnsw ─────────────────────────────────────────────────
def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime): # Marker bytes for the chromadb segment metadata file. A complete
"""Helper: build a palace dir with one HNSW segment + sqlite at given mtimes.""" # write begins with PROTO opcode (0x80) and ends with STOP opcode
# (0x2e); _segment_appears_healthy sniffs these bytes without parsing
# the file.
_HEALTHY_META = b"\x80\x04" + b"\x00" * 32 + b"\x2e"
_CORRUPT_META = b"\x00" * 64
def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META):
"""Helper: build a palace dir with one HNSW segment + sqlite at given
mtimes. ``meta_bytes`` controls whether the segment looks healthy
(default), corrupt (``_CORRUPT_META``), or has no metadata file at
all (``None``)."""
palace = tmp_path / "palace" palace = tmp_path / "palace"
palace.mkdir() palace.mkdir()
(palace / "chroma.sqlite3").write_text("") (palace / "chroma.sqlite3").write_text("")
seg = palace / "abcd-1234-5678" seg = palace / "abcd-1234-5678"
seg.mkdir() seg.mkdir()
(seg / "data_level0.bin").write_text("") (seg / "data_level0.bin").write_text("")
if meta_bytes is not None:
(seg / "index_metadata.pickle").write_bytes(meta_bytes)
os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime)) os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime))
os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime)) os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime))
return palace, seg return palace, seg
def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path): def test_quarantine_stale_hnsw_renames_corrupt_segment(tmp_path):
"""Segment whose data_level0.bin is 2h older than sqlite gets renamed.""" """Segment with stale mtime AND a malformed metadata file gets renamed."""
now = 1_700_000_000.0 now = 1_700_000_000.0
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now) palace, seg = _make_palace_with_segment(
tmp_path,
hnsw_mtime=now - 7200,
sqlite_mtime=now,
meta_bytes=_CORRUPT_META,
)
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
assert len(moved) == 1 assert len(moved) == 1
assert ".drift-" in moved[0] assert ".drift-" in moved[0]
assert not seg.exists() assert not seg.exists()
# the renamed directory still exists and contains the original file
renamed = list(palace.iterdir()) renamed = list(palace.iterdir())
drift_dirs = [p for p in renamed if ".drift-" in p.name] drift_dirs = [p for p in renamed if ".drift-" in p.name]
assert len(drift_dirs) == 1 assert len(drift_dirs) == 1
assert (drift_dirs[0] / "data_level0.bin").exists() assert (drift_dirs[0] / "data_level0.bin").exists()
def test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone(tmp_path):
"""Segment with stale mtime but a complete metadata file is NOT
renamed — this is the chromadb-1.5.x async-flush steady state, not
corruption. Production case at 06:24 PDT 2026-04-26: cold-start
quarantine renamed three healthy segments after a clean shutdown,
leaving 151K-drawer palace with vector_ranked=0."""
now = 1_700_000_000.0
palace, seg = _make_palace_with_segment(
tmp_path,
hnsw_mtime=now - 7200,
sqlite_mtime=now,
meta_bytes=_HEALTHY_META,
)
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
assert moved == []
assert seg.exists()
def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path):
"""Segment with no metadata file is treated as fresh / never-flushed
and not quarantined — renaming an empty dir orphans nothing."""
now = 1_700_000_000.0
palace, seg = _make_palace_with_segment(
tmp_path,
hnsw_mtime=now - 7200,
sqlite_mtime=now,
meta_bytes=None,
)
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
assert moved == []
assert seg.exists()
def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path):
"""Segment with a truncated (under-floor-size) metadata file is
quarantined — shape of a partial-flush during process kill."""
now = 1_700_000_000.0
palace, seg = _make_palace_with_segment(
tmp_path,
hnsw_mtime=now - 7200,
sqlite_mtime=now,
meta_bytes=b"\x80\x04",
)
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
assert len(moved) == 1
assert ".drift-" in moved[0]
def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path): def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path):
"""Segment with recent mtime vs sqlite is not touched.""" """Segment with recent mtime vs sqlite is not touched (mtime gate
short-circuits before integrity gate)."""
now = 1_700_000_000.0 now = 1_700_000_000.0
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now) palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now)
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
@@ -446,7 +513,70 @@ def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
assert drift.exists() assert drift.exists()
# ── _pin_hnsw_threads ───────────────────────────────────────────────────── # ── make_client cold-start gate ──────────────────────────────────────────
def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch):
"""Quarantine fires on first ``make_client()`` for a palace, then is
skipped on subsequent calls — prevents runtime thrash where a daemon's
own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes,
making the mtime heuristic falsely trigger every reconnect."""
from mempalace.backends.chroma import ChromaBackend
palace_path = str(tmp_path / "palace")
os.makedirs(palace_path, exist_ok=True)
(Path(palace_path) / "chroma.sqlite3").write_text("")
# Reset the per-process cache so this test is independent of others.
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
calls: list[str] = []
def _spy(path, stale_seconds=300.0):
calls.append(path)
return []
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
ChromaBackend.make_client(palace_path)
ChromaBackend.make_client(palace_path)
ChromaBackend.make_client(palace_path)
assert calls == [
palace_path
], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
"""Two distinct palaces each get one quarantine attempt — the gate is
keyed by palace path, not global."""
from mempalace.backends.chroma import ChromaBackend
palace_a = str(tmp_path / "palace_a")
palace_b = str(tmp_path / "palace_b")
for p in (palace_a, palace_b):
os.makedirs(p, exist_ok=True)
(Path(p) / "chroma.sqlite3").write_text("")
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
calls: list[str] = []
def _spy(path, stale_seconds=300.0):
calls.append(path)
return []
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
ChromaBackend.make_client(palace_a)
ChromaBackend.make_client(palace_b)
ChromaBackend.make_client(palace_a) # already gated
ChromaBackend.make_client(palace_b) # already gated
assert calls == [palace_a, palace_b]
# ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ──
def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path): def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path):