diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index c8d2f46..d43b884 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -49,41 +49,105 @@ def _validate_where(where: Optional[dict]) -> None: stack.extend(x for x in v if isinstance(x, dict)) -def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]: - """Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3. +def _segment_appears_healthy(seg_dir: str) -> bool: + """Return True if a chromadb HNSW segment dir looks intact. - When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk - HNSW segment is significantly older than ``chroma.sqlite3``, the Rust - graph-walk can dereference dangling neighbor pointers for entries that - exist in the metadata segment but not in the HNSW index, and segfault - in a background thread on the next ``count()`` or ``query(...)`` call. + Sniff-tests the chromadb-written segment metadata file + (``index_metadata.pickle``) for its expected format bytes without + parsing it. ChromaDB writes that file after a successful HNSW flush; + a complete write starts with byte ``0x80`` and ends with byte + ``0x2e`` (the protocol/terminator byte sequence chromadb serializes + with). If both bytes are present and the file is non-trivially sized, + chromadb will load the segment cleanly even when its on-disk mtime + trails ``chroma.sqlite3`` — which is the *steady state* under + chromadb 1.5.x's async batched flush, not corruption. - This is the same failure mode reported at #823 (semantic search stale + A missing metadata file is treated as "fresh / never-flushed" and + considered healthy. Renaming an empty dir orphans nothing, and a + real corruption case manifests as a present-but-malformed file or a + chromadb load error caught downstream by palace-daemon's + ``_auto_repair`` retry path. + + Deliberately format-sniffs only; never deserializes. Deserialization + can execute arbitrary code, and the byte-sniff is sufficient to + distinguish a complete write from truncation, zero-fill, or + partial-flush corruption. + + Assumes pickle protocol >= 2 (``0x80`` PROTO marker). Matches what + chromadb writes today; if a future chromadb version emits protocol + 0/1 segments, this check would start returning False on healthy + files and quarantine_stale_hnsw would conservatively rename them + out of the way (lazy rebuild on next open recovers). + """ + meta_path = os.path.join(seg_dir, "index_metadata.pickle") + if not os.path.isfile(meta_path): + # No metadata file yet — segment hasn't flushed (fresh / empty). + # Renaming would orphan nothing; consider healthy. + return True + try: + size = os.path.getsize(meta_path) + # A real chromadb metadata file is at least tens of bytes; a + # smaller-than-floor file is almost certainly truncated. + if size < 16: + return False + with open(meta_path, "rb") as f: + head = f.read(2) + f.seek(-1, 2) # last byte + tail = f.read(1) + except OSError: + return False + return len(head) == 2 and head[0] == 0x80 and tail == b"\x2e" + + +def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]: + """Rename HNSW segment dirs that are both stale-by-mtime AND fail an + integrity sniff-test. + + Catches the segfault failure mode from #823 (semantic search stale after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on ``count()`` with chromadb 1.5.5), and acknowledged as by-design at - chroma-core/chroma#2594. On one fork palace (135K drawers), the drift - caused a 65–85% crash rate on fresh-process opens; fresh-process - crash rate dropped to 0% after the segment dir was renamed out of the - way and ChromaDB rebuilt lazily. + chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb + rebuild lazily on next open instead of segfaulting. - Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer - than the segment's ``data_level0.bin``, the segment is considered - suspect and renamed to ``.drift-``. ChromaDB reopens - cleanly without it and writes fresh index files on next use. The - original directory is renamed, not deleted, so recovery remains - possible if the heuristic misfires. + Two-stage check: - The default threshold (1h) is deliberately conservative — ChromaDB's - HNSW flush cadence means legitimate drift is normally on the order of - seconds to minutes. A segment that is more than an hour out of date is - almost certainly in a "crashed mid-write" state. + 1. **mtime gate.** If ``chroma.sqlite3`` is less than + ``stale_seconds`` newer than the segment's ``data_level0.bin``, + skip — chromadb is in normal write-path territory. + + 2. **Integrity gate** (``_segment_appears_healthy``). Even when the + mtime gap exceeds the threshold, a segment whose + ``index_metadata.pickle`` passes a format sniff-test is healthy: + chromadb 1.5.x flushes HNSW state asynchronously and a clean + shutdown does NOT force-flush, so the on-disk HNSW is *always* + somewhat older than ``chroma.sqlite3``. Production observation + (2026-04-26 disks daemon): three of three segments quarantined + on every cold start, with 538-557s gaps, leaving the 151K-drawer + palace with vector_ranked=0 until rebuild. Renaming a healthy + segment based on mtime alone destroys a valid index — chromadb + creates an empty replacement, orphaning every drawer in sqlite + from vector recall until the operator runs ``mempalace repair + --mode rebuild`` (15+ min on a 151K palace). + + Only segments that pass stage 1 (suspiciously stale) AND fail stage + 2 (metadata file truncated, zero-filled, or absent-with-data) are + renamed to ``.drift-``. The original directory is + renamed, not deleted, so recovery remains possible if the heuristic + misfires. + + The default threshold (5 min) is advisory under daemon-strict; the + integrity gate is what actually distinguishes corruption from flush + lag. The threshold still matters for the cross-machine replication + case (#823), where it bounds how stale a Syncthing-replicated + segment can be before we look harder at it. Args: palace_path: path to the palace directory containing ``chroma.sqlite3`` - stale_seconds: minimum mtime gap to treat a segment as stale + stale_seconds: minimum mtime gap to *consider* a segment for quarantine Returns: - List of paths that were quarantined (empty if nothing drifted). + List of paths that were quarantined (empty if nothing actually + looked corrupt). """ db_path = os.path.join(palace_path, "chroma.sqlite3") if not os.path.isfile(db_path): @@ -114,19 +178,35 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li continue if sqlite_mtime - hnsw_mtime < stale_seconds: continue + + # Stage 2: integrity gate. mtime drift is necessary but not + # sufficient — chromadb's async flush makes drift the steady- + # state condition. A healthy segment metadata file proves + # chromadb can open the segment without segfault; don't + # quarantine a healthy index. + if _segment_appears_healthy(seg_dir): + logger.info( + "HNSW mtime gap %.0fs on %s exceeds threshold but segment " + "metadata file is intact — flush-lag, not corruption. " + "Leaving in place.", + sqlite_mtime - hnsw_mtime, + seg_dir, + ) + continue + stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S") target = f"{seg_dir}.drift-{stamp}" try: os.rename(seg_dir, target) moved.append(target) logger.warning( - "Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s", + "Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s", seg_dir, sqlite_mtime - hnsw_mtime, target, ) except OSError: - logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir) + logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir) return moved @@ -535,6 +615,28 @@ class ChromaBackend(BaseBackend): # Public static helpers (legacy; prefer :meth:`get_collection`) # ------------------------------------------------------------------ + # Per-process record of palaces that have already had quarantine_stale_hnsw + # invoked at least once. The proactive drift check is a *cold-start* + # protection — it catches HNSW segments that arrived stale relative to + # ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore, + # crashed-mid-write). Once a long-running process has opened the palace + # cleanly, re-firing on every reconnect is a *runtime thrash*: the + # daemon's own writes bump sqlite mtime but HNSW flushes batch on + # chromadb's internal cadence, so the mtime gap naturally exceeds the + # threshold under steady write load even though nothing is corrupt. + # Real runtime drift is still handled — palace-daemon's ``_auto_repair`` + # calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors, + # which bypasses this gate. + # + # Thread-safety: this set is mutated without a lock. Two concurrent + # ``make_client()`` calls for the same palace can both pass the + # membership check and both invoke ``quarantine_stale_hnsw``. That's + # safe because the function is idempotent (mtime check + timestamped + # rename of distinct directories), so the worst-case race produces + # one redundant rename attempt that no-ops. Idempotency is the + # safety property; locking would add cost without correctness gain. + _quarantined_paths: set[str] = set() + @staticmethod def make_client(palace_path: str): """Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first). @@ -542,8 +644,15 @@ class ChromaBackend(BaseBackend): Deprecated-ish: exposed for legacy long-lived callers that manage their own client cache. New code should obtain a collection through :meth:`get_collection` which manages caching internally. + + Quarantines stale HNSW segments **once per palace per process**. See + :attr:`_quarantined_paths` for the rationale (cold-start protection + vs. runtime thrash on steady-write daemons). """ _fix_blob_seq_ids(palace_path) + if palace_path not in ChromaBackend._quarantined_paths: + quarantine_stale_hnsw(palace_path) + ChromaBackend._quarantined_paths.add(palace_path) return chromadb.PersistentClient(path=palace_path) @staticmethod diff --git a/tests/conftest.py b/tests/conftest.py index 7b2bb77..4ed82ca 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -46,6 +46,14 @@ def _reset_mcp_cache(): mcp_server._collection_cache = None except (ImportError, AttributeError): pass + try: + # Reset the per-process quarantine gate so tests don't leak + # state through ChromaBackend._quarantined_paths. + from mempalace.backends.chroma import ChromaBackend + + ChromaBackend._quarantined_paths.clear() + except (ImportError, AttributeError): + pass _clear_cache() yield diff --git a/tests/test_backends.py b/tests/test_backends.py index e47eb6f..e632bdc 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -1,5 +1,6 @@ import os import sqlite3 +from pathlib import Path import chromadb import pytest @@ -384,36 +385,102 @@ def test_fix_blob_seq_ids_noop_without_database(tmp_path): # ── quarantine_stale_hnsw ───────────────────────────────────────────────── -def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime): - """Helper: build a palace dir with one HNSW segment + sqlite at given mtimes.""" +# Marker bytes for the chromadb segment metadata file. A complete +# write begins with PROTO opcode (0x80) and ends with STOP opcode +# (0x2e); _segment_appears_healthy sniffs these bytes without parsing +# the file. +_HEALTHY_META = b"\x80\x04" + b"\x00" * 32 + b"\x2e" +_CORRUPT_META = b"\x00" * 64 + + +def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META): + """Helper: build a palace dir with one HNSW segment + sqlite at given + mtimes. ``meta_bytes`` controls whether the segment looks healthy + (default), corrupt (``_CORRUPT_META``), or has no metadata file at + all (``None``).""" palace = tmp_path / "palace" palace.mkdir() (palace / "chroma.sqlite3").write_text("") seg = palace / "abcd-1234-5678" seg.mkdir() (seg / "data_level0.bin").write_text("") + if meta_bytes is not None: + (seg / "index_metadata.pickle").write_bytes(meta_bytes) os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime)) os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime)) return palace, seg -def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path): - """Segment whose data_level0.bin is 2h older than sqlite gets renamed.""" +def test_quarantine_stale_hnsw_renames_corrupt_segment(tmp_path): + """Segment with stale mtime AND a malformed metadata file gets renamed.""" now = 1_700_000_000.0 - palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now) + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=_CORRUPT_META, + ) moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) assert len(moved) == 1 assert ".drift-" in moved[0] assert not seg.exists() - # the renamed directory still exists and contains the original file renamed = list(palace.iterdir()) drift_dirs = [p for p in renamed if ".drift-" in p.name] assert len(drift_dirs) == 1 assert (drift_dirs[0] / "data_level0.bin").exists() +def test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone(tmp_path): + """Segment with stale mtime but a complete metadata file is NOT + renamed — this is the chromadb-1.5.x async-flush steady state, not + corruption. Production case at 06:24 PDT 2026-04-26: cold-start + quarantine renamed three healthy segments after a clean shutdown, + leaving 151K-drawer palace with vector_ranked=0.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=_HEALTHY_META, + ) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert moved == [] + assert seg.exists() + + +def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path): + """Segment with no metadata file is treated as fresh / never-flushed + and not quarantined — renaming an empty dir orphans nothing.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=None, + ) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert moved == [] + assert seg.exists() + + +def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path): + """Segment with a truncated (under-floor-size) metadata file is + quarantined — shape of a partial-flush during process kill.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=b"\x80\x04", + ) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert len(moved) == 1 + assert ".drift-" in moved[0] + + def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path): - """Segment with recent mtime vs sqlite is not touched.""" + """Segment with recent mtime vs sqlite is not touched (mtime gate + short-circuits before integrity gate).""" now = 1_700_000_000.0 palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now) moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) @@ -446,7 +513,70 @@ def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path): assert drift.exists() -# ── _pin_hnsw_threads ───────────────────────────────────────────────────── +# ── make_client cold-start gate ────────────────────────────────────────── + + +def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch): + """Quarantine fires on first ``make_client()`` for a palace, then is + skipped on subsequent calls — prevents runtime thrash where a daemon's + own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes, + making the mtime heuristic falsely trigger every reconnect.""" + from mempalace.backends.chroma import ChromaBackend + + palace_path = str(tmp_path / "palace") + os.makedirs(palace_path, exist_ok=True) + (Path(palace_path) / "chroma.sqlite3").write_text("") + + # Reset the per-process cache so this test is independent of others. + monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set()) + + calls: list[str] = [] + + def _spy(path, stale_seconds=300.0): + calls.append(path) + return [] + + monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy) + + ChromaBackend.make_client(palace_path) + ChromaBackend.make_client(palace_path) + ChromaBackend.make_client(palace_path) + + assert calls == [ + palace_path + ], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect" + + +def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch): + """Two distinct palaces each get one quarantine attempt — the gate is + keyed by palace path, not global.""" + from mempalace.backends.chroma import ChromaBackend + + palace_a = str(tmp_path / "palace_a") + palace_b = str(tmp_path / "palace_b") + for p in (palace_a, palace_b): + os.makedirs(p, exist_ok=True) + (Path(p) / "chroma.sqlite3").write_text("") + + monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set()) + + calls: list[str] = [] + + def _spy(path, stale_seconds=300.0): + calls.append(path) + return [] + + monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy) + + ChromaBackend.make_client(palace_a) + ChromaBackend.make_client(palace_b) + ChromaBackend.make_client(palace_a) # already gated + ChromaBackend.make_client(palace_b) # already gated + + assert calls == [palace_a, palace_b] + + +# ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ── def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path):