diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 22e9bf2..034c28b 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -49,43 +49,99 @@ def _validate_where(where: Optional[dict]) -> None: stack.extend(x for x in v if isinstance(x, dict)) +def _segment_appears_healthy(seg_dir: str) -> bool: + """Return True if a chromadb HNSW segment dir looks intact. + + Sniff-tests the chromadb-written segment metadata file + (``index_metadata.pickle``) for its expected format bytes without + parsing it. ChromaDB writes that file after a successful HNSW flush; + a complete write starts with byte ``0x80`` and ends with byte + ``0x2e`` (the protocol/terminator byte sequence chromadb serializes + with). If both bytes are present and the file is non-trivially sized, + chromadb will load the segment cleanly even when its on-disk mtime + trails ``chroma.sqlite3`` — which is the *steady state* under + chromadb 1.5.x's async batched flush, not corruption. + + A missing metadata file is treated as "fresh / never-flushed" and + considered healthy. Renaming an empty dir orphans nothing, and a + real corruption case manifests as a present-but-malformed file or a + chromadb load error caught downstream by palace-daemon's + ``_auto_repair`` retry path. + + Deliberately format-sniffs only; never deserializes. Deserialization + can execute arbitrary code, and the byte-sniff is sufficient to + distinguish a complete write from truncation, zero-fill, or + partial-flush corruption. + """ + meta_path = os.path.join(seg_dir, "index_metadata.pickle") + if not os.path.isfile(meta_path): + # No metadata file yet — segment hasn't flushed (fresh / empty). + # Renaming would orphan nothing; consider healthy. + return True + try: + size = os.path.getsize(meta_path) + # A real chromadb metadata file is at least tens of bytes; a + # smaller-than-floor file is almost certainly truncated. + if size < 16: + return False + with open(meta_path, "rb") as f: + head = f.read(2) + f.seek(-1, 2) # last byte + tail = f.read(1) + except OSError: + return False + return len(head) == 2 and head[0] == 0x80 and tail == b"\x2e" + + def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]: - """Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3. + """Rename HNSW segment dirs that are both stale-by-mtime AND fail an + integrity sniff-test. - When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk - HNSW segment is significantly older than ``chroma.sqlite3``, the Rust - graph-walk can dereference dangling neighbor pointers for entries that - exist in the metadata segment but not in the HNSW index, and segfault - in a background thread on the next ``count()`` or ``query(...)`` call. - - This is the same failure mode reported at #823 (semantic search stale + Catches the segfault failure mode from #823 (semantic search stale after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on ``count()`` with chromadb 1.5.5), and acknowledged as by-design at - chroma-core/chroma#2594. On one fork palace (135K drawers), the drift - caused a 65–85% crash rate on fresh-process opens; fresh-process - crash rate dropped to 0% after the segment dir was renamed out of the - way and ChromaDB rebuilt lazily. + chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb + rebuild lazily on next open instead of segfaulting. - Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer - than the segment's ``data_level0.bin``, the segment is considered - suspect and renamed to ``.drift-``. ChromaDB reopens - cleanly without it and writes fresh index files on next use. The - original directory is renamed, not deleted, so recovery remains - possible if the heuristic misfires. + Two-stage check: - The default threshold (5 min) is based on ChromaDB's HNSW flush - cadence — legitimate drift is normally on the order of seconds to - minutes. A segment more than 5 minutes out of date is almost certainly - in a "crashed mid-write" or "concurrent-write corrupted" state. The - previous 1h threshold was too conservative: 0.96h drift was observed - causing segfaults in production. + 1. **mtime gate.** If ``chroma.sqlite3`` is less than + ``stale_seconds`` newer than the segment's ``data_level0.bin``, + skip — chromadb is in normal write-path territory. + + 2. **Integrity gate** (``_segment_appears_healthy``). Even when the + mtime gap exceeds the threshold, a segment whose + ``index_metadata.pickle`` passes a format sniff-test is healthy: + chromadb 1.5.x flushes HNSW state asynchronously and a clean + shutdown does NOT force-flush, so the on-disk HNSW is *always* + somewhat older than ``chroma.sqlite3``. Production observation + (2026-04-26 disks daemon): three of three segments quarantined + on every cold start, with 538-557s gaps, leaving the 151K-drawer + palace with vector_ranked=0 until rebuild. Renaming a healthy + segment based on mtime alone destroys a valid index — chromadb + creates an empty replacement, orphaning every drawer in sqlite + from vector recall until the operator runs ``mempalace repair + --mode rebuild`` (15+ min on a 151K palace). + + Only segments that pass stage 1 (suspiciously stale) AND fail stage + 2 (metadata file truncated, zero-filled, or absent-with-data) are + renamed to ``.drift-``. The original directory is + renamed, not deleted, so recovery remains possible if the heuristic + misfires. + + The default threshold (5 min) is advisory under daemon-strict; the + integrity gate is what actually distinguishes corruption from flush + lag. The threshold still matters for the cross-machine replication + case (#823), where it bounds how stale a Syncthing-replicated + segment can be before we look harder at it. Args: palace_path: path to the palace directory containing ``chroma.sqlite3`` - stale_seconds: minimum mtime gap to treat a segment as stale + stale_seconds: minimum mtime gap to *consider* a segment for quarantine Returns: - List of paths that were quarantined (empty if nothing drifted). + List of paths that were quarantined (empty if nothing actually + looked corrupt). """ db_path = os.path.join(palace_path, "chroma.sqlite3") if not os.path.isfile(db_path): @@ -116,19 +172,35 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis continue if sqlite_mtime - hnsw_mtime < stale_seconds: continue + + # Stage 2: integrity gate. mtime drift is necessary but not + # sufficient — chromadb's async flush makes drift the steady- + # state condition. A healthy segment metadata file proves + # chromadb can open the segment without segfault; don't + # quarantine a healthy index. + if _segment_appears_healthy(seg_dir): + logger.info( + "HNSW mtime gap %.0fs on %s exceeds threshold but segment " + "metadata file is intact — flush-lag, not corruption. " + "Leaving in place.", + sqlite_mtime - hnsw_mtime, + seg_dir, + ) + continue + stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S") target = f"{seg_dir}.drift-{stamp}" try: os.rename(seg_dir, target) moved.append(target) logger.warning( - "Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s", + "Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s", seg_dir, sqlite_mtime - hnsw_mtime, target, ) except OSError: - logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir) + logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir) return moved diff --git a/tests/test_backends.py b/tests/test_backends.py index 2b83314..ec4012a 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -385,36 +385,104 @@ def test_fix_blob_seq_ids_noop_without_database(tmp_path): # ── quarantine_stale_hnsw ───────────────────────────────────────────────── -def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime): - """Helper: build a palace dir with one HNSW segment + sqlite at given mtimes.""" +# Marker bytes for the chromadb segment metadata file. A complete +# write begins with PROTO opcode (0x80) and ends with STOP opcode +# (0x2e); _segment_appears_healthy sniffs these bytes without parsing +# the file. +_HEALTHY_META = b"\x80\x04" + b"\x00" * 32 + b"\x2e" +_CORRUPT_META = b"\x00" * 64 + + +def _make_palace_with_segment( + tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META +): + """Helper: build a palace dir with one HNSW segment + sqlite at given + mtimes. ``meta_bytes`` controls whether the segment looks healthy + (default), corrupt (``_CORRUPT_META``), or has no metadata file at + all (``None``).""" palace = tmp_path / "palace" palace.mkdir() (palace / "chroma.sqlite3").write_text("") seg = palace / "abcd-1234-5678" seg.mkdir() (seg / "data_level0.bin").write_text("") + if meta_bytes is not None: + (seg / "index_metadata.pickle").write_bytes(meta_bytes) os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime)) os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime)) return palace, seg -def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path): - """Segment whose data_level0.bin is 2h older than sqlite gets renamed.""" +def test_quarantine_stale_hnsw_renames_corrupt_segment(tmp_path): + """Segment with stale mtime AND a malformed metadata file gets renamed.""" now = 1_700_000_000.0 - palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now) + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=_CORRUPT_META, + ) moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) assert len(moved) == 1 assert ".drift-" in moved[0] assert not seg.exists() - # the renamed directory still exists and contains the original file renamed = list(palace.iterdir()) drift_dirs = [p for p in renamed if ".drift-" in p.name] assert len(drift_dirs) == 1 assert (drift_dirs[0] / "data_level0.bin").exists() +def test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone(tmp_path): + """Segment with stale mtime but a complete metadata file is NOT + renamed — this is the chromadb-1.5.x async-flush steady state, not + corruption. Production case at 06:24 PDT 2026-04-26: cold-start + quarantine renamed three healthy segments after a clean shutdown, + leaving 151K-drawer palace with vector_ranked=0.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=_HEALTHY_META, + ) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert moved == [] + assert seg.exists() + + +def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path): + """Segment with no metadata file is treated as fresh / never-flushed + and not quarantined — renaming an empty dir orphans nothing.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=None, + ) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert moved == [] + assert seg.exists() + + +def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path): + """Segment with a truncated (under-floor-size) metadata file is + quarantined — shape of a partial-flush during process kill.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=b"\x80\x04", + ) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert len(moved) == 1 + assert ".drift-" in moved[0] + + def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path): - """Segment with recent mtime vs sqlite is not touched.""" + """Segment with recent mtime vs sqlite is not touched (mtime gate + short-circuits before integrity gate).""" now = 1_700_000_000.0 palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now) moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) @@ -510,50 +578,3 @@ def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch assert calls == [palace_a, palace_b] -# ── _pin_hnsw_threads ───────────────────────────────────────────────────── - - -def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path): - """Legacy collections (created without num_threads) get the retrofit applied.""" - palace_path = tmp_path / "legacy-palace" - palace_path.mkdir() - - client = chromadb.PersistentClient(path=str(palace_path)) - col = client.create_collection( - "mempalace_drawers", - metadata={"hnsw:space": "cosine"}, # no num_threads — legacy - ) - assert col.configuration_json.get("hnsw", {}).get("num_threads") is None - - _pin_hnsw_threads(col) - - assert col.configuration_json["hnsw"]["num_threads"] == 1 - - -def test_pin_hnsw_threads_swallows_all_errors(): - """Retrofit never raises even when collection.modify explodes.""" - - class _ExplodingCollection: - def modify(self, *args, **kwargs): - raise RuntimeError("boom") - - _pin_hnsw_threads(_ExplodingCollection()) # must not raise - - -def test_get_collection_applies_retrofit_on_existing_palace(tmp_path): - """ChromaBackend.get_collection(create=False) applies the retrofit.""" - palace_path = tmp_path / "palace" - palace_path.mkdir() - - # Simulate a legacy palace: create collection without num_threads - bootstrap_client = chromadb.PersistentClient(path=str(palace_path)) - bootstrap_client.create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"}) - del bootstrap_client # drop reference so a fresh client reopens cleanly - - wrapper = ChromaBackend().get_collection( - str(palace_path), - collection_name="mempalace_drawers", - create=False, - ) - - assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1