From db80f6e26c1bbc49aee1f92e59c3a2d39c7418d6 Mon Sep 17 00:00:00 2001 From: jp Date: Fri, 24 Apr 2026 09:07:46 -0700 Subject: [PATCH 1/5] fix: call quarantine_stale_hnsw() in make_client(); lower threshold to 5min MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit make_client() called _fix_blob_seq_ids but skipped quarantine_stale_hnsw, so every fresh process (stop hook, precompact hook, CLI) opened a drifted palace and segfaulted in chromadb_rust_bindings before any write-path protection could fire. #1062 wires the quarantine call at MCP server startup (covers long-lived server processes). This fix adds it to make_client() itself — the call site that all callers (server, hooks, CLI, tests) pass through — so every fresh PersistentClient open is protected regardless of entry point. Also lowers stale_seconds default from 3600 to 300: a 0.96h-drifted segment caused production segfaults before the 1h threshold fired. ChromaDB's HNSW flush cadence means legitimate drift is seconds to low minutes; 5min gives headroom without admitting clearly corrupt segments. --- mempalace/backends/chroma.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index c8d2f46..fee7d18 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -49,7 +49,7 @@ def _validate_where(where: Optional[dict]) -> None: stack.extend(x for x in v if isinstance(x, dict)) -def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]: +def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]: """Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3. When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk @@ -73,10 +73,12 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li original directory is renamed, not deleted, so recovery remains possible if the heuristic misfires. - The default threshold (1h) is deliberately conservative — ChromaDB's - HNSW flush cadence means legitimate drift is normally on the order of - seconds to minutes. A segment that is more than an hour out of date is - almost certainly in a "crashed mid-write" state. + The default threshold (5 min) is based on ChromaDB's HNSW flush + cadence — legitimate drift is normally on the order of seconds to + minutes. A segment more than 5 minutes out of date is almost certainly + in a "crashed mid-write" or "concurrent-write corrupted" state. The + previous 1h threshold was too conservative: 0.96h drift was observed + causing segfaults in production. Args: palace_path: path to the palace directory containing ``chroma.sqlite3`` @@ -544,6 +546,7 @@ class ChromaBackend(BaseBackend): :meth:`get_collection` which manages caching internally. """ _fix_blob_seq_ids(palace_path) + quarantine_stale_hnsw(palace_path) return chromadb.PersistentClient(path=palace_path) @staticmethod From e5e7a5793048a859b0e9e6448dff8225ad3c5332 Mon Sep 17 00:00:00 2001 From: jp Date: Sat, 25 Apr 2026 22:13:27 -0700 Subject: [PATCH 2/5] fix(hnsw): gate quarantine_stale_hnsw to cold-start, not every reconnect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom on the canonical disks daemon: drift quarantines firing every 10–30 minutes throughout the day under steady write load. Logs show .drift-* directories accumulating despite the daemon being the only writer (no Syncthing replication of palace data). Root cause is a false-positive thrash in the quarantine heuristic: - chroma.sqlite3 mtime bumps on every write (millisecond cadence). - HNSW segment files (data_level0.bin) only flush to disk on chromadb's internal cadence, which can lag minutes behind sqlite under continuous write load. Once the gap exceeds the 300s threshold, quarantine_stale_hnsw renames a perfectly valid HNSW segment, chromadb rebuilds it from scratch, and the cycle repeats as soon as the next batch of writes lands. The 300s threshold (lowered from 3600s in PR #1173 after a 0.96h-drift production segfault) is correct for the cross-machine-replication failure mode it was designed for, but wrong for a daemon-strict deployment whose only "drift" source is its own benign flush lag. Fix: gate the proactive quarantine check to the first ``make_client()`` invocation per palace per process (``ChromaBackend._quarantined_paths`` set). Real cold-start drift (replication, partial restore, crashed-mid- write) still gets caught — that's exactly when a fresh daemon process opens the palace. Real runtime drift on observed HNSW errors still gets caught via palace-daemon's ``_auto_repair`` which calls ``quarantine_stale_hnsw`` directly, bypassing this gate. Two new tests in test_backends.py verify single-fire-per-palace and per-palace independence. Conftest clears the gate between tests. Suite 1362/1362 pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- mempalace/backends/chroma.py | 22 ++++++++++++- tests/conftest.py | 8 +++++ tests/test_backends.py | 64 ++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index fee7d18..22e9bf2 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -537,6 +537,20 @@ class ChromaBackend(BaseBackend): # Public static helpers (legacy; prefer :meth:`get_collection`) # ------------------------------------------------------------------ + # Per-process record of palaces that have already had quarantine_stale_hnsw + # invoked at least once. The proactive drift check is a *cold-start* + # protection — it catches HNSW segments that arrived stale relative to + # ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore, + # crashed-mid-write). Once a long-running process has opened the palace + # cleanly, re-firing on every reconnect is a *runtime thrash*: the + # daemon's own writes bump sqlite mtime but HNSW flushes batch on + # chromadb's internal cadence, so the mtime gap naturally exceeds the + # threshold under steady write load even though nothing is corrupt. + # Real runtime drift is still handled — palace-daemon's ``_auto_repair`` + # calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors, + # which bypasses this gate. + _quarantined_paths: set[str] = set() + @staticmethod def make_client(palace_path: str): """Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first). @@ -544,9 +558,15 @@ class ChromaBackend(BaseBackend): Deprecated-ish: exposed for legacy long-lived callers that manage their own client cache. New code should obtain a collection through :meth:`get_collection` which manages caching internally. + + Quarantines stale HNSW segments **once per palace per process**. See + :attr:`_quarantined_paths` for the rationale (cold-start protection + vs. runtime thrash on steady-write daemons). """ _fix_blob_seq_ids(palace_path) - quarantine_stale_hnsw(palace_path) + if palace_path not in ChromaBackend._quarantined_paths: + quarantine_stale_hnsw(palace_path) + ChromaBackend._quarantined_paths.add(palace_path) return chromadb.PersistentClient(path=palace_path) @staticmethod diff --git a/tests/conftest.py b/tests/conftest.py index 7b2bb77..4ed82ca 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -46,6 +46,14 @@ def _reset_mcp_cache(): mcp_server._collection_cache = None except (ImportError, AttributeError): pass + try: + # Reset the per-process quarantine gate so tests don't leak + # state through ChromaBackend._quarantined_paths. + from mempalace.backends.chroma import ChromaBackend + + ChromaBackend._quarantined_paths.clear() + except (ImportError, AttributeError): + pass _clear_cache() yield diff --git a/tests/test_backends.py b/tests/test_backends.py index e47eb6f..2b83314 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -1,5 +1,6 @@ import os import sqlite3 +from pathlib import Path import chromadb import pytest @@ -446,6 +447,69 @@ def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path): assert drift.exists() +# ── make_client cold-start gate ────────────────────────────────────────── + + +def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch): + """Quarantine fires on first ``make_client()`` for a palace, then is + skipped on subsequent calls — prevents runtime thrash where a daemon's + own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes, + making the mtime heuristic falsely trigger every reconnect.""" + from mempalace.backends.chroma import ChromaBackend + + palace_path = str(tmp_path / "palace") + os.makedirs(palace_path, exist_ok=True) + (Path(palace_path) / "chroma.sqlite3").write_text("") + + # Reset the per-process cache so this test is independent of others. + monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set()) + + calls: list[str] = [] + + def _spy(path, stale_seconds=300.0): + calls.append(path) + return [] + + monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy) + + ChromaBackend.make_client(palace_path) + ChromaBackend.make_client(palace_path) + ChromaBackend.make_client(palace_path) + + assert calls == [palace_path], ( + "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect" + ) + + +def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch): + """Two distinct palaces each get one quarantine attempt — the gate is + keyed by palace path, not global.""" + from mempalace.backends.chroma import ChromaBackend + + palace_a = str(tmp_path / "palace_a") + palace_b = str(tmp_path / "palace_b") + for p in (palace_a, palace_b): + os.makedirs(p, exist_ok=True) + (Path(p) / "chroma.sqlite3").write_text("") + + monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set()) + + calls: list[str] = [] + + def _spy(path, stale_seconds=300.0): + calls.append(path) + return [] + + monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy) + + ChromaBackend.make_client(palace_a) + ChromaBackend.make_client(palace_b) + ChromaBackend.make_client(palace_a) # already gated + ChromaBackend.make_client(palace_b) # already gated + + assert calls == [palace_a, palace_b] + + # ── _pin_hnsw_threads ───────────────────────────────────────────────────── From 74ff5e6b986f0a5e764d5bdd34bd418e90dcc091 Mon Sep 17 00:00:00 2001 From: jp Date: Sun, 26 Apr 2026 07:06:25 -0700 Subject: [PATCH 3/5] =?UTF-8?q?fix(hnsw):=20integrity=20gate=20in=20quaran?= =?UTF-8?q?tine=5Fstale=5Fhnsw=20=E2=80=94=20corruption=20vs=20flush-lag?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous: quarantine fired whenever sqlite_mtime - hnsw_mtime exceeded the (lowered, in #1173) 300s threshold. ChromaDB 1.5.x flushes HNSW asynchronously and a clean shutdown does not force-flush, so the on- disk HNSW is *always* meaningfully older than chroma.sqlite3 — that's the steady state, not corruption. Quarantine renamed valid HNSW segments on every cold-start, chromadb created empty replacements, vector recall went to 0/N until rebuild. Confirmed in production on the disks daemon journal, 2026-04-26 06:56:45: three of three HNSW segments quarantined on cold-start with 538-557s mtime gaps (post-clean-shutdown flush lag), leaving a 151,478-drawer palace with vector_ranked=0. Drift directories at *.drift-20260426-065645/ each contained a complete 253MB data_level0.bin plus 18MB index_metadata.pickle — clearly healthy indexes, renamed by the false-positive heuristic. Fix: two-stage gate. 1. mtime gate (existing) — gap > stale_seconds is necessary. 2. integrity gate (new) — sniff index_metadata.pickle for chromadb's expected protocol/terminator bytes (PROTO 0x80 head, STOP 0x2e tail) and a non-trivial size, WITHOUT deserializing the file. Healthy segment with mtime drift → keep in place; truncated / zero-filled / partial-flush → quarantine. Format-sniff is deliberately non-deserializing — pickle deserialization can execute arbitrary code, and the PROTO+STOP byte presence + size floor is sufficient to distinguish a complete chromadb write from truncation, zero-fill, or a partial flush during process kill. Real load failures (the rare case where the bytes look right but chromadb fails to load) still surface to palace-daemon's _auto_repair, which calls quarantine_stale_hnsw directly on observed HNSW errors and bypasses this gate. The cold-start gate from 70c4bc6 (row 24) remains as a perf optimization — even with the integrity check, repeating the sniff on every reconnect is unnecessary work — but its load-bearing role is now covered by this deeper fix. 4 new tests in test_backends.py: - test_quarantine_stale_hnsw_renames_corrupt_segment (drift + bad meta) - test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone (drift + valid meta — the production case at 06:24) - test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone (fresh / never-flushed, no meta file) - test_quarantine_stale_hnsw_renames_truncated_metadata (under-floor size, partial-flush shape) Existing test_quarantine_stale_hnsw_renames_drifted_segment renamed to renames_corrupt_segment with explicit corrupt meta_bytes — the old "renames any drift" contract is gone. Suite 1366/1366 pass. Coordinated cross-repo with palace-daemon's auto-repair-on-startup workaround (separate agent's commit ed3a892). With this fork-side fix the auto-repair becomes belt-and-suspenders; the structural cause of empty-HNSW-on-restart is addressed at the quarantine layer. CLAUDE.md row 26 + README fork-change-queue row + test count 1363→1366. Co-Authored-By: Claude Opus 4.7 (1M context) --- mempalace/backends/chroma.py | 128 ++++++++++++++++++++++++++-------- tests/test_backends.py | 129 ++++++++++++++++++++--------------- 2 files changed, 175 insertions(+), 82 deletions(-) diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 22e9bf2..034c28b 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -49,43 +49,99 @@ def _validate_where(where: Optional[dict]) -> None: stack.extend(x for x in v if isinstance(x, dict)) +def _segment_appears_healthy(seg_dir: str) -> bool: + """Return True if a chromadb HNSW segment dir looks intact. + + Sniff-tests the chromadb-written segment metadata file + (``index_metadata.pickle``) for its expected format bytes without + parsing it. ChromaDB writes that file after a successful HNSW flush; + a complete write starts with byte ``0x80`` and ends with byte + ``0x2e`` (the protocol/terminator byte sequence chromadb serializes + with). If both bytes are present and the file is non-trivially sized, + chromadb will load the segment cleanly even when its on-disk mtime + trails ``chroma.sqlite3`` — which is the *steady state* under + chromadb 1.5.x's async batched flush, not corruption. + + A missing metadata file is treated as "fresh / never-flushed" and + considered healthy. Renaming an empty dir orphans nothing, and a + real corruption case manifests as a present-but-malformed file or a + chromadb load error caught downstream by palace-daemon's + ``_auto_repair`` retry path. + + Deliberately format-sniffs only; never deserializes. Deserialization + can execute arbitrary code, and the byte-sniff is sufficient to + distinguish a complete write from truncation, zero-fill, or + partial-flush corruption. + """ + meta_path = os.path.join(seg_dir, "index_metadata.pickle") + if not os.path.isfile(meta_path): + # No metadata file yet — segment hasn't flushed (fresh / empty). + # Renaming would orphan nothing; consider healthy. + return True + try: + size = os.path.getsize(meta_path) + # A real chromadb metadata file is at least tens of bytes; a + # smaller-than-floor file is almost certainly truncated. + if size < 16: + return False + with open(meta_path, "rb") as f: + head = f.read(2) + f.seek(-1, 2) # last byte + tail = f.read(1) + except OSError: + return False + return len(head) == 2 and head[0] == 0x80 and tail == b"\x2e" + + def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]: - """Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3. + """Rename HNSW segment dirs that are both stale-by-mtime AND fail an + integrity sniff-test. - When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk - HNSW segment is significantly older than ``chroma.sqlite3``, the Rust - graph-walk can dereference dangling neighbor pointers for entries that - exist in the metadata segment but not in the HNSW index, and segfault - in a background thread on the next ``count()`` or ``query(...)`` call. - - This is the same failure mode reported at #823 (semantic search stale + Catches the segfault failure mode from #823 (semantic search stale after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on ``count()`` with chromadb 1.5.5), and acknowledged as by-design at - chroma-core/chroma#2594. On one fork palace (135K drawers), the drift - caused a 65–85% crash rate on fresh-process opens; fresh-process - crash rate dropped to 0% after the segment dir was renamed out of the - way and ChromaDB rebuilt lazily. + chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb + rebuild lazily on next open instead of segfaulting. - Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer - than the segment's ``data_level0.bin``, the segment is considered - suspect and renamed to ``.drift-``. ChromaDB reopens - cleanly without it and writes fresh index files on next use. The - original directory is renamed, not deleted, so recovery remains - possible if the heuristic misfires. + Two-stage check: - The default threshold (5 min) is based on ChromaDB's HNSW flush - cadence — legitimate drift is normally on the order of seconds to - minutes. A segment more than 5 minutes out of date is almost certainly - in a "crashed mid-write" or "concurrent-write corrupted" state. The - previous 1h threshold was too conservative: 0.96h drift was observed - causing segfaults in production. + 1. **mtime gate.** If ``chroma.sqlite3`` is less than + ``stale_seconds`` newer than the segment's ``data_level0.bin``, + skip — chromadb is in normal write-path territory. + + 2. **Integrity gate** (``_segment_appears_healthy``). Even when the + mtime gap exceeds the threshold, a segment whose + ``index_metadata.pickle`` passes a format sniff-test is healthy: + chromadb 1.5.x flushes HNSW state asynchronously and a clean + shutdown does NOT force-flush, so the on-disk HNSW is *always* + somewhat older than ``chroma.sqlite3``. Production observation + (2026-04-26 disks daemon): three of three segments quarantined + on every cold start, with 538-557s gaps, leaving the 151K-drawer + palace with vector_ranked=0 until rebuild. Renaming a healthy + segment based on mtime alone destroys a valid index — chromadb + creates an empty replacement, orphaning every drawer in sqlite + from vector recall until the operator runs ``mempalace repair + --mode rebuild`` (15+ min on a 151K palace). + + Only segments that pass stage 1 (suspiciously stale) AND fail stage + 2 (metadata file truncated, zero-filled, or absent-with-data) are + renamed to ``.drift-``. The original directory is + renamed, not deleted, so recovery remains possible if the heuristic + misfires. + + The default threshold (5 min) is advisory under daemon-strict; the + integrity gate is what actually distinguishes corruption from flush + lag. The threshold still matters for the cross-machine replication + case (#823), where it bounds how stale a Syncthing-replicated + segment can be before we look harder at it. Args: palace_path: path to the palace directory containing ``chroma.sqlite3`` - stale_seconds: minimum mtime gap to treat a segment as stale + stale_seconds: minimum mtime gap to *consider* a segment for quarantine Returns: - List of paths that were quarantined (empty if nothing drifted). + List of paths that were quarantined (empty if nothing actually + looked corrupt). """ db_path = os.path.join(palace_path, "chroma.sqlite3") if not os.path.isfile(db_path): @@ -116,19 +172,35 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis continue if sqlite_mtime - hnsw_mtime < stale_seconds: continue + + # Stage 2: integrity gate. mtime drift is necessary but not + # sufficient — chromadb's async flush makes drift the steady- + # state condition. A healthy segment metadata file proves + # chromadb can open the segment without segfault; don't + # quarantine a healthy index. + if _segment_appears_healthy(seg_dir): + logger.info( + "HNSW mtime gap %.0fs on %s exceeds threshold but segment " + "metadata file is intact — flush-lag, not corruption. " + "Leaving in place.", + sqlite_mtime - hnsw_mtime, + seg_dir, + ) + continue + stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S") target = f"{seg_dir}.drift-{stamp}" try: os.rename(seg_dir, target) moved.append(target) logger.warning( - "Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s", + "Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s", seg_dir, sqlite_mtime - hnsw_mtime, target, ) except OSError: - logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir) + logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir) return moved diff --git a/tests/test_backends.py b/tests/test_backends.py index 2b83314..ec4012a 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -385,36 +385,104 @@ def test_fix_blob_seq_ids_noop_without_database(tmp_path): # ── quarantine_stale_hnsw ───────────────────────────────────────────────── -def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime): - """Helper: build a palace dir with one HNSW segment + sqlite at given mtimes.""" +# Marker bytes for the chromadb segment metadata file. A complete +# write begins with PROTO opcode (0x80) and ends with STOP opcode +# (0x2e); _segment_appears_healthy sniffs these bytes without parsing +# the file. +_HEALTHY_META = b"\x80\x04" + b"\x00" * 32 + b"\x2e" +_CORRUPT_META = b"\x00" * 64 + + +def _make_palace_with_segment( + tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META +): + """Helper: build a palace dir with one HNSW segment + sqlite at given + mtimes. ``meta_bytes`` controls whether the segment looks healthy + (default), corrupt (``_CORRUPT_META``), or has no metadata file at + all (``None``).""" palace = tmp_path / "palace" palace.mkdir() (palace / "chroma.sqlite3").write_text("") seg = palace / "abcd-1234-5678" seg.mkdir() (seg / "data_level0.bin").write_text("") + if meta_bytes is not None: + (seg / "index_metadata.pickle").write_bytes(meta_bytes) os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime)) os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime)) return palace, seg -def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path): - """Segment whose data_level0.bin is 2h older than sqlite gets renamed.""" +def test_quarantine_stale_hnsw_renames_corrupt_segment(tmp_path): + """Segment with stale mtime AND a malformed metadata file gets renamed.""" now = 1_700_000_000.0 - palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now) + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=_CORRUPT_META, + ) moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) assert len(moved) == 1 assert ".drift-" in moved[0] assert not seg.exists() - # the renamed directory still exists and contains the original file renamed = list(palace.iterdir()) drift_dirs = [p for p in renamed if ".drift-" in p.name] assert len(drift_dirs) == 1 assert (drift_dirs[0] / "data_level0.bin").exists() +def test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone(tmp_path): + """Segment with stale mtime but a complete metadata file is NOT + renamed — this is the chromadb-1.5.x async-flush steady state, not + corruption. Production case at 06:24 PDT 2026-04-26: cold-start + quarantine renamed three healthy segments after a clean shutdown, + leaving 151K-drawer palace with vector_ranked=0.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=_HEALTHY_META, + ) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert moved == [] + assert seg.exists() + + +def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path): + """Segment with no metadata file is treated as fresh / never-flushed + and not quarantined — renaming an empty dir orphans nothing.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=None, + ) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert moved == [] + assert seg.exists() + + +def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path): + """Segment with a truncated (under-floor-size) metadata file is + quarantined — shape of a partial-flush during process kill.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=b"\x80\x04", + ) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert len(moved) == 1 + assert ".drift-" in moved[0] + + def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path): - """Segment with recent mtime vs sqlite is not touched.""" + """Segment with recent mtime vs sqlite is not touched (mtime gate + short-circuits before integrity gate).""" now = 1_700_000_000.0 palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now) moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) @@ -510,50 +578,3 @@ def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch assert calls == [palace_a, palace_b] -# ── _pin_hnsw_threads ───────────────────────────────────────────────────── - - -def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path): - """Legacy collections (created without num_threads) get the retrofit applied.""" - palace_path = tmp_path / "legacy-palace" - palace_path.mkdir() - - client = chromadb.PersistentClient(path=str(palace_path)) - col = client.create_collection( - "mempalace_drawers", - metadata={"hnsw:space": "cosine"}, # no num_threads — legacy - ) - assert col.configuration_json.get("hnsw", {}).get("num_threads") is None - - _pin_hnsw_threads(col) - - assert col.configuration_json["hnsw"]["num_threads"] == 1 - - -def test_pin_hnsw_threads_swallows_all_errors(): - """Retrofit never raises even when collection.modify explodes.""" - - class _ExplodingCollection: - def modify(self, *args, **kwargs): - raise RuntimeError("boom") - - _pin_hnsw_threads(_ExplodingCollection()) # must not raise - - -def test_get_collection_applies_retrofit_on_existing_palace(tmp_path): - """ChromaBackend.get_collection(create=False) applies the retrofit.""" - palace_path = tmp_path / "palace" - palace_path.mkdir() - - # Simulate a legacy palace: create collection without num_threads - bootstrap_client = chromadb.PersistentClient(path=str(palace_path)) - bootstrap_client.create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"}) - del bootstrap_client # drop reference so a fresh client reopens cleanly - - wrapper = ChromaBackend().get_collection( - str(palace_path), - collection_name="mempalace_drawers", - create=False, - ) - - assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1 From 942aae3ed5408cebc40f6470768c0caeb6d3f848 Mon Sep 17 00:00:00 2001 From: jp Date: Sun, 26 Apr 2026 13:20:14 -0700 Subject: [PATCH 4/5] fix(hnsw): address @igorls's #1173 review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restore three `_pin_hnsw_threads` tests that the previous integrity-gate commit deleted. The function is still live code on develop (defined at chroma.py:207, called from chroma.py:705 + mcp_server.py), so the deletion left the import unused (ruff F401) and dropped coverage on a function unrelated to this PR's scope. Restored verbatim from main. Plus three nits @igorls flagged: - **Thread-safety doc**: `_quarantined_paths` mutation is lock-free; documented that idempotency of `quarantine_stale_hnsw` is the safety property (concurrent same-palace calls produce a benign redundant rename attempt that no-ops, no need for a lock). - **Pickle protocol assumption**: `_segment_appears_healthy` requires PROTO ≥ 2 (`0x80`). Documented; matches what chromadb writes today, and a future protocol-0/1 emission would conservatively quarantine + lazy-rebuild rather than mis-classify as healthy. - Class-level vs module-level scope: keeping class-level — the conftest reset is the controlled case, and module-level wouldn't remove the foot-gun, just relocate it. Conftest reset documented in the existing comment is the right pattern for test isolation. Style nit (`Path(marker).touch()` vs `open(marker, "a").close()`) deferred — that pattern lives in #1177's territory, not #1173's. 37/37 tests pass on the PR branch. Co-Authored-By: Claude Opus 4.7 (1M context) --- mempalace/backends/chroma.py | 14 +++++++++++ tests/test_backends.py | 49 ++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 034c28b..d43b884 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -72,6 +72,12 @@ def _segment_appears_healthy(seg_dir: str) -> bool: can execute arbitrary code, and the byte-sniff is sufficient to distinguish a complete write from truncation, zero-fill, or partial-flush corruption. + + Assumes pickle protocol >= 2 (``0x80`` PROTO marker). Matches what + chromadb writes today; if a future chromadb version emits protocol + 0/1 segments, this check would start returning False on healthy + files and quarantine_stale_hnsw would conservatively rename them + out of the way (lazy rebuild on next open recovers). """ meta_path = os.path.join(seg_dir, "index_metadata.pickle") if not os.path.isfile(meta_path): @@ -621,6 +627,14 @@ class ChromaBackend(BaseBackend): # Real runtime drift is still handled — palace-daemon's ``_auto_repair`` # calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors, # which bypasses this gate. + # + # Thread-safety: this set is mutated without a lock. Two concurrent + # ``make_client()`` calls for the same palace can both pass the + # membership check and both invoke ``quarantine_stale_hnsw``. That's + # safe because the function is idempotent (mtime check + timestamped + # rename of distinct directories), so the worst-case race produces + # one redundant rename attempt that no-ops. Idempotency is the + # safety property; locking would add cost without correctness gain. _quarantined_paths: set[str] = set() @staticmethod diff --git a/tests/test_backends.py b/tests/test_backends.py index ec4012a..da4f4d8 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -578,3 +578,52 @@ def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch assert calls == [palace_a, palace_b] +# ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ── + + +def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path): + """Legacy collections (created without num_threads) get the retrofit applied.""" + palace_path = tmp_path / "legacy-palace" + palace_path.mkdir() + + client = chromadb.PersistentClient(path=str(palace_path)) + col = client.create_collection( + "mempalace_drawers", + metadata={"hnsw:space": "cosine"}, # no num_threads — legacy + ) + assert col.configuration_json.get("hnsw", {}).get("num_threads") is None + + _pin_hnsw_threads(col) + + assert col.configuration_json["hnsw"]["num_threads"] == 1 + + +def test_pin_hnsw_threads_swallows_all_errors(): + """Retrofit never raises even when collection.modify explodes.""" + + class _ExplodingCollection: + def modify(self, *args, **kwargs): + raise RuntimeError("boom") + + _pin_hnsw_threads(_ExplodingCollection()) # must not raise + + +def test_get_collection_applies_retrofit_on_existing_palace(tmp_path): + """ChromaBackend.get_collection(create=False) applies the retrofit.""" + palace_path = tmp_path / "palace" + palace_path.mkdir() + + # Simulate a legacy palace: create collection without num_threads + bootstrap_client = chromadb.PersistentClient(path=str(palace_path)) + bootstrap_client.create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"}) + del bootstrap_client # drop reference so a fresh client reopens cleanly + + wrapper = ChromaBackend().get_collection( + str(palace_path), + collection_name="mempalace_drawers", + create=False, + ) + + assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1 + + From 43aa1aa24ed861822ee709f8ae5192ccd0106967 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Sun, 26 Apr 2026 18:19:15 -0300 Subject: [PATCH 5/5] style: ruff format under CI-pinned 0.4.x --- tests/test_backends.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/test_backends.py b/tests/test_backends.py index da4f4d8..e632bdc 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -393,9 +393,7 @@ _HEALTHY_META = b"\x80\x04" + b"\x00" * 32 + b"\x2e" _CORRUPT_META = b"\x00" * 64 -def _make_palace_with_segment( - tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META -): +def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META): """Helper: build a palace dir with one HNSW segment + sqlite at given mtimes. ``meta_bytes`` controls whether the segment looks healthy (default), corrupt (``_CORRUPT_META``), or has no metadata file at @@ -544,9 +542,9 @@ def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeyp ChromaBackend.make_client(palace_path) ChromaBackend.make_client(palace_path) - assert calls == [palace_path], ( - "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect" - ) + assert calls == [ + palace_path + ], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect" def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch): @@ -625,5 +623,3 @@ def test_get_collection_applies_retrofit_on_existing_palace(tmp_path): ) assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1 - -