From db80f6e26c1bbc49aee1f92e59c3a2d39c7418d6 Mon Sep 17 00:00:00 2001
From: jp <jp@jphein.com>
Date: Fri, 24 Apr 2026 09:07:46 -0700
Subject: [PATCH 1/5] fix: call quarantine_stale_hnsw() in make_client(); lower
 threshold to 5min
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

make_client() called _fix_blob_seq_ids but skipped quarantine_stale_hnsw,
so every fresh process (stop hook, precompact hook, CLI) opened a drifted
palace and segfaulted in chromadb_rust_bindings before any write-path
protection could fire.

#1062 wires the quarantine call at MCP server startup (covers long-lived
server processes). This fix adds it to make_client() itself — the call
site that all callers (server, hooks, CLI, tests) pass through — so every
fresh PersistentClient open is protected regardless of entry point.

Also lowers stale_seconds default from 3600 to 300: a 0.96h-drifted
segment caused production segfaults before the 1h threshold fired.
ChromaDB's HNSW flush cadence means legitimate drift is seconds to low
minutes; 5min gives headroom without admitting clearly corrupt segments.
---
 mempalace/backends/chroma.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index c8d2f46..fee7d18 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -49,7 +49,7 @@ def _validate_where(where: Optional[dict]) -> None:
                 stack.extend(x for x in v if isinstance(x, dict))
 
 
-def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]:
+def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
     """Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3.
 
     When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk
@@ -73,10 +73,12 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li
     original directory is renamed, not deleted, so recovery remains
     possible if the heuristic misfires.
 
-    The default threshold (1h) is deliberately conservative — ChromaDB's
-    HNSW flush cadence means legitimate drift is normally on the order of
-    seconds to minutes. A segment that is more than an hour out of date is
-    almost certainly in a "crashed mid-write" state.
+    The default threshold (5 min) is based on ChromaDB's HNSW flush
+    cadence — legitimate drift is normally on the order of seconds to
+    minutes. A segment more than 5 minutes out of date is almost certainly
+    in a "crashed mid-write" or "concurrent-write corrupted" state. The
+    previous 1h threshold was too conservative: 0.96h drift was observed
+    causing segfaults in production.
 
     Args:
         palace_path: path to the palace directory containing ``chroma.sqlite3``
@@ -544,6 +546,7 @@ class ChromaBackend(BaseBackend):
         :meth:`get_collection` which manages caching internally.
         """
         _fix_blob_seq_ids(palace_path)
+        quarantine_stale_hnsw(palace_path)
         return chromadb.PersistentClient(path=palace_path)
 
     @staticmethod

From e5e7a5793048a859b0e9e6448dff8225ad3c5332 Mon Sep 17 00:00:00 2001
From: jp <jp@jphein.com>
Date: Sat, 25 Apr 2026 22:13:27 -0700
Subject: [PATCH 2/5] fix(hnsw): gate quarantine_stale_hnsw to cold-start, not
 every reconnect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Symptom on the canonical disks daemon: drift quarantines firing every
10–30 minutes throughout the day under steady write load. Logs show
.drift-* directories accumulating despite the daemon being the only
writer (no Syncthing replication of palace data).

Root cause is a false-positive thrash in the quarantine heuristic:

- chroma.sqlite3 mtime bumps on every write (millisecond cadence).
- HNSW segment files (data_level0.bin) only flush to disk on
  chromadb's internal cadence, which can lag minutes behind sqlite
  under continuous write load.

Once the gap exceeds the 300s threshold, quarantine_stale_hnsw renames
a perfectly valid HNSW segment, chromadb rebuilds it from scratch, and
the cycle repeats as soon as the next batch of writes lands. The 300s
threshold (lowered from 3600s in PR #1173 after a 0.96h-drift production
segfault) is correct for the cross-machine-replication failure mode it
was designed for, but wrong for a daemon-strict deployment whose only
"drift" source is its own benign flush lag.

Fix: gate the proactive quarantine check to the first ``make_client()``
invocation per palace per process (``ChromaBackend._quarantined_paths``
set). Real cold-start drift (replication, partial restore, crashed-mid-
write) still gets caught — that's exactly when a fresh daemon process
opens the palace. Real runtime drift on observed HNSW errors still gets
caught via palace-daemon's ``_auto_repair`` which calls
``quarantine_stale_hnsw`` directly, bypassing this gate.

Two new tests in test_backends.py verify single-fire-per-palace and
per-palace independence. Conftest clears the gate between tests.

Suite 1362/1362 pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/backends/chroma.py | 22 ++++++++++++-
 tests/conftest.py            |  8 +++++
 tests/test_backends.py       | 64 ++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index fee7d18..22e9bf2 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -537,6 +537,20 @@ class ChromaBackend(BaseBackend):
     # Public static helpers (legacy; prefer :meth:`get_collection`)
     # ------------------------------------------------------------------
 
+    # Per-process record of palaces that have already had quarantine_stale_hnsw
+    # invoked at least once. The proactive drift check is a *cold-start*
+    # protection — it catches HNSW segments that arrived stale relative to
+    # ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore,
+    # crashed-mid-write). Once a long-running process has opened the palace
+    # cleanly, re-firing on every reconnect is a *runtime thrash*: the
+    # daemon's own writes bump sqlite mtime but HNSW flushes batch on
+    # chromadb's internal cadence, so the mtime gap naturally exceeds the
+    # threshold under steady write load even though nothing is corrupt.
+    # Real runtime drift is still handled — palace-daemon's ``_auto_repair``
+    # calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
+    # which bypasses this gate.
+    _quarantined_paths: set[str] = set()
+
     @staticmethod
     def make_client(palace_path: str):
         """Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first).
@@ -544,9 +558,15 @@ class ChromaBackend(BaseBackend):
         Deprecated-ish: exposed for legacy long-lived callers that manage their
         own client cache. New code should obtain a collection through
         :meth:`get_collection` which manages caching internally.
+
+        Quarantines stale HNSW segments **once per palace per process**. See
+        :attr:`_quarantined_paths` for the rationale (cold-start protection
+        vs. runtime thrash on steady-write daemons).
         """
         _fix_blob_seq_ids(palace_path)
-        quarantine_stale_hnsw(palace_path)
+        if palace_path not in ChromaBackend._quarantined_paths:
+            quarantine_stale_hnsw(palace_path)
+            ChromaBackend._quarantined_paths.add(palace_path)
         return chromadb.PersistentClient(path=palace_path)
 
     @staticmethod
diff --git a/tests/conftest.py b/tests/conftest.py
index 7b2bb77..4ed82ca 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -46,6 +46,14 @@ def _reset_mcp_cache():
             mcp_server._collection_cache = None
         except (ImportError, AttributeError):
             pass
+        try:
+            # Reset the per-process quarantine gate so tests don't leak
+            # state through ChromaBackend._quarantined_paths.
+            from mempalace.backends.chroma import ChromaBackend
+
+            ChromaBackend._quarantined_paths.clear()
+        except (ImportError, AttributeError):
+            pass
 
     _clear_cache()
     yield
diff --git a/tests/test_backends.py b/tests/test_backends.py
index e47eb6f..2b83314 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -1,5 +1,6 @@
 import os
 import sqlite3
+from pathlib import Path
 
 import chromadb
 import pytest
@@ -446,6 +447,69 @@ def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
     assert drift.exists()
 
 
+# ── make_client cold-start gate ──────────────────────────────────────────
+
+
+def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch):
+    """Quarantine fires on first ``make_client()`` for a palace, then is
+    skipped on subsequent calls — prevents runtime thrash where a daemon's
+    own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes,
+    making the mtime heuristic falsely trigger every reconnect."""
+    from mempalace.backends.chroma import ChromaBackend
+
+    palace_path = str(tmp_path / "palace")
+    os.makedirs(palace_path, exist_ok=True)
+    (Path(palace_path) / "chroma.sqlite3").write_text("")
+
+    # Reset the per-process cache so this test is independent of others.
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+
+    calls: list[str] = []
+
+    def _spy(path, stale_seconds=300.0):
+        calls.append(path)
+        return []
+
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
+
+    ChromaBackend.make_client(palace_path)
+    ChromaBackend.make_client(palace_path)
+    ChromaBackend.make_client(palace_path)
+
+    assert calls == [palace_path], (
+        "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
+    )
+
+
+def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
+    """Two distinct palaces each get one quarantine attempt — the gate is
+    keyed by palace path, not global."""
+    from mempalace.backends.chroma import ChromaBackend
+
+    palace_a = str(tmp_path / "palace_a")
+    palace_b = str(tmp_path / "palace_b")
+    for p in (palace_a, palace_b):
+        os.makedirs(p, exist_ok=True)
+        (Path(p) / "chroma.sqlite3").write_text("")
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+
+    calls: list[str] = []
+
+    def _spy(path, stale_seconds=300.0):
+        calls.append(path)
+        return []
+
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
+
+    ChromaBackend.make_client(palace_a)
+    ChromaBackend.make_client(palace_b)
+    ChromaBackend.make_client(palace_a)  # already gated
+    ChromaBackend.make_client(palace_b)  # already gated
+
+    assert calls == [palace_a, palace_b]
+
+
 # ── _pin_hnsw_threads ─────────────────────────────────────────────────────
 
 

From 74ff5e6b986f0a5e764d5bdd34bd418e90dcc091 Mon Sep 17 00:00:00 2001
From: jp <jp@jphein.com>
Date: Sun, 26 Apr 2026 07:06:25 -0700
Subject: [PATCH 3/5] =?UTF-8?q?fix(hnsw):=20integrity=20gate=20in=20quaran?=
 =?UTF-8?q?tine=5Fstale=5Fhnsw=20=E2=80=94=20corruption=20vs=20flush-lag?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous: quarantine fired whenever sqlite_mtime - hnsw_mtime exceeded
the (lowered, in #1173) 300s threshold. ChromaDB 1.5.x flushes HNSW
asynchronously and a clean shutdown does not force-flush, so the on-
disk HNSW is *always* meaningfully older than chroma.sqlite3 — that's
the steady state, not corruption. Quarantine renamed valid HNSW
segments on every cold-start, chromadb created empty replacements,
vector recall went to 0/N until rebuild.

Confirmed in production on the disks daemon journal, 2026-04-26
06:56:45: three of three HNSW segments quarantined on cold-start with
538-557s mtime gaps (post-clean-shutdown flush lag), leaving a
151,478-drawer palace with vector_ranked=0. Drift directories at
*.drift-20260426-065645/ each contained a complete 253MB data_level0.bin
plus 18MB index_metadata.pickle — clearly healthy indexes, renamed by
the false-positive heuristic.

Fix: two-stage gate.

  1. mtime gate (existing) — gap > stale_seconds is necessary.
  2. integrity gate (new) — sniff index_metadata.pickle for chromadb's
     expected protocol/terminator bytes (PROTO 0x80 head, STOP 0x2e
     tail) and a non-trivial size, WITHOUT deserializing the file.
     Healthy segment with mtime drift → keep in place; truncated /
     zero-filled / partial-flush → quarantine.

Format-sniff is deliberately non-deserializing — pickle deserialization
can execute arbitrary code, and the PROTO+STOP byte presence + size
floor is sufficient to distinguish a complete chromadb write from
truncation, zero-fill, or a partial flush during process kill. Real
load failures (the rare case where the bytes look right but chromadb
fails to load) still surface to palace-daemon's _auto_repair, which
calls quarantine_stale_hnsw directly on observed HNSW errors and
bypasses this gate.

The cold-start gate from 70c4bc6 (row 24) remains as a perf optimization
— even with the integrity check, repeating the sniff on every reconnect
is unnecessary work — but its load-bearing role is now covered by this
deeper fix.

4 new tests in test_backends.py:

  - test_quarantine_stale_hnsw_renames_corrupt_segment (drift + bad meta)
  - test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone
    (drift + valid meta — the production case at 06:24)
  - test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone
    (fresh / never-flushed, no meta file)
  - test_quarantine_stale_hnsw_renames_truncated_metadata (under-floor
    size, partial-flush shape)

Existing test_quarantine_stale_hnsw_renames_drifted_segment renamed
to renames_corrupt_segment with explicit corrupt meta_bytes — the old
"renames any drift" contract is gone.

Suite 1366/1366 pass.

Coordinated cross-repo with palace-daemon's auto-repair-on-startup
workaround (separate agent's commit ed3a892). With this fork-side fix
the auto-repair becomes belt-and-suspenders; the structural cause
of empty-HNSW-on-restart is addressed at the quarantine layer.

CLAUDE.md row 26 + README fork-change-queue row + test count
1363→1366.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/backends/chroma.py | 128 ++++++++++++++++++++++++++--------
 tests/test_backends.py       | 129 ++++++++++++++++++++---------------
 2 files changed, 175 insertions(+), 82 deletions(-)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index 22e9bf2..034c28b 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -49,43 +49,99 @@ def _validate_where(where: Optional[dict]) -> None:
                 stack.extend(x for x in v if isinstance(x, dict))
 
 
+def _segment_appears_healthy(seg_dir: str) -> bool:
+    """Return True if a chromadb HNSW segment dir looks intact.
+
+    Sniff-tests the chromadb-written segment metadata file
+    (``index_metadata.pickle``) for its expected format bytes without
+    parsing it. ChromaDB writes that file after a successful HNSW flush;
+    a complete write starts with byte ``0x80`` and ends with byte
+    ``0x2e`` (the protocol/terminator byte sequence chromadb serializes
+    with). If both bytes are present and the file is non-trivially sized,
+    chromadb will load the segment cleanly even when its on-disk mtime
+    trails ``chroma.sqlite3`` — which is the *steady state* under
+    chromadb 1.5.x's async batched flush, not corruption.
+
+    A missing metadata file is treated as "fresh / never-flushed" and
+    considered healthy. Renaming an empty dir orphans nothing, and a
+    real corruption case manifests as a present-but-malformed file or a
+    chromadb load error caught downstream by palace-daemon's
+    ``_auto_repair`` retry path.
+
+    Deliberately format-sniffs only; never deserializes. Deserialization
+    can execute arbitrary code, and the byte-sniff is sufficient to
+    distinguish a complete write from truncation, zero-fill, or
+    partial-flush corruption.
+    """
+    meta_path = os.path.join(seg_dir, "index_metadata.pickle")
+    if not os.path.isfile(meta_path):
+        # No metadata file yet — segment hasn't flushed (fresh / empty).
+        # Renaming would orphan nothing; consider healthy.
+        return True
+    try:
+        size = os.path.getsize(meta_path)
+        # A real chromadb metadata file is at least tens of bytes; a
+        # smaller-than-floor file is almost certainly truncated.
+        if size < 16:
+            return False
+        with open(meta_path, "rb") as f:
+            head = f.read(2)
+            f.seek(-1, 2)  # last byte
+            tail = f.read(1)
+    except OSError:
+        return False
+    return len(head) == 2 and head[0] == 0x80 and tail == b"\x2e"
+
+
 def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
-    """Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3.
+    """Rename HNSW segment dirs that are both stale-by-mtime AND fail an
+    integrity sniff-test.
 
-    When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk
-    HNSW segment is significantly older than ``chroma.sqlite3``, the Rust
-    graph-walk can dereference dangling neighbor pointers for entries that
-    exist in the metadata segment but not in the HNSW index, and segfault
-    in a background thread on the next ``count()`` or ``query(...)`` call.
-
-    This is the same failure mode reported at #823 (semantic search stale
+    Catches the segfault failure mode from #823 (semantic search stale
     after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
     ``count()`` with chromadb 1.5.5), and acknowledged as by-design at
-    chroma-core/chroma#2594. On one fork palace (135K drawers), the drift
-    caused a 65–85% crash rate on fresh-process opens; fresh-process
-    crash rate dropped to 0% after the segment dir was renamed out of the
-    way and ChromaDB rebuilt lazily.
+    chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
+    rebuild lazily on next open instead of segfaulting.
 
-    Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer
-    than the segment's ``data_level0.bin``, the segment is considered
-    suspect and renamed to ``<uuid>.drift-<timestamp>``. ChromaDB reopens
-    cleanly without it and writes fresh index files on next use. The
-    original directory is renamed, not deleted, so recovery remains
-    possible if the heuristic misfires.
+    Two-stage check:
 
-    The default threshold (5 min) is based on ChromaDB's HNSW flush
-    cadence — legitimate drift is normally on the order of seconds to
-    minutes. A segment more than 5 minutes out of date is almost certainly
-    in a "crashed mid-write" or "concurrent-write corrupted" state. The
-    previous 1h threshold was too conservative: 0.96h drift was observed
-    causing segfaults in production.
+    1. **mtime gate.** If ``chroma.sqlite3`` is less than
+       ``stale_seconds`` newer than the segment's ``data_level0.bin``,
+       skip — chromadb is in normal write-path territory.
+
+    2. **Integrity gate** (``_segment_appears_healthy``). Even when the
+       mtime gap exceeds the threshold, a segment whose
+       ``index_metadata.pickle`` passes a format sniff-test is healthy:
+       chromadb 1.5.x flushes HNSW state asynchronously and a clean
+       shutdown does NOT force-flush, so the on-disk HNSW is *always*
+       somewhat older than ``chroma.sqlite3``. Production observation
+       (2026-04-26 disks daemon): three of three segments quarantined
+       on every cold start, with 538-557s gaps, leaving the 151K-drawer
+       palace with vector_ranked=0 until rebuild. Renaming a healthy
+       segment based on mtime alone destroys a valid index — chromadb
+       creates an empty replacement, orphaning every drawer in sqlite
+       from vector recall until the operator runs ``mempalace repair
+       --mode rebuild`` (15+ min on a 151K palace).
+
+    Only segments that pass stage 1 (suspiciously stale) AND fail stage
+    2 (metadata file truncated, zero-filled, or absent-with-data) are
+    renamed to ``<uuid>.drift-<timestamp>``. The original directory is
+    renamed, not deleted, so recovery remains possible if the heuristic
+    misfires.
+
+    The default threshold (5 min) is advisory under daemon-strict; the
+    integrity gate is what actually distinguishes corruption from flush
+    lag. The threshold still matters for the cross-machine replication
+    case (#823), where it bounds how stale a Syncthing-replicated
+    segment can be before we look harder at it.
 
     Args:
         palace_path: path to the palace directory containing ``chroma.sqlite3``
-        stale_seconds: minimum mtime gap to treat a segment as stale
+        stale_seconds: minimum mtime gap to *consider* a segment for quarantine
 
     Returns:
-        List of paths that were quarantined (empty if nothing drifted).
+        List of paths that were quarantined (empty if nothing actually
+        looked corrupt).
     """
     db_path = os.path.join(palace_path, "chroma.sqlite3")
     if not os.path.isfile(db_path):
@@ -116,19 +172,35 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
             continue
         if sqlite_mtime - hnsw_mtime < stale_seconds:
             continue
+
+        # Stage 2: integrity gate. mtime drift is necessary but not
+        # sufficient — chromadb's async flush makes drift the steady-
+        # state condition. A healthy segment metadata file proves
+        # chromadb can open the segment without segfault; don't
+        # quarantine a healthy index.
+        if _segment_appears_healthy(seg_dir):
+            logger.info(
+                "HNSW mtime gap %.0fs on %s exceeds threshold but segment "
+                "metadata file is intact — flush-lag, not corruption. "
+                "Leaving in place.",
+                sqlite_mtime - hnsw_mtime,
+                seg_dir,
+            )
+            continue
+
         stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
         target = f"{seg_dir}.drift-{stamp}"
         try:
             os.rename(seg_dir, target)
             moved.append(target)
             logger.warning(
-                "Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s",
+                "Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
                 seg_dir,
                 sqlite_mtime - hnsw_mtime,
                 target,
             )
         except OSError:
-            logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir)
+            logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
     return moved
 
 
diff --git a/tests/test_backends.py b/tests/test_backends.py
index 2b83314..ec4012a 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -385,36 +385,104 @@ def test_fix_blob_seq_ids_noop_without_database(tmp_path):
 # ── quarantine_stale_hnsw ─────────────────────────────────────────────────
 
 
-def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime):
-    """Helper: build a palace dir with one HNSW segment + sqlite at given mtimes."""
+# Marker bytes for the chromadb segment metadata file. A complete
+# write begins with PROTO opcode (0x80) and ends with STOP opcode
+# (0x2e); _segment_appears_healthy sniffs these bytes without parsing
+# the file.
+_HEALTHY_META = b"\x80\x04" + b"\x00" * 32 + b"\x2e"
+_CORRUPT_META = b"\x00" * 64
+
+
+def _make_palace_with_segment(
+    tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META
+):
+    """Helper: build a palace dir with one HNSW segment + sqlite at given
+    mtimes. ``meta_bytes`` controls whether the segment looks healthy
+    (default), corrupt (``_CORRUPT_META``), or has no metadata file at
+    all (``None``)."""
     palace = tmp_path / "palace"
     palace.mkdir()
     (palace / "chroma.sqlite3").write_text("")
     seg = palace / "abcd-1234-5678"
     seg.mkdir()
     (seg / "data_level0.bin").write_text("")
+    if meta_bytes is not None:
+        (seg / "index_metadata.pickle").write_bytes(meta_bytes)
     os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime))
     os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime))
     return palace, seg
 
 
-def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path):
-    """Segment whose data_level0.bin is 2h older than sqlite gets renamed."""
+def test_quarantine_stale_hnsw_renames_corrupt_segment(tmp_path):
+    """Segment with stale mtime AND a malformed metadata file gets renamed."""
     now = 1_700_000_000.0
-    palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now)
+    palace, seg = _make_palace_with_segment(
+        tmp_path,
+        hnsw_mtime=now - 7200,
+        sqlite_mtime=now,
+        meta_bytes=_CORRUPT_META,
+    )
     moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
     assert len(moved) == 1
     assert ".drift-" in moved[0]
     assert not seg.exists()
-    # the renamed directory still exists and contains the original file
     renamed = list(palace.iterdir())
     drift_dirs = [p for p in renamed if ".drift-" in p.name]
     assert len(drift_dirs) == 1
     assert (drift_dirs[0] / "data_level0.bin").exists()
 
 
+def test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone(tmp_path):
+    """Segment with stale mtime but a complete metadata file is NOT
+    renamed — this is the chromadb-1.5.x async-flush steady state, not
+    corruption. Production case at 06:24 PDT 2026-04-26: cold-start
+    quarantine renamed three healthy segments after a clean shutdown,
+    leaving 151K-drawer palace with vector_ranked=0."""
+    now = 1_700_000_000.0
+    palace, seg = _make_palace_with_segment(
+        tmp_path,
+        hnsw_mtime=now - 7200,
+        sqlite_mtime=now,
+        meta_bytes=_HEALTHY_META,
+    )
+    moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
+    assert moved == []
+    assert seg.exists()
+
+
+def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path):
+    """Segment with no metadata file is treated as fresh / never-flushed
+    and not quarantined — renaming an empty dir orphans nothing."""
+    now = 1_700_000_000.0
+    palace, seg = _make_palace_with_segment(
+        tmp_path,
+        hnsw_mtime=now - 7200,
+        sqlite_mtime=now,
+        meta_bytes=None,
+    )
+    moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
+    assert moved == []
+    assert seg.exists()
+
+
+def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path):
+    """Segment with a truncated (under-floor-size) metadata file is
+    quarantined — shape of a partial-flush during process kill."""
+    now = 1_700_000_000.0
+    palace, seg = _make_palace_with_segment(
+        tmp_path,
+        hnsw_mtime=now - 7200,
+        sqlite_mtime=now,
+        meta_bytes=b"\x80\x04",
+    )
+    moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
+    assert len(moved) == 1
+    assert ".drift-" in moved[0]
+
+
 def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path):
-    """Segment with recent mtime vs sqlite is not touched."""
+    """Segment with recent mtime vs sqlite is not touched (mtime gate
+    short-circuits before integrity gate)."""
     now = 1_700_000_000.0
     palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now)
     moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
@@ -510,50 +578,3 @@ def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch
     assert calls == [palace_a, palace_b]
 
 
-# ── _pin_hnsw_threads ─────────────────────────────────────────────────────
-
-
-def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path):
-    """Legacy collections (created without num_threads) get the retrofit applied."""
-    palace_path = tmp_path / "legacy-palace"
-    palace_path.mkdir()
-
-    client = chromadb.PersistentClient(path=str(palace_path))
-    col = client.create_collection(
-        "mempalace_drawers",
-        metadata={"hnsw:space": "cosine"},  # no num_threads — legacy
-    )
-    assert col.configuration_json.get("hnsw", {}).get("num_threads") is None
-
-    _pin_hnsw_threads(col)
-
-    assert col.configuration_json["hnsw"]["num_threads"] == 1
-
-
-def test_pin_hnsw_threads_swallows_all_errors():
-    """Retrofit never raises even when collection.modify explodes."""
-
-    class _ExplodingCollection:
-        def modify(self, *args, **kwargs):
-            raise RuntimeError("boom")
-
-    _pin_hnsw_threads(_ExplodingCollection())  # must not raise
-
-
-def test_get_collection_applies_retrofit_on_existing_palace(tmp_path):
-    """ChromaBackend.get_collection(create=False) applies the retrofit."""
-    palace_path = tmp_path / "palace"
-    palace_path.mkdir()
-
-    # Simulate a legacy palace: create collection without num_threads
-    bootstrap_client = chromadb.PersistentClient(path=str(palace_path))
-    bootstrap_client.create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
-    del bootstrap_client  # drop reference so a fresh client reopens cleanly
-
-    wrapper = ChromaBackend().get_collection(
-        str(palace_path),
-        collection_name="mempalace_drawers",
-        create=False,
-    )
-
-    assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1

From 942aae3ed5408cebc40f6470768c0caeb6d3f848 Mon Sep 17 00:00:00 2001
From: jp <jp@jphein.com>
Date: Sun, 26 Apr 2026 13:20:14 -0700
Subject: [PATCH 4/5] fix(hnsw): address @igorls's #1173 review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Restore three `_pin_hnsw_threads` tests that the previous integrity-gate
commit deleted. The function is still live code on develop (defined at
chroma.py:207, called from chroma.py:705 + mcp_server.py), so the
deletion left the import unused (ruff F401) and dropped coverage on a
function unrelated to this PR's scope. Restored verbatim from main.

Plus three nits @igorls flagged:

- **Thread-safety doc**: `_quarantined_paths` mutation is lock-free;
  documented that idempotency of `quarantine_stale_hnsw` is the safety
  property (concurrent same-palace calls produce a benign redundant
  rename attempt that no-ops, no need for a lock).
- **Pickle protocol assumption**: `_segment_appears_healthy` requires
  PROTO ≥ 2 (`0x80`). Documented; matches what chromadb writes today,
  and a future protocol-0/1 emission would conservatively quarantine
  + lazy-rebuild rather than mis-classify as healthy.
- Class-level vs module-level scope: keeping class-level — the
  conftest reset is the controlled case, and module-level wouldn't
  remove the foot-gun, just relocate it. Conftest reset documented in
  the existing comment is the right pattern for test isolation.

Style nit (`Path(marker).touch()` vs `open(marker, "a").close()`)
deferred — that pattern lives in #1177's territory, not #1173's.

37/37 tests pass on the PR branch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/backends/chroma.py | 14 +++++++++++
 tests/test_backends.py       | 49 ++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index 034c28b..d43b884 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -72,6 +72,12 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
     can execute arbitrary code, and the byte-sniff is sufficient to
     distinguish a complete write from truncation, zero-fill, or
     partial-flush corruption.
+
+    Assumes pickle protocol >= 2 (``0x80`` PROTO marker). Matches what
+    chromadb writes today; if a future chromadb version emits protocol
+    0/1 segments, this check would start returning False on healthy
+    files and quarantine_stale_hnsw would conservatively rename them
+    out of the way (lazy rebuild on next open recovers).
     """
     meta_path = os.path.join(seg_dir, "index_metadata.pickle")
     if not os.path.isfile(meta_path):
@@ -621,6 +627,14 @@ class ChromaBackend(BaseBackend):
     # Real runtime drift is still handled — palace-daemon's ``_auto_repair``
     # calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
     # which bypasses this gate.
+    #
+    # Thread-safety: this set is mutated without a lock. Two concurrent
+    # ``make_client()`` calls for the same palace can both pass the
+    # membership check and both invoke ``quarantine_stale_hnsw``. That's
+    # safe because the function is idempotent (mtime check + timestamped
+    # rename of distinct directories), so the worst-case race produces
+    # one redundant rename attempt that no-ops. Idempotency is the
+    # safety property; locking would add cost without correctness gain.
     _quarantined_paths: set[str] = set()
 
     @staticmethod
diff --git a/tests/test_backends.py b/tests/test_backends.py
index ec4012a..da4f4d8 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -578,3 +578,52 @@ def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch
     assert calls == [palace_a, palace_b]
 
 
+# ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ──
+
+
+def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path):
+    """Legacy collections (created without num_threads) get the retrofit applied."""
+    palace_path = tmp_path / "legacy-palace"
+    palace_path.mkdir()
+
+    client = chromadb.PersistentClient(path=str(palace_path))
+    col = client.create_collection(
+        "mempalace_drawers",
+        metadata={"hnsw:space": "cosine"},  # no num_threads — legacy
+    )
+    assert col.configuration_json.get("hnsw", {}).get("num_threads") is None
+
+    _pin_hnsw_threads(col)
+
+    assert col.configuration_json["hnsw"]["num_threads"] == 1
+
+
+def test_pin_hnsw_threads_swallows_all_errors():
+    """Retrofit never raises even when collection.modify explodes."""
+
+    class _ExplodingCollection:
+        def modify(self, *args, **kwargs):
+            raise RuntimeError("boom")
+
+    _pin_hnsw_threads(_ExplodingCollection())  # must not raise
+
+
+def test_get_collection_applies_retrofit_on_existing_palace(tmp_path):
+    """ChromaBackend.get_collection(create=False) applies the retrofit."""
+    palace_path = tmp_path / "palace"
+    palace_path.mkdir()
+
+    # Simulate a legacy palace: create collection without num_threads
+    bootstrap_client = chromadb.PersistentClient(path=str(palace_path))
+    bootstrap_client.create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
+    del bootstrap_client  # drop reference so a fresh client reopens cleanly
+
+    wrapper = ChromaBackend().get_collection(
+        str(palace_path),
+        collection_name="mempalace_drawers",
+        create=False,
+    )
+
+    assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1
+
+

From 43aa1aa24ed861822ee709f8ae5192ccd0106967 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sun, 26 Apr 2026 18:19:15 -0300
Subject: [PATCH 5/5] style: ruff format under CI-pinned 0.4.x

---
 tests/test_backends.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/tests/test_backends.py b/tests/test_backends.py
index da4f4d8..e632bdc 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -393,9 +393,7 @@ _HEALTHY_META = b"\x80\x04" + b"\x00" * 32 + b"\x2e"
 _CORRUPT_META = b"\x00" * 64
 
 
-def _make_palace_with_segment(
-    tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META
-):
+def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META):
     """Helper: build a palace dir with one HNSW segment + sqlite at given
     mtimes. ``meta_bytes`` controls whether the segment looks healthy
     (default), corrupt (``_CORRUPT_META``), or has no metadata file at
@@ -544,9 +542,9 @@ def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeyp
     ChromaBackend.make_client(palace_path)
     ChromaBackend.make_client(palace_path)
 
-    assert calls == [palace_path], (
-        "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
-    )
+    assert calls == [
+        palace_path
+    ], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
 
 
 def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
@@ -625,5 +623,3 @@ def test_get_collection_applies_retrofit_on_existing_palace(tmp_path):
     )
 
     assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1
-
-