From 10733f1df474ea9e97a7a74e8b33ede5239a8138 Mon Sep 17 00:00:00 2001 From: igorls <4753812+igorls@users.noreply.github.com> Date: Sat, 2 May 2026 22:56:36 -0300 Subject: [PATCH] fix(backends/chroma): wire quarantine_stale_hnsw into _client() to prevent SIGSEGV on stale HNSW (#1121, #1132, #1263) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #1173 wired quarantine_stale_hnsw into the static make_client() helper but not into the instance _client() method. As a result every non-MCP entry point (CLI mining, search, repair, status) — which all use get_collection / _get_or_create_collection / _client() — skipped the cold-start quarantine pass and could SIGSEGV on a stale HNSW segment left over from a partial flush, replicated palace, or crashed-mid-write. Refactor: extract the (_fix_blob_seq_ids + gated quarantine_stale_hnsw) pre-open pass into a single private static helper ChromaBackend._prepare_palace_for_open(). Both make_client() and _client() now route through it, so the _quarantined_paths once-per- palace-per-process gate is preserved (no runtime thrash on hot paths) and behaviour stays identical — the fix is purely about extending the existing protection to the path that was missing it. Tests: - test_client_quarantines_corrupt_segment_on_first_open mirrors the existing make_client test and verifies _client() actually renames a corrupt segment on first open. - test_client_quarantines_only_on_first_call_per_palace verifies the cache gate prevents re-running quarantine across repeated _client() calls — important because _client() is hit on every backend op. Closes #1121. Closes #1132. Closes #1263. Co-Authored-By: Claude Opus 4.7 (1M context) --- mempalace/backends/chroma.py | 32 ++++++++++++++++--- tests/test_backends.py | 61 ++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 5 deletions(-) diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 01ac627..d9b99a4 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -993,7 +993,7 @@ class ChromaBackend(BaseBackend): ) if cached is None or inode_changed or mtime_changed or mtime_appeared: - _fix_blob_seq_ids(palace_path) + ChromaBackend._prepare_palace_for_open(palace_path) cached = chromadb.PersistentClient(path=palace_path) self._clients[palace_path] = cached # Re-stat after the client constructor runs: chromadb creates @@ -1028,6 +1028,31 @@ class ChromaBackend(BaseBackend): # safety property; locking would add cost without correctness gain. _quarantined_paths: set[str] = set() + @staticmethod + def _prepare_palace_for_open(palace_path: str) -> None: + """Run the pre-open safety pass shared by :meth:`make_client` and + :meth:`_client`. + + Two steps, both required before constructing a ``PersistentClient``: + + 1. ``_fix_blob_seq_ids`` — repairs the BLOB seq_id quirk that bites + certain chromadb migrations. + 2. ``quarantine_stale_hnsw`` — gated by :attr:`_quarantined_paths` so + it fires once per palace per process. This is the SIGSEGV + prevention path for stale HNSW segments (see #1121, #1132, #1263); + wiring it through this helper means CLI mining, search, repair, + and status all benefit, not just the legacy ``make_client`` + callers. + + Idempotent: safe to call from any code path that is about to open or + re-open a palace. The ``_quarantined_paths`` gate prevents thrash on + hot paths (e.g. ``_client()`` is called on every backend operation). + """ + _fix_blob_seq_ids(palace_path) + if palace_path not in ChromaBackend._quarantined_paths: + quarantine_stale_hnsw(palace_path) + ChromaBackend._quarantined_paths.add(palace_path) + @staticmethod def make_client(palace_path: str): """Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first). @@ -1040,10 +1065,7 @@ class ChromaBackend(BaseBackend): :attr:`_quarantined_paths` for the rationale (cold-start protection vs. runtime thrash on steady-write daemons). """ - _fix_blob_seq_ids(palace_path) - if palace_path not in ChromaBackend._quarantined_paths: - quarantine_stale_hnsw(palace_path) - ChromaBackend._quarantined_paths.add(palace_path) + ChromaBackend._prepare_palace_for_open(palace_path) return chromadb.PersistentClient(path=palace_path) @staticmethod diff --git a/tests/test_backends.py b/tests/test_backends.py index 5efa71b..8364dc7 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -764,6 +764,67 @@ def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch assert calls == [palace_a, palace_b] +# ── _client() cold-start gate (#1121, #1132, #1263) ────────────────────── + + +def test_client_quarantines_corrupt_segment_on_first_open(tmp_path, monkeypatch): + """The instance ``_client()`` path must run ``quarantine_stale_hnsw`` + on first open, mirroring the ``make_client()`` static helper. Before + PR #1173's wiring was extended here, CLI mining / search / repair / + status all skipped the quarantine pass and would SIGSEGV on a stale + HNSW segment (#1121, #1132, #1263).""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment( + tmp_path, + hnsw_mtime=now - 7200, + sqlite_mtime=now, + meta_bytes=_CORRUPT_META, + ) + + monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set()) + + backend = ChromaBackend() + try: + backend._client(str(palace)) + finally: + backend.close() + + assert not seg.exists(), "_client() should have quarantined the corrupt segment" + drift_dirs = [p for p in palace.iterdir() if ".drift-" in p.name] + assert len(drift_dirs) == 1 + + +def test_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch): + """Repeated ``_client()`` calls for the same palace re-run quarantine + at most once — the ``_quarantined_paths`` gate prevents runtime + thrash on hot paths (``_client()`` is hit on every backend op).""" + palace_path = str(tmp_path / "palace") + os.makedirs(palace_path, exist_ok=True) + (Path(palace_path) / "chroma.sqlite3").write_text("") + + monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set()) + + calls: list[str] = [] + + def _spy(path, stale_seconds=300.0): + calls.append(path) + return [] + + monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy) + + backend = ChromaBackend() + try: + backend._client(palace_path) + backend._client(palace_path) + backend._client(palace_path) + finally: + backend.close() + + assert calls == [palace_path], ( + "quarantine_stale_hnsw should fire once per palace per process from _client(), not on every call" + ) + + # ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ──