fix(hnsw): gate quarantine_stale_hnsw to cold-start, not every reconnect

Symptom on the canonical disks daemon: drift quarantines firing every
10–30 minutes throughout the day under steady write load. Logs show
.drift-* directories accumulating despite the daemon being the only
writer (no Syncthing replication of palace data).

Root cause is a false-positive thrash in the quarantine heuristic:

- chroma.sqlite3 mtime bumps on every write (millisecond cadence).
- HNSW segment files (data_level0.bin) only flush to disk on
  chromadb's internal cadence, which can lag minutes behind sqlite
  under continuous write load.

Once the gap exceeds the 300s threshold, quarantine_stale_hnsw renames
a perfectly valid HNSW segment, chromadb rebuilds it from scratch, and
the cycle repeats as soon as the next batch of writes lands. The 300s
threshold (lowered from 3600s in PR #1173 after a 0.96h-drift production
segfault) is correct for the cross-machine-replication failure mode it
was designed for, but wrong for a daemon-strict deployment whose only
"drift" source is its own benign flush lag.

Fix: gate the proactive quarantine check to the first ``make_client()``
invocation per palace per process (``ChromaBackend._quarantined_paths``
set). Real cold-start drift (replication, partial restore, crashed-mid-
write) still gets caught — that's exactly when a fresh daemon process
opens the palace. Real runtime drift on observed HNSW errors still gets
caught via palace-daemon's ``_auto_repair`` which calls
``quarantine_stale_hnsw`` directly, bypassing this gate.

Two new tests in test_backends.py verify single-fire-per-palace and
per-palace independence. Conftest clears the gate between tests.

Suite 1362/1362 pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
jp
2026-04-25 22:13:27 -07:00
parent db80f6e26c
commit e5e7a57930
3 changed files with 93 additions and 1 deletions
+21 -1
View File
@@ -537,6 +537,20 @@ class ChromaBackend(BaseBackend):
# Public static helpers (legacy; prefer :meth:`get_collection`) # Public static helpers (legacy; prefer :meth:`get_collection`)
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Per-process record of palaces that have already had quarantine_stale_hnsw
# invoked at least once. The proactive drift check is a *cold-start*
# protection — it catches HNSW segments that arrived stale relative to
# ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore,
# crashed-mid-write). Once a long-running process has opened the palace
# cleanly, re-firing on every reconnect is a *runtime thrash*: the
# daemon's own writes bump sqlite mtime but HNSW flushes batch on
# chromadb's internal cadence, so the mtime gap naturally exceeds the
# threshold under steady write load even though nothing is corrupt.
# Real runtime drift is still handled — palace-daemon's ``_auto_repair``
# calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
# which bypasses this gate.
_quarantined_paths: set[str] = set()
@staticmethod @staticmethod
def make_client(palace_path: str): def make_client(palace_path: str):
"""Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first). """Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first).
@@ -544,9 +558,15 @@ class ChromaBackend(BaseBackend):
Deprecated-ish: exposed for legacy long-lived callers that manage their Deprecated-ish: exposed for legacy long-lived callers that manage their
own client cache. New code should obtain a collection through own client cache. New code should obtain a collection through
:meth:`get_collection` which manages caching internally. :meth:`get_collection` which manages caching internally.
Quarantines stale HNSW segments **once per palace per process**. See
:attr:`_quarantined_paths` for the rationale (cold-start protection
vs. runtime thrash on steady-write daemons).
""" """
_fix_blob_seq_ids(palace_path) _fix_blob_seq_ids(palace_path)
quarantine_stale_hnsw(palace_path) if palace_path not in ChromaBackend._quarantined_paths:
quarantine_stale_hnsw(palace_path)
ChromaBackend._quarantined_paths.add(palace_path)
return chromadb.PersistentClient(path=palace_path) return chromadb.PersistentClient(path=palace_path)
@staticmethod @staticmethod
+8
View File
@@ -46,6 +46,14 @@ def _reset_mcp_cache():
mcp_server._collection_cache = None mcp_server._collection_cache = None
except (ImportError, AttributeError): except (ImportError, AttributeError):
pass pass
try:
# Reset the per-process quarantine gate so tests don't leak
# state through ChromaBackend._quarantined_paths.
from mempalace.backends.chroma import ChromaBackend
ChromaBackend._quarantined_paths.clear()
except (ImportError, AttributeError):
pass
_clear_cache() _clear_cache()
yield yield
+64
View File
@@ -1,5 +1,6 @@
import os import os
import sqlite3 import sqlite3
from pathlib import Path
import chromadb import chromadb
import pytest import pytest
@@ -446,6 +447,69 @@ def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
assert drift.exists() assert drift.exists()
# ── make_client cold-start gate ──────────────────────────────────────────
def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch):
"""Quarantine fires on first ``make_client()`` for a palace, then is
skipped on subsequent calls — prevents runtime thrash where a daemon's
own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes,
making the mtime heuristic falsely trigger every reconnect."""
from mempalace.backends.chroma import ChromaBackend
palace_path = str(tmp_path / "palace")
os.makedirs(palace_path, exist_ok=True)
(Path(palace_path) / "chroma.sqlite3").write_text("")
# Reset the per-process cache so this test is independent of others.
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
calls: list[str] = []
def _spy(path, stale_seconds=300.0):
calls.append(path)
return []
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
ChromaBackend.make_client(palace_path)
ChromaBackend.make_client(palace_path)
ChromaBackend.make_client(palace_path)
assert calls == [palace_path], (
"quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
)
def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
"""Two distinct palaces each get one quarantine attempt — the gate is
keyed by palace path, not global."""
from mempalace.backends.chroma import ChromaBackend
palace_a = str(tmp_path / "palace_a")
palace_b = str(tmp_path / "palace_b")
for p in (palace_a, palace_b):
os.makedirs(p, exist_ok=True)
(Path(p) / "chroma.sqlite3").write_text("")
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
calls: list[str] = []
def _spy(path, stale_seconds=300.0):
calls.append(path)
return []
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
ChromaBackend.make_client(palace_a)
ChromaBackend.make_client(palace_b)
ChromaBackend.make_client(palace_a) # already gated
ChromaBackend.make_client(palace_b) # already gated
assert calls == [palace_a, palace_b]
# ── _pin_hnsw_threads ───────────────────────────────────────────────────── # ── _pin_hnsw_threads ─────────────────────────────────────────────────────