diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index fee7d18..22e9bf2 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -537,6 +537,20 @@ class ChromaBackend(BaseBackend): # Public static helpers (legacy; prefer :meth:`get_collection`) # ------------------------------------------------------------------ + # Per-process record of palaces that have already had quarantine_stale_hnsw + # invoked at least once. The proactive drift check is a *cold-start* + # protection — it catches HNSW segments that arrived stale relative to + # ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore, + # crashed-mid-write). Once a long-running process has opened the palace + # cleanly, re-firing on every reconnect is a *runtime thrash*: the + # daemon's own writes bump sqlite mtime but HNSW flushes batch on + # chromadb's internal cadence, so the mtime gap naturally exceeds the + # threshold under steady write load even though nothing is corrupt. + # Real runtime drift is still handled — palace-daemon's ``_auto_repair`` + # calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors, + # which bypasses this gate. + _quarantined_paths: set[str] = set() + @staticmethod def make_client(palace_path: str): """Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first). @@ -544,9 +558,15 @@ class ChromaBackend(BaseBackend): Deprecated-ish: exposed for legacy long-lived callers that manage their own client cache. New code should obtain a collection through :meth:`get_collection` which manages caching internally. + + Quarantines stale HNSW segments **once per palace per process**. See + :attr:`_quarantined_paths` for the rationale (cold-start protection + vs. runtime thrash on steady-write daemons). """ _fix_blob_seq_ids(palace_path) - quarantine_stale_hnsw(palace_path) + if palace_path not in ChromaBackend._quarantined_paths: + quarantine_stale_hnsw(palace_path) + ChromaBackend._quarantined_paths.add(palace_path) return chromadb.PersistentClient(path=palace_path) @staticmethod diff --git a/tests/conftest.py b/tests/conftest.py index 7b2bb77..4ed82ca 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -46,6 +46,14 @@ def _reset_mcp_cache(): mcp_server._collection_cache = None except (ImportError, AttributeError): pass + try: + # Reset the per-process quarantine gate so tests don't leak + # state through ChromaBackend._quarantined_paths. + from mempalace.backends.chroma import ChromaBackend + + ChromaBackend._quarantined_paths.clear() + except (ImportError, AttributeError): + pass _clear_cache() yield diff --git a/tests/test_backends.py b/tests/test_backends.py index e47eb6f..2b83314 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -1,5 +1,6 @@ import os import sqlite3 +from pathlib import Path import chromadb import pytest @@ -446,6 +447,69 @@ def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path): assert drift.exists() +# ── make_client cold-start gate ────────────────────────────────────────── + + +def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch): + """Quarantine fires on first ``make_client()`` for a palace, then is + skipped on subsequent calls — prevents runtime thrash where a daemon's + own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes, + making the mtime heuristic falsely trigger every reconnect.""" + from mempalace.backends.chroma import ChromaBackend + + palace_path = str(tmp_path / "palace") + os.makedirs(palace_path, exist_ok=True) + (Path(palace_path) / "chroma.sqlite3").write_text("") + + # Reset the per-process cache so this test is independent of others. + monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set()) + + calls: list[str] = [] + + def _spy(path, stale_seconds=300.0): + calls.append(path) + return [] + + monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy) + + ChromaBackend.make_client(palace_path) + ChromaBackend.make_client(palace_path) + ChromaBackend.make_client(palace_path) + + assert calls == [palace_path], ( + "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect" + ) + + +def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch): + """Two distinct palaces each get one quarantine attempt — the gate is + keyed by palace path, not global.""" + from mempalace.backends.chroma import ChromaBackend + + palace_a = str(tmp_path / "palace_a") + palace_b = str(tmp_path / "palace_b") + for p in (palace_a, palace_b): + os.makedirs(p, exist_ok=True) + (Path(p) / "chroma.sqlite3").write_text("") + + monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set()) + + calls: list[str] = [] + + def _spy(path, stale_seconds=300.0): + calls.append(path) + return [] + + monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy) + + ChromaBackend.make_client(palace_a) + ChromaBackend.make_client(palace_b) + ChromaBackend.make_client(palace_a) # already gated + ChromaBackend.make_client(palace_b) # already gated + + assert calls == [palace_a, palace_b] + + # ── _pin_hnsw_threads ─────────────────────────────────────────────────────