Merge pull request #1322 from MemPalace/fix/1121-1132-1263-client-quarantine
fix(backends/chroma): wire quarantine_stale_hnsw into _client() (#1121 #1132 #1263)
This commit is contained in:
@@ -993,7 +993,7 @@ class ChromaBackend(BaseBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if cached is None or inode_changed or mtime_changed or mtime_appeared:
|
if cached is None or inode_changed or mtime_changed or mtime_appeared:
|
||||||
_fix_blob_seq_ids(palace_path)
|
ChromaBackend._prepare_palace_for_open(palace_path)
|
||||||
cached = chromadb.PersistentClient(path=palace_path)
|
cached = chromadb.PersistentClient(path=palace_path)
|
||||||
self._clients[palace_path] = cached
|
self._clients[palace_path] = cached
|
||||||
# Re-stat after the client constructor runs: chromadb creates
|
# Re-stat after the client constructor runs: chromadb creates
|
||||||
@@ -1028,6 +1028,31 @@ class ChromaBackend(BaseBackend):
|
|||||||
# safety property; locking would add cost without correctness gain.
|
# safety property; locking would add cost without correctness gain.
|
||||||
_quarantined_paths: set[str] = set()
|
_quarantined_paths: set[str] = set()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _prepare_palace_for_open(palace_path: str) -> None:
|
||||||
|
"""Run the pre-open safety pass shared by :meth:`make_client` and
|
||||||
|
:meth:`_client`.
|
||||||
|
|
||||||
|
Two steps, both required before constructing a ``PersistentClient``:
|
||||||
|
|
||||||
|
1. ``_fix_blob_seq_ids`` — repairs the BLOB seq_id quirk that bites
|
||||||
|
certain chromadb migrations.
|
||||||
|
2. ``quarantine_stale_hnsw`` — gated by :attr:`_quarantined_paths` so
|
||||||
|
it fires once per palace per process. This is the SIGSEGV
|
||||||
|
prevention path for stale HNSW segments (see #1121, #1132, #1263);
|
||||||
|
wiring it through this helper means CLI mining, search, repair,
|
||||||
|
and status all benefit, not just the legacy ``make_client``
|
||||||
|
callers.
|
||||||
|
|
||||||
|
Idempotent: safe to call from any code path that is about to open or
|
||||||
|
re-open a palace. The ``_quarantined_paths`` gate prevents thrash on
|
||||||
|
hot paths (e.g. ``_client()`` is called on every backend operation).
|
||||||
|
"""
|
||||||
|
_fix_blob_seq_ids(palace_path)
|
||||||
|
if palace_path not in ChromaBackend._quarantined_paths:
|
||||||
|
quarantine_stale_hnsw(palace_path)
|
||||||
|
ChromaBackend._quarantined_paths.add(palace_path)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_client(palace_path: str):
|
def make_client(palace_path: str):
|
||||||
"""Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first).
|
"""Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first).
|
||||||
@@ -1040,10 +1065,7 @@ class ChromaBackend(BaseBackend):
|
|||||||
:attr:`_quarantined_paths` for the rationale (cold-start protection
|
:attr:`_quarantined_paths` for the rationale (cold-start protection
|
||||||
vs. runtime thrash on steady-write daemons).
|
vs. runtime thrash on steady-write daemons).
|
||||||
"""
|
"""
|
||||||
_fix_blob_seq_ids(palace_path)
|
ChromaBackend._prepare_palace_for_open(palace_path)
|
||||||
if palace_path not in ChromaBackend._quarantined_paths:
|
|
||||||
quarantine_stale_hnsw(palace_path)
|
|
||||||
ChromaBackend._quarantined_paths.add(palace_path)
|
|
||||||
return chromadb.PersistentClient(path=palace_path)
|
return chromadb.PersistentClient(path=palace_path)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
@@ -764,6 +764,67 @@ def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch
|
|||||||
assert calls == [palace_a, palace_b]
|
assert calls == [palace_a, palace_b]
|
||||||
|
|
||||||
|
|
||||||
|
# ── _client() cold-start gate (#1121, #1132, #1263) ──────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_client_quarantines_corrupt_segment_on_first_open(tmp_path, monkeypatch):
|
||||||
|
"""The instance ``_client()`` path must run ``quarantine_stale_hnsw``
|
||||||
|
on first open, mirroring the ``make_client()`` static helper. Before
|
||||||
|
PR #1173's wiring was extended here, CLI mining / search / repair /
|
||||||
|
status all skipped the quarantine pass and would SIGSEGV on a stale
|
||||||
|
HNSW segment (#1121, #1132, #1263)."""
|
||||||
|
now = 1_700_000_000.0
|
||||||
|
palace, seg = _make_palace_with_segment(
|
||||||
|
tmp_path,
|
||||||
|
hnsw_mtime=now - 7200,
|
||||||
|
sqlite_mtime=now,
|
||||||
|
meta_bytes=_CORRUPT_META,
|
||||||
|
)
|
||||||
|
|
||||||
|
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
|
||||||
|
|
||||||
|
backend = ChromaBackend()
|
||||||
|
try:
|
||||||
|
backend._client(str(palace))
|
||||||
|
finally:
|
||||||
|
backend.close()
|
||||||
|
|
||||||
|
assert not seg.exists(), "_client() should have quarantined the corrupt segment"
|
||||||
|
drift_dirs = [p for p in palace.iterdir() if ".drift-" in p.name]
|
||||||
|
assert len(drift_dirs) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch):
|
||||||
|
"""Repeated ``_client()`` calls for the same palace re-run quarantine
|
||||||
|
at most once — the ``_quarantined_paths`` gate prevents runtime
|
||||||
|
thrash on hot paths (``_client()`` is hit on every backend op)."""
|
||||||
|
palace_path = str(tmp_path / "palace")
|
||||||
|
os.makedirs(palace_path, exist_ok=True)
|
||||||
|
(Path(palace_path) / "chroma.sqlite3").write_text("")
|
||||||
|
|
||||||
|
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
|
||||||
|
|
||||||
|
calls: list[str] = []
|
||||||
|
|
||||||
|
def _spy(path, stale_seconds=300.0):
|
||||||
|
calls.append(path)
|
||||||
|
return []
|
||||||
|
|
||||||
|
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
|
||||||
|
|
||||||
|
backend = ChromaBackend()
|
||||||
|
try:
|
||||||
|
backend._client(palace_path)
|
||||||
|
backend._client(palace_path)
|
||||||
|
backend._client(palace_path)
|
||||||
|
finally:
|
||||||
|
backend.close()
|
||||||
|
|
||||||
|
assert (
|
||||||
|
calls == [palace_path]
|
||||||
|
), "quarantine_stale_hnsw should fire once per palace per process from _client(), not on every call"
|
||||||
|
|
||||||
|
|
||||||
# ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ──
|
# ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ──
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user