Merge pull request #1173 from jphein/fix/quarantine-on-make-client
fix: call quarantine_stale_hnsw() in make_client(); lower threshold to 5min
This commit is contained in:
+135
-26
@@ -49,41 +49,105 @@ def _validate_where(where: Optional[dict]) -> None:
|
|||||||
stack.extend(x for x in v if isinstance(x, dict))
|
stack.extend(x for x in v if isinstance(x, dict))
|
||||||
|
|
||||||
|
|
||||||
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]:
|
def _segment_appears_healthy(seg_dir: str) -> bool:
|
||||||
"""Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3.
|
"""Return True if a chromadb HNSW segment dir looks intact.
|
||||||
|
|
||||||
When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk
|
Sniff-tests the chromadb-written segment metadata file
|
||||||
HNSW segment is significantly older than ``chroma.sqlite3``, the Rust
|
(``index_metadata.pickle``) for its expected format bytes without
|
||||||
graph-walk can dereference dangling neighbor pointers for entries that
|
parsing it. ChromaDB writes that file after a successful HNSW flush;
|
||||||
exist in the metadata segment but not in the HNSW index, and segfault
|
a complete write starts with byte ``0x80`` and ends with byte
|
||||||
in a background thread on the next ``count()`` or ``query(...)`` call.
|
``0x2e`` (the protocol/terminator byte sequence chromadb serializes
|
||||||
|
with). If both bytes are present and the file is non-trivially sized,
|
||||||
|
chromadb will load the segment cleanly even when its on-disk mtime
|
||||||
|
trails ``chroma.sqlite3`` — which is the *steady state* under
|
||||||
|
chromadb 1.5.x's async batched flush, not corruption.
|
||||||
|
|
||||||
This is the same failure mode reported at #823 (semantic search stale
|
A missing metadata file is treated as "fresh / never-flushed" and
|
||||||
|
considered healthy. Renaming an empty dir orphans nothing, and a
|
||||||
|
real corruption case manifests as a present-but-malformed file or a
|
||||||
|
chromadb load error caught downstream by palace-daemon's
|
||||||
|
``_auto_repair`` retry path.
|
||||||
|
|
||||||
|
Deliberately format-sniffs only; never deserializes. Deserialization
|
||||||
|
can execute arbitrary code, and the byte-sniff is sufficient to
|
||||||
|
distinguish a complete write from truncation, zero-fill, or
|
||||||
|
partial-flush corruption.
|
||||||
|
|
||||||
|
Assumes pickle protocol >= 2 (``0x80`` PROTO marker). Matches what
|
||||||
|
chromadb writes today; if a future chromadb version emits protocol
|
||||||
|
0/1 segments, this check would start returning False on healthy
|
||||||
|
files and quarantine_stale_hnsw would conservatively rename them
|
||||||
|
out of the way (lazy rebuild on next open recovers).
|
||||||
|
"""
|
||||||
|
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
|
||||||
|
if not os.path.isfile(meta_path):
|
||||||
|
# No metadata file yet — segment hasn't flushed (fresh / empty).
|
||||||
|
# Renaming would orphan nothing; consider healthy.
|
||||||
|
return True
|
||||||
|
try:
|
||||||
|
size = os.path.getsize(meta_path)
|
||||||
|
# A real chromadb metadata file is at least tens of bytes; a
|
||||||
|
# smaller-than-floor file is almost certainly truncated.
|
||||||
|
if size < 16:
|
||||||
|
return False
|
||||||
|
with open(meta_path, "rb") as f:
|
||||||
|
head = f.read(2)
|
||||||
|
f.seek(-1, 2) # last byte
|
||||||
|
tail = f.read(1)
|
||||||
|
except OSError:
|
||||||
|
return False
|
||||||
|
return len(head) == 2 and head[0] == 0x80 and tail == b"\x2e"
|
||||||
|
|
||||||
|
|
||||||
|
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
|
||||||
|
"""Rename HNSW segment dirs that are both stale-by-mtime AND fail an
|
||||||
|
integrity sniff-test.
|
||||||
|
|
||||||
|
Catches the segfault failure mode from #823 (semantic search stale
|
||||||
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
|
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
|
||||||
``count()`` with chromadb 1.5.5), and acknowledged as by-design at
|
``count()`` with chromadb 1.5.5), and acknowledged as by-design at
|
||||||
chroma-core/chroma#2594. On one fork palace (135K drawers), the drift
|
chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
|
||||||
caused a 65–85% crash rate on fresh-process opens; fresh-process
|
rebuild lazily on next open instead of segfaulting.
|
||||||
crash rate dropped to 0% after the segment dir was renamed out of the
|
|
||||||
way and ChromaDB rebuilt lazily.
|
|
||||||
|
|
||||||
Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer
|
Two-stage check:
|
||||||
than the segment's ``data_level0.bin``, the segment is considered
|
|
||||||
suspect and renamed to ``<uuid>.drift-<timestamp>``. ChromaDB reopens
|
|
||||||
cleanly without it and writes fresh index files on next use. The
|
|
||||||
original directory is renamed, not deleted, so recovery remains
|
|
||||||
possible if the heuristic misfires.
|
|
||||||
|
|
||||||
The default threshold (1h) is deliberately conservative — ChromaDB's
|
1. **mtime gate.** If ``chroma.sqlite3`` is less than
|
||||||
HNSW flush cadence means legitimate drift is normally on the order of
|
``stale_seconds`` newer than the segment's ``data_level0.bin``,
|
||||||
seconds to minutes. A segment that is more than an hour out of date is
|
skip — chromadb is in normal write-path territory.
|
||||||
almost certainly in a "crashed mid-write" state.
|
|
||||||
|
2. **Integrity gate** (``_segment_appears_healthy``). Even when the
|
||||||
|
mtime gap exceeds the threshold, a segment whose
|
||||||
|
``index_metadata.pickle`` passes a format sniff-test is healthy:
|
||||||
|
chromadb 1.5.x flushes HNSW state asynchronously and a clean
|
||||||
|
shutdown does NOT force-flush, so the on-disk HNSW is *always*
|
||||||
|
somewhat older than ``chroma.sqlite3``. Production observation
|
||||||
|
(2026-04-26 disks daemon): three of three segments quarantined
|
||||||
|
on every cold start, with 538-557s gaps, leaving the 151K-drawer
|
||||||
|
palace with vector_ranked=0 until rebuild. Renaming a healthy
|
||||||
|
segment based on mtime alone destroys a valid index — chromadb
|
||||||
|
creates an empty replacement, orphaning every drawer in sqlite
|
||||||
|
from vector recall until the operator runs ``mempalace repair
|
||||||
|
--mode rebuild`` (15+ min on a 151K palace).
|
||||||
|
|
||||||
|
Only segments that pass stage 1 (suspiciously stale) AND fail stage
|
||||||
|
2 (metadata file truncated, zero-filled, or absent-with-data) are
|
||||||
|
renamed to ``<uuid>.drift-<timestamp>``. The original directory is
|
||||||
|
renamed, not deleted, so recovery remains possible if the heuristic
|
||||||
|
misfires.
|
||||||
|
|
||||||
|
The default threshold (5 min) is advisory under daemon-strict; the
|
||||||
|
integrity gate is what actually distinguishes corruption from flush
|
||||||
|
lag. The threshold still matters for the cross-machine replication
|
||||||
|
case (#823), where it bounds how stale a Syncthing-replicated
|
||||||
|
segment can be before we look harder at it.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
palace_path: path to the palace directory containing ``chroma.sqlite3``
|
palace_path: path to the palace directory containing ``chroma.sqlite3``
|
||||||
stale_seconds: minimum mtime gap to treat a segment as stale
|
stale_seconds: minimum mtime gap to *consider* a segment for quarantine
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of paths that were quarantined (empty if nothing drifted).
|
List of paths that were quarantined (empty if nothing actually
|
||||||
|
looked corrupt).
|
||||||
"""
|
"""
|
||||||
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||||
if not os.path.isfile(db_path):
|
if not os.path.isfile(db_path):
|
||||||
@@ -114,19 +178,35 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li
|
|||||||
continue
|
continue
|
||||||
if sqlite_mtime - hnsw_mtime < stale_seconds:
|
if sqlite_mtime - hnsw_mtime < stale_seconds:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Stage 2: integrity gate. mtime drift is necessary but not
|
||||||
|
# sufficient — chromadb's async flush makes drift the steady-
|
||||||
|
# state condition. A healthy segment metadata file proves
|
||||||
|
# chromadb can open the segment without segfault; don't
|
||||||
|
# quarantine a healthy index.
|
||||||
|
if _segment_appears_healthy(seg_dir):
|
||||||
|
logger.info(
|
||||||
|
"HNSW mtime gap %.0fs on %s exceeds threshold but segment "
|
||||||
|
"metadata file is intact — flush-lag, not corruption. "
|
||||||
|
"Leaving in place.",
|
||||||
|
sqlite_mtime - hnsw_mtime,
|
||||||
|
seg_dir,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
|
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||||
target = f"{seg_dir}.drift-{stamp}"
|
target = f"{seg_dir}.drift-{stamp}"
|
||||||
try:
|
try:
|
||||||
os.rename(seg_dir, target)
|
os.rename(seg_dir, target)
|
||||||
moved.append(target)
|
moved.append(target)
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s",
|
"Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
|
||||||
seg_dir,
|
seg_dir,
|
||||||
sqlite_mtime - hnsw_mtime,
|
sqlite_mtime - hnsw_mtime,
|
||||||
target,
|
target,
|
||||||
)
|
)
|
||||||
except OSError:
|
except OSError:
|
||||||
logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir)
|
logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
|
||||||
return moved
|
return moved
|
||||||
|
|
||||||
|
|
||||||
@@ -535,6 +615,28 @@ class ChromaBackend(BaseBackend):
|
|||||||
# Public static helpers (legacy; prefer :meth:`get_collection`)
|
# Public static helpers (legacy; prefer :meth:`get_collection`)
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Per-process record of palaces that have already had quarantine_stale_hnsw
|
||||||
|
# invoked at least once. The proactive drift check is a *cold-start*
|
||||||
|
# protection — it catches HNSW segments that arrived stale relative to
|
||||||
|
# ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore,
|
||||||
|
# crashed-mid-write). Once a long-running process has opened the palace
|
||||||
|
# cleanly, re-firing on every reconnect is a *runtime thrash*: the
|
||||||
|
# daemon's own writes bump sqlite mtime but HNSW flushes batch on
|
||||||
|
# chromadb's internal cadence, so the mtime gap naturally exceeds the
|
||||||
|
# threshold under steady write load even though nothing is corrupt.
|
||||||
|
# Real runtime drift is still handled — palace-daemon's ``_auto_repair``
|
||||||
|
# calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
|
||||||
|
# which bypasses this gate.
|
||||||
|
#
|
||||||
|
# Thread-safety: this set is mutated without a lock. Two concurrent
|
||||||
|
# ``make_client()`` calls for the same palace can both pass the
|
||||||
|
# membership check and both invoke ``quarantine_stale_hnsw``. That's
|
||||||
|
# safe because the function is idempotent (mtime check + timestamped
|
||||||
|
# rename of distinct directories), so the worst-case race produces
|
||||||
|
# one redundant rename attempt that no-ops. Idempotency is the
|
||||||
|
# safety property; locking would add cost without correctness gain.
|
||||||
|
_quarantined_paths: set[str] = set()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_client(palace_path: str):
|
def make_client(palace_path: str):
|
||||||
"""Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first).
|
"""Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first).
|
||||||
@@ -542,8 +644,15 @@ class ChromaBackend(BaseBackend):
|
|||||||
Deprecated-ish: exposed for legacy long-lived callers that manage their
|
Deprecated-ish: exposed for legacy long-lived callers that manage their
|
||||||
own client cache. New code should obtain a collection through
|
own client cache. New code should obtain a collection through
|
||||||
:meth:`get_collection` which manages caching internally.
|
:meth:`get_collection` which manages caching internally.
|
||||||
|
|
||||||
|
Quarantines stale HNSW segments **once per palace per process**. See
|
||||||
|
:attr:`_quarantined_paths` for the rationale (cold-start protection
|
||||||
|
vs. runtime thrash on steady-write daemons).
|
||||||
"""
|
"""
|
||||||
_fix_blob_seq_ids(palace_path)
|
_fix_blob_seq_ids(palace_path)
|
||||||
|
if palace_path not in ChromaBackend._quarantined_paths:
|
||||||
|
quarantine_stale_hnsw(palace_path)
|
||||||
|
ChromaBackend._quarantined_paths.add(palace_path)
|
||||||
return chromadb.PersistentClient(path=palace_path)
|
return chromadb.PersistentClient(path=palace_path)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
@@ -46,6 +46,14 @@ def _reset_mcp_cache():
|
|||||||
mcp_server._collection_cache = None
|
mcp_server._collection_cache = None
|
||||||
except (ImportError, AttributeError):
|
except (ImportError, AttributeError):
|
||||||
pass
|
pass
|
||||||
|
try:
|
||||||
|
# Reset the per-process quarantine gate so tests don't leak
|
||||||
|
# state through ChromaBackend._quarantined_paths.
|
||||||
|
from mempalace.backends.chroma import ChromaBackend
|
||||||
|
|
||||||
|
ChromaBackend._quarantined_paths.clear()
|
||||||
|
except (ImportError, AttributeError):
|
||||||
|
pass
|
||||||
|
|
||||||
_clear_cache()
|
_clear_cache()
|
||||||
yield
|
yield
|
||||||
|
|||||||
+138
-8
@@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import chromadb
|
import chromadb
|
||||||
import pytest
|
import pytest
|
||||||
@@ -384,36 +385,102 @@ def test_fix_blob_seq_ids_noop_without_database(tmp_path):
|
|||||||
# ── quarantine_stale_hnsw ─────────────────────────────────────────────────
|
# ── quarantine_stale_hnsw ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime):
|
# Marker bytes for the chromadb segment metadata file. A complete
|
||||||
"""Helper: build a palace dir with one HNSW segment + sqlite at given mtimes."""
|
# write begins with PROTO opcode (0x80) and ends with STOP opcode
|
||||||
|
# (0x2e); _segment_appears_healthy sniffs these bytes without parsing
|
||||||
|
# the file.
|
||||||
|
_HEALTHY_META = b"\x80\x04" + b"\x00" * 32 + b"\x2e"
|
||||||
|
_CORRUPT_META = b"\x00" * 64
|
||||||
|
|
||||||
|
|
||||||
|
def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META):
|
||||||
|
"""Helper: build a palace dir with one HNSW segment + sqlite at given
|
||||||
|
mtimes. ``meta_bytes`` controls whether the segment looks healthy
|
||||||
|
(default), corrupt (``_CORRUPT_META``), or has no metadata file at
|
||||||
|
all (``None``)."""
|
||||||
palace = tmp_path / "palace"
|
palace = tmp_path / "palace"
|
||||||
palace.mkdir()
|
palace.mkdir()
|
||||||
(palace / "chroma.sqlite3").write_text("")
|
(palace / "chroma.sqlite3").write_text("")
|
||||||
seg = palace / "abcd-1234-5678"
|
seg = palace / "abcd-1234-5678"
|
||||||
seg.mkdir()
|
seg.mkdir()
|
||||||
(seg / "data_level0.bin").write_text("")
|
(seg / "data_level0.bin").write_text("")
|
||||||
|
if meta_bytes is not None:
|
||||||
|
(seg / "index_metadata.pickle").write_bytes(meta_bytes)
|
||||||
os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime))
|
os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime))
|
||||||
os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime))
|
os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime))
|
||||||
return palace, seg
|
return palace, seg
|
||||||
|
|
||||||
|
|
||||||
def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path):
|
def test_quarantine_stale_hnsw_renames_corrupt_segment(tmp_path):
|
||||||
"""Segment whose data_level0.bin is 2h older than sqlite gets renamed."""
|
"""Segment with stale mtime AND a malformed metadata file gets renamed."""
|
||||||
now = 1_700_000_000.0
|
now = 1_700_000_000.0
|
||||||
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now)
|
palace, seg = _make_palace_with_segment(
|
||||||
|
tmp_path,
|
||||||
|
hnsw_mtime=now - 7200,
|
||||||
|
sqlite_mtime=now,
|
||||||
|
meta_bytes=_CORRUPT_META,
|
||||||
|
)
|
||||||
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||||
assert len(moved) == 1
|
assert len(moved) == 1
|
||||||
assert ".drift-" in moved[0]
|
assert ".drift-" in moved[0]
|
||||||
assert not seg.exists()
|
assert not seg.exists()
|
||||||
# the renamed directory still exists and contains the original file
|
|
||||||
renamed = list(palace.iterdir())
|
renamed = list(palace.iterdir())
|
||||||
drift_dirs = [p for p in renamed if ".drift-" in p.name]
|
drift_dirs = [p for p in renamed if ".drift-" in p.name]
|
||||||
assert len(drift_dirs) == 1
|
assert len(drift_dirs) == 1
|
||||||
assert (drift_dirs[0] / "data_level0.bin").exists()
|
assert (drift_dirs[0] / "data_level0.bin").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone(tmp_path):
|
||||||
|
"""Segment with stale mtime but a complete metadata file is NOT
|
||||||
|
renamed — this is the chromadb-1.5.x async-flush steady state, not
|
||||||
|
corruption. Production case at 06:24 PDT 2026-04-26: cold-start
|
||||||
|
quarantine renamed three healthy segments after a clean shutdown,
|
||||||
|
leaving 151K-drawer palace with vector_ranked=0."""
|
||||||
|
now = 1_700_000_000.0
|
||||||
|
palace, seg = _make_palace_with_segment(
|
||||||
|
tmp_path,
|
||||||
|
hnsw_mtime=now - 7200,
|
||||||
|
sqlite_mtime=now,
|
||||||
|
meta_bytes=_HEALTHY_META,
|
||||||
|
)
|
||||||
|
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||||
|
assert moved == []
|
||||||
|
assert seg.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path):
|
||||||
|
"""Segment with no metadata file is treated as fresh / never-flushed
|
||||||
|
and not quarantined — renaming an empty dir orphans nothing."""
|
||||||
|
now = 1_700_000_000.0
|
||||||
|
palace, seg = _make_palace_with_segment(
|
||||||
|
tmp_path,
|
||||||
|
hnsw_mtime=now - 7200,
|
||||||
|
sqlite_mtime=now,
|
||||||
|
meta_bytes=None,
|
||||||
|
)
|
||||||
|
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||||
|
assert moved == []
|
||||||
|
assert seg.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path):
|
||||||
|
"""Segment with a truncated (under-floor-size) metadata file is
|
||||||
|
quarantined — shape of a partial-flush during process kill."""
|
||||||
|
now = 1_700_000_000.0
|
||||||
|
palace, seg = _make_palace_with_segment(
|
||||||
|
tmp_path,
|
||||||
|
hnsw_mtime=now - 7200,
|
||||||
|
sqlite_mtime=now,
|
||||||
|
meta_bytes=b"\x80\x04",
|
||||||
|
)
|
||||||
|
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||||
|
assert len(moved) == 1
|
||||||
|
assert ".drift-" in moved[0]
|
||||||
|
|
||||||
|
|
||||||
def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path):
|
def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path):
|
||||||
"""Segment with recent mtime vs sqlite is not touched."""
|
"""Segment with recent mtime vs sqlite is not touched (mtime gate
|
||||||
|
short-circuits before integrity gate)."""
|
||||||
now = 1_700_000_000.0
|
now = 1_700_000_000.0
|
||||||
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now)
|
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now)
|
||||||
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||||
@@ -446,7 +513,70 @@ def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
|
|||||||
assert drift.exists()
|
assert drift.exists()
|
||||||
|
|
||||||
|
|
||||||
# ── _pin_hnsw_threads ─────────────────────────────────────────────────────
|
# ── make_client cold-start gate ──────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch):
|
||||||
|
"""Quarantine fires on first ``make_client()`` for a palace, then is
|
||||||
|
skipped on subsequent calls — prevents runtime thrash where a daemon's
|
||||||
|
own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes,
|
||||||
|
making the mtime heuristic falsely trigger every reconnect."""
|
||||||
|
from mempalace.backends.chroma import ChromaBackend
|
||||||
|
|
||||||
|
palace_path = str(tmp_path / "palace")
|
||||||
|
os.makedirs(palace_path, exist_ok=True)
|
||||||
|
(Path(palace_path) / "chroma.sqlite3").write_text("")
|
||||||
|
|
||||||
|
# Reset the per-process cache so this test is independent of others.
|
||||||
|
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
|
||||||
|
|
||||||
|
calls: list[str] = []
|
||||||
|
|
||||||
|
def _spy(path, stale_seconds=300.0):
|
||||||
|
calls.append(path)
|
||||||
|
return []
|
||||||
|
|
||||||
|
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
|
||||||
|
|
||||||
|
ChromaBackend.make_client(palace_path)
|
||||||
|
ChromaBackend.make_client(palace_path)
|
||||||
|
ChromaBackend.make_client(palace_path)
|
||||||
|
|
||||||
|
assert calls == [
|
||||||
|
palace_path
|
||||||
|
], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
|
||||||
|
|
||||||
|
|
||||||
|
def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
|
||||||
|
"""Two distinct palaces each get one quarantine attempt — the gate is
|
||||||
|
keyed by palace path, not global."""
|
||||||
|
from mempalace.backends.chroma import ChromaBackend
|
||||||
|
|
||||||
|
palace_a = str(tmp_path / "palace_a")
|
||||||
|
palace_b = str(tmp_path / "palace_b")
|
||||||
|
for p in (palace_a, palace_b):
|
||||||
|
os.makedirs(p, exist_ok=True)
|
||||||
|
(Path(p) / "chroma.sqlite3").write_text("")
|
||||||
|
|
||||||
|
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
|
||||||
|
|
||||||
|
calls: list[str] = []
|
||||||
|
|
||||||
|
def _spy(path, stale_seconds=300.0):
|
||||||
|
calls.append(path)
|
||||||
|
return []
|
||||||
|
|
||||||
|
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
|
||||||
|
|
||||||
|
ChromaBackend.make_client(palace_a)
|
||||||
|
ChromaBackend.make_client(palace_b)
|
||||||
|
ChromaBackend.make_client(palace_a) # already gated
|
||||||
|
ChromaBackend.make_client(palace_b) # already gated
|
||||||
|
|
||||||
|
assert calls == [palace_a, palace_b]
|
||||||
|
|
||||||
|
|
||||||
|
# ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ──
|
||||||
|
|
||||||
|
|
||||||
def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path):
|
def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path):
|
||||||
|
|||||||
Reference in New Issue
Block a user