diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index e73ed41..9812440 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -32,6 +32,51 @@ _REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$c _OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"}) _SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS +# A healthy HNSW payload should keep link_lists.bin proportional to +# data_level0.bin. When link_lists.bin grows orders of magnitude larger than +# data_level0.bin, Chroma/HNSW can segfault while opening the segment even if +# index_metadata.pickle is structurally valid. +# +# The report in #1218 showed ratios above 300x, while healthy snapshots were far below 1x. +# Treat only >10x as corruption so normal flush lag or small segments do not get +# quarantined. +_HNSW_LINK_TO_DATA_MAX_RATIO = 10.0 + + +def _hnsw_link_to_data_ratio(seg_dir: str) -> Optional[float]: + """Return link_lists.bin / data_level0.bin size ratio for a segment. + + ``None`` means the ratio is not meaningful, usually because one file is + missing or data_level0.bin is empty. ``float("inf")`` means the files were + present but could not be statted safely, which should be treated as + suspicious by callers. + """ + + link_path = os.path.join(seg_dir, "link_lists.bin") + data_path = os.path.join(seg_dir, "data_level0.bin") + + if not (os.path.isfile(link_path) and os.path.isfile(data_path)): + return None + + try: + data_size = os.path.getsize(data_path) + link_size = os.path.getsize(link_path) + except OSError: + return float("inf") + + if data_size <= 0: + return None + + return link_size / data_size + + +def _hnsw_payload_appears_sane(seg_dir: str) -> bool: + """Return False when HNSW payload files are structurally implausible.""" + + ratio = _hnsw_link_to_data_ratio(seg_dir) + return ratio is None or ratio <= _HNSW_LINK_TO_DATA_MAX_RATIO + + # HNSW tuning to prevent link_lists.bin bloat on large mines (#344). # # With default params (batch_size=100, sync_threshold=1000, initial capacity @@ -109,6 +154,9 @@ def _segment_appears_healthy(seg_dir: str) -> bool: files and quarantine_stale_hnsw would conservatively rename them out of the way (lazy rebuild on next open recovers). """ + if not _hnsw_payload_appears_sane(seg_dir): + return False + meta_path = os.path.join(seg_dir, "index_metadata.pickle") if not os.path.isfile(meta_path): # No metadata file yet — segment hasn't flushed (fresh / empty). @@ -130,64 +178,35 @@ def _segment_appears_healthy(seg_dir: str) -> bool: def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]: - """Rename HNSW segment dirs that are both stale-by-mtime AND fail an - integrity sniff-test. + """Rename HNSW segment dirs that look unsafe to open. - Catches the segfault failure mode from #823 (semantic search stale - after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on - ``count()`` with chromadb 1.5.5), and acknowledged as by-design at - chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb - rebuild lazily on next open instead of segfaulting. + This catches two classes of HNSW corruption before ChromaDB opens the + native segment reader: - Two-stage check: + 1. stale-by-mtime segments whose ``index_metadata.pickle`` fails the + existing format sniff-test; + 2. structurally impossible HNSW payloads where ``link_lists.bin`` is much + larger than ``data_level0.bin``. - 1. **mtime gate.** If ``chroma.sqlite3`` is less than - ``stale_seconds`` newer than the segment's ``data_level0.bin``, - skip — chromadb is in normal write-path territory. + The second check is intentionally not gated by mtime. A segment with a + 300x link/data ratio is unsafe regardless of whether its mtime is recent; + letting Chroma open it can SIGSEGV before Python fallback code runs. - 2. **Integrity gate** (``_segment_appears_healthy``). Even when the - mtime gap exceeds the threshold, a segment whose - ``index_metadata.pickle`` passes a format sniff-test is healthy: - chromadb 1.5.x flushes HNSW state asynchronously and a clean - shutdown does NOT force-flush, so the on-disk HNSW is *always* - somewhat older than ``chroma.sqlite3``. Production observation - (2026-04-26 disks daemon): three of three segments quarantined - on every cold start, with 538-557s gaps, leaving the 151K-drawer - palace with vector_ranked=0 until rebuild. Renaming a healthy - segment based on mtime alone destroys a valid index — chromadb - creates an empty replacement, orphaning every drawer in sqlite - from vector recall until the operator runs ``mempalace repair - --mode rebuild`` (15+ min on a 151K palace). - - Only segments that pass stage 1 (suspiciously stale) AND fail stage - 2 (metadata file truncated, zero-filled, or absent-with-data) are - renamed to ``.drift-``. The original directory is - renamed, not deleted, so recovery remains possible if the heuristic - misfires. - - The default threshold (5 min) is advisory under daemon-strict; the - integrity gate is what actually distinguishes corruption from flush - lag. The threshold still matters for the cross-machine replication - case (#823), where it bounds how stale a Syncthing-replicated - segment can be before we look harder at it. - - Args: - palace_path: path to the palace directory containing ``chroma.sqlite3`` - stale_seconds: minimum mtime gap to *consider* a segment for quarantine - - Returns: - List of paths that were quarantined (empty if nothing actually - looked corrupt). + The original directory is renamed, not deleted, so recovery remains + possible if the heuristic ever misfires. """ + db_path = os.path.join(palace_path, "chroma.sqlite3") if not os.path.isfile(db_path): return [] + try: sqlite_mtime = os.path.getmtime(db_path) except OSError: return [] moved: list[str] = [] + try: entries = os.listdir(palace_path) except OSError: @@ -196,29 +215,34 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis for name in entries: if "-" not in name or name.startswith(".") or ".drift-" in name: continue + seg_dir = os.path.join(palace_path, name) if not os.path.isdir(seg_dir): continue + hnsw_bin = os.path.join(seg_dir, "data_level0.bin") if not os.path.isfile(hnsw_bin): continue + try: hnsw_mtime = os.path.getmtime(hnsw_bin) except OSError: continue - if sqlite_mtime - hnsw_mtime < stale_seconds: + + payload_ratio = _hnsw_link_to_data_ratio(seg_dir) + payload_corrupt = payload_ratio is not None and payload_ratio > _HNSW_LINK_TO_DATA_MAX_RATIO + + if not payload_corrupt and sqlite_mtime - hnsw_mtime < stale_seconds: continue - # Stage 2: integrity gate. mtime drift is necessary but not - # sufficient — chromadb's async flush makes drift the steady- - # state condition. A healthy segment metadata file proves - # chromadb can open the segment without segfault; don't - # quarantine a healthy index. - if _segment_appears_healthy(seg_dir): + # Stage 2: integrity gate. Mtime drift alone is not corruption because + # Chroma flushes HNSW asynchronously. A healthy metadata file proves the + # ordinary stale-by-mtime case is just flush lag. + if not payload_corrupt and _segment_appears_healthy(seg_dir): logger.info( "HNSW mtime gap %.0fs on %s exceeds threshold but segment " - "metadata file is intact — flush-lag, not corruption. " - "Leaving in place.", + "metadata and payload size are intact — flush-lag, not " + "corruption. Leaving in place.", sqlite_mtime - hnsw_mtime, seg_dir, ) @@ -226,17 +250,30 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S") target = f"{seg_dir}.drift-{stamp}" + + if payload_corrupt: + reason = ( + f"link_lists.bin/data_level0.bin ratio {payload_ratio:.1f}x " + f"exceeds {_HNSW_LINK_TO_DATA_MAX_RATIO:.1f}x" + ) + else: + reason = ( + f"sqlite {sqlite_mtime - hnsw_mtime:.0f}s newer than HNSW " + "and integrity check failed" + ) + try: os.rename(seg_dir, target) moved.append(target) logger.warning( - "Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s", + "Quarantined corrupt HNSW segment %s (%s); renamed to %s", seg_dir, - sqlite_mtime - hnsw_mtime, + reason, target, ) except OSError: logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir) + return moved diff --git a/tests/test_hnsw_payload_health.py b/tests/test_hnsw_payload_health.py new file mode 100644 index 0000000..0af440a --- /dev/null +++ b/tests/test_hnsw_payload_health.py @@ -0,0 +1,113 @@ +import os +from pathlib import Path + +from mempalace.backends.chroma import ( + _HNSW_LINK_TO_DATA_MAX_RATIO, + _hnsw_link_to_data_ratio, + _segment_appears_healthy, + quarantine_stale_hnsw, +) + + +def _write_segment( + seg_dir: Path, + *, + data_size: int = 100, + link_size: int = 100, + write_metadata: bool = True, +) -> None: + seg_dir.mkdir(parents=True, exist_ok=True) + (seg_dir / "data_level0.bin").write_bytes(b"\0" * data_size) + (seg_dir / "link_lists.bin").write_bytes(b"\0" * link_size) + + if write_metadata: + # Enough bytes to pass the existing pickle envelope sniff-test: + # starts with pickle protocol marker 0x80 and ends with STOP 0x2e. + (seg_dir / "index_metadata.pickle").write_bytes(b"\x80" + b"x" * 16 + b"\x2e") + + +def test_hnsw_link_to_data_ratio_reports_payload_size_ratio(tmp_path): + seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555" + _write_segment(seg_dir, data_size=100, link_size=250) + + assert _hnsw_link_to_data_ratio(str(seg_dir)) == 2.5 + + +def test_segment_health_rejects_exploded_link_lists_even_with_valid_pickle(tmp_path): + seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555" + _write_segment( + seg_dir, + data_size=100, + link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)), + write_metadata=True, + ) + + assert not _segment_appears_healthy(str(seg_dir)) + + +def test_segment_health_keeps_reasonable_payload_with_valid_pickle(tmp_path): + seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555" + _write_segment( + seg_dir, + data_size=100, + link_size=int(100 * _HNSW_LINK_TO_DATA_MAX_RATIO), + write_metadata=True, + ) + + assert _segment_appears_healthy(str(seg_dir)) + + +def test_quarantine_catches_link_bloat_without_mtime_drift(tmp_path): + palace = tmp_path / "palace" + palace.mkdir() + + db_path = palace / "chroma.sqlite3" + db_path.write_text("sqlite placeholder") + + seg_dir = palace / "11111111-2222-3333-4444-555555555555" + _write_segment( + seg_dir, + data_size=100, + link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)), + write_metadata=True, + ) + + # Make sqlite and HNSW mtimes identical. The old mtime-only gate would + # skip this segment even though the payload is structurally corrupt. + same_time = 1_700_000_000 + os.utime(db_path, (same_time, same_time)) + os.utime(seg_dir / "data_level0.bin", (same_time, same_time)) + + moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999) + + assert len(moved) == 1 + assert not seg_dir.exists() + + moved_path = Path(moved[0]) + assert moved_path.exists() + assert moved_path.name.startswith("11111111-2222-3333-4444-555555555555.drift-") + + +def test_quarantine_leaves_reasonable_payload_in_place(tmp_path): + palace = tmp_path / "palace" + palace.mkdir() + + db_path = palace / "chroma.sqlite3" + db_path.write_text("sqlite placeholder") + + seg_dir = palace / "11111111-2222-3333-4444-555555555555" + _write_segment( + seg_dir, + data_size=100, + link_size=100, + write_metadata=True, + ) + + same_time = 1_700_000_000 + os.utime(db_path, (same_time, same_time)) + os.utime(seg_dir / "data_level0.bin", (same_time, same_time)) + + moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999) + + assert moved == [] + assert seg_dir.exists()