Merge pull request #1339 from fatkobra/fix/1218-hnsw-link-payload-health
fix(storage): quarantine bloated HNSW link payloads (#1218)
This commit is contained in:
@@ -32,6 +32,51 @@ _REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$c
|
||||
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
|
||||
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
|
||||
|
||||
# A healthy HNSW payload should keep link_lists.bin proportional to
|
||||
# data_level0.bin. When link_lists.bin grows orders of magnitude larger than
|
||||
# data_level0.bin, Chroma/HNSW can segfault while opening the segment even if
|
||||
# index_metadata.pickle is structurally valid.
|
||||
#
|
||||
# The report in #1218 showed ratios above 300x, while healthy snapshots were far below 1x.
|
||||
# Treat only >10x as corruption so normal flush lag or small segments do not get
|
||||
# quarantined.
|
||||
_HNSW_LINK_TO_DATA_MAX_RATIO = 10.0
|
||||
|
||||
|
||||
def _hnsw_link_to_data_ratio(seg_dir: str) -> Optional[float]:
|
||||
"""Return link_lists.bin / data_level0.bin size ratio for a segment.
|
||||
|
||||
``None`` means the ratio is not meaningful, usually because one file is
|
||||
missing or data_level0.bin is empty. ``float("inf")`` means the files were
|
||||
present but could not be statted safely, which should be treated as
|
||||
suspicious by callers.
|
||||
"""
|
||||
|
||||
link_path = os.path.join(seg_dir, "link_lists.bin")
|
||||
data_path = os.path.join(seg_dir, "data_level0.bin")
|
||||
|
||||
if not (os.path.isfile(link_path) and os.path.isfile(data_path)):
|
||||
return None
|
||||
|
||||
try:
|
||||
data_size = os.path.getsize(data_path)
|
||||
link_size = os.path.getsize(link_path)
|
||||
except OSError:
|
||||
return float("inf")
|
||||
|
||||
if data_size <= 0:
|
||||
return None
|
||||
|
||||
return link_size / data_size
|
||||
|
||||
|
||||
def _hnsw_payload_appears_sane(seg_dir: str) -> bool:
|
||||
"""Return False when HNSW payload files are structurally implausible."""
|
||||
|
||||
ratio = _hnsw_link_to_data_ratio(seg_dir)
|
||||
return ratio is None or ratio <= _HNSW_LINK_TO_DATA_MAX_RATIO
|
||||
|
||||
|
||||
# HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
|
||||
#
|
||||
# With default params (batch_size=100, sync_threshold=1000, initial capacity
|
||||
@@ -109,6 +154,9 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
|
||||
files and quarantine_stale_hnsw would conservatively rename them
|
||||
out of the way (lazy rebuild on next open recovers).
|
||||
"""
|
||||
if not _hnsw_payload_appears_sane(seg_dir):
|
||||
return False
|
||||
|
||||
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
|
||||
if not os.path.isfile(meta_path):
|
||||
# No metadata file yet — segment hasn't flushed (fresh / empty).
|
||||
@@ -130,64 +178,35 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
|
||||
|
||||
|
||||
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
|
||||
"""Rename HNSW segment dirs that are both stale-by-mtime AND fail an
|
||||
integrity sniff-test.
|
||||
"""Rename HNSW segment dirs that look unsafe to open.
|
||||
|
||||
Catches the segfault failure mode from #823 (semantic search stale
|
||||
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
|
||||
``count()`` with chromadb 1.5.5), and acknowledged as by-design at
|
||||
chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
|
||||
rebuild lazily on next open instead of segfaulting.
|
||||
This catches two classes of HNSW corruption before ChromaDB opens the
|
||||
native segment reader:
|
||||
|
||||
Two-stage check:
|
||||
1. stale-by-mtime segments whose ``index_metadata.pickle`` fails the
|
||||
existing format sniff-test;
|
||||
2. structurally impossible HNSW payloads where ``link_lists.bin`` is much
|
||||
larger than ``data_level0.bin``.
|
||||
|
||||
1. **mtime gate.** If ``chroma.sqlite3`` is less than
|
||||
``stale_seconds`` newer than the segment's ``data_level0.bin``,
|
||||
skip — chromadb is in normal write-path territory.
|
||||
The second check is intentionally not gated by mtime. A segment with a
|
||||
300x link/data ratio is unsafe regardless of whether its mtime is recent;
|
||||
letting Chroma open it can SIGSEGV before Python fallback code runs.
|
||||
|
||||
2. **Integrity gate** (``_segment_appears_healthy``). Even when the
|
||||
mtime gap exceeds the threshold, a segment whose
|
||||
``index_metadata.pickle`` passes a format sniff-test is healthy:
|
||||
chromadb 1.5.x flushes HNSW state asynchronously and a clean
|
||||
shutdown does NOT force-flush, so the on-disk HNSW is *always*
|
||||
somewhat older than ``chroma.sqlite3``. Production observation
|
||||
(2026-04-26 disks daemon): three of three segments quarantined
|
||||
on every cold start, with 538-557s gaps, leaving the 151K-drawer
|
||||
palace with vector_ranked=0 until rebuild. Renaming a healthy
|
||||
segment based on mtime alone destroys a valid index — chromadb
|
||||
creates an empty replacement, orphaning every drawer in sqlite
|
||||
from vector recall until the operator runs ``mempalace repair
|
||||
--mode rebuild`` (15+ min on a 151K palace).
|
||||
|
||||
Only segments that pass stage 1 (suspiciously stale) AND fail stage
|
||||
2 (metadata file truncated, zero-filled, or absent-with-data) are
|
||||
renamed to ``<uuid>.drift-<timestamp>``. The original directory is
|
||||
renamed, not deleted, so recovery remains possible if the heuristic
|
||||
misfires.
|
||||
|
||||
The default threshold (5 min) is advisory under daemon-strict; the
|
||||
integrity gate is what actually distinguishes corruption from flush
|
||||
lag. The threshold still matters for the cross-machine replication
|
||||
case (#823), where it bounds how stale a Syncthing-replicated
|
||||
segment can be before we look harder at it.
|
||||
|
||||
Args:
|
||||
palace_path: path to the palace directory containing ``chroma.sqlite3``
|
||||
stale_seconds: minimum mtime gap to *consider* a segment for quarantine
|
||||
|
||||
Returns:
|
||||
List of paths that were quarantined (empty if nothing actually
|
||||
looked corrupt).
|
||||
The original directory is renamed, not deleted, so recovery remains
|
||||
possible if the heuristic ever misfires.
|
||||
"""
|
||||
|
||||
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||
if not os.path.isfile(db_path):
|
||||
return []
|
||||
|
||||
try:
|
||||
sqlite_mtime = os.path.getmtime(db_path)
|
||||
except OSError:
|
||||
return []
|
||||
|
||||
moved: list[str] = []
|
||||
|
||||
try:
|
||||
entries = os.listdir(palace_path)
|
||||
except OSError:
|
||||
@@ -196,29 +215,34 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
|
||||
for name in entries:
|
||||
if "-" not in name or name.startswith(".") or ".drift-" in name:
|
||||
continue
|
||||
|
||||
seg_dir = os.path.join(palace_path, name)
|
||||
if not os.path.isdir(seg_dir):
|
||||
continue
|
||||
|
||||
hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
|
||||
if not os.path.isfile(hnsw_bin):
|
||||
continue
|
||||
|
||||
try:
|
||||
hnsw_mtime = os.path.getmtime(hnsw_bin)
|
||||
except OSError:
|
||||
continue
|
||||
if sqlite_mtime - hnsw_mtime < stale_seconds:
|
||||
|
||||
payload_ratio = _hnsw_link_to_data_ratio(seg_dir)
|
||||
payload_corrupt = payload_ratio is not None and payload_ratio > _HNSW_LINK_TO_DATA_MAX_RATIO
|
||||
|
||||
if not payload_corrupt and sqlite_mtime - hnsw_mtime < stale_seconds:
|
||||
continue
|
||||
|
||||
# Stage 2: integrity gate. mtime drift is necessary but not
|
||||
# sufficient — chromadb's async flush makes drift the steady-
|
||||
# state condition. A healthy segment metadata file proves
|
||||
# chromadb can open the segment without segfault; don't
|
||||
# quarantine a healthy index.
|
||||
if _segment_appears_healthy(seg_dir):
|
||||
# Stage 2: integrity gate. Mtime drift alone is not corruption because
|
||||
# Chroma flushes HNSW asynchronously. A healthy metadata file proves the
|
||||
# ordinary stale-by-mtime case is just flush lag.
|
||||
if not payload_corrupt and _segment_appears_healthy(seg_dir):
|
||||
logger.info(
|
||||
"HNSW mtime gap %.0fs on %s exceeds threshold but segment "
|
||||
"metadata file is intact — flush-lag, not corruption. "
|
||||
"Leaving in place.",
|
||||
"metadata and payload size are intact — flush-lag, not "
|
||||
"corruption. Leaving in place.",
|
||||
sqlite_mtime - hnsw_mtime,
|
||||
seg_dir,
|
||||
)
|
||||
@@ -226,17 +250,30 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
|
||||
|
||||
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
target = f"{seg_dir}.drift-{stamp}"
|
||||
|
||||
if payload_corrupt:
|
||||
reason = (
|
||||
f"link_lists.bin/data_level0.bin ratio {payload_ratio:.1f}x "
|
||||
f"exceeds {_HNSW_LINK_TO_DATA_MAX_RATIO:.1f}x"
|
||||
)
|
||||
else:
|
||||
reason = (
|
||||
f"sqlite {sqlite_mtime - hnsw_mtime:.0f}s newer than HNSW "
|
||||
"and integrity check failed"
|
||||
)
|
||||
|
||||
try:
|
||||
os.rename(seg_dir, target)
|
||||
moved.append(target)
|
||||
logger.warning(
|
||||
"Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
|
||||
"Quarantined corrupt HNSW segment %s (%s); renamed to %s",
|
||||
seg_dir,
|
||||
sqlite_mtime - hnsw_mtime,
|
||||
reason,
|
||||
target,
|
||||
)
|
||||
except OSError:
|
||||
logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
|
||||
|
||||
return moved
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from mempalace.backends.chroma import (
|
||||
_HNSW_LINK_TO_DATA_MAX_RATIO,
|
||||
_hnsw_link_to_data_ratio,
|
||||
_segment_appears_healthy,
|
||||
quarantine_stale_hnsw,
|
||||
)
|
||||
|
||||
|
||||
def _write_segment(
|
||||
seg_dir: Path,
|
||||
*,
|
||||
data_size: int = 100,
|
||||
link_size: int = 100,
|
||||
write_metadata: bool = True,
|
||||
) -> None:
|
||||
seg_dir.mkdir(parents=True, exist_ok=True)
|
||||
(seg_dir / "data_level0.bin").write_bytes(b"\0" * data_size)
|
||||
(seg_dir / "link_lists.bin").write_bytes(b"\0" * link_size)
|
||||
|
||||
if write_metadata:
|
||||
# Enough bytes to pass the existing pickle envelope sniff-test:
|
||||
# starts with pickle protocol marker 0x80 and ends with STOP 0x2e.
|
||||
(seg_dir / "index_metadata.pickle").write_bytes(b"\x80" + b"x" * 16 + b"\x2e")
|
||||
|
||||
|
||||
def test_hnsw_link_to_data_ratio_reports_payload_size_ratio(tmp_path):
|
||||
seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
|
||||
_write_segment(seg_dir, data_size=100, link_size=250)
|
||||
|
||||
assert _hnsw_link_to_data_ratio(str(seg_dir)) == 2.5
|
||||
|
||||
|
||||
def test_segment_health_rejects_exploded_link_lists_even_with_valid_pickle(tmp_path):
|
||||
seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
|
||||
_write_segment(
|
||||
seg_dir,
|
||||
data_size=100,
|
||||
link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
|
||||
write_metadata=True,
|
||||
)
|
||||
|
||||
assert not _segment_appears_healthy(str(seg_dir))
|
||||
|
||||
|
||||
def test_segment_health_keeps_reasonable_payload_with_valid_pickle(tmp_path):
|
||||
seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
|
||||
_write_segment(
|
||||
seg_dir,
|
||||
data_size=100,
|
||||
link_size=int(100 * _HNSW_LINK_TO_DATA_MAX_RATIO),
|
||||
write_metadata=True,
|
||||
)
|
||||
|
||||
assert _segment_appears_healthy(str(seg_dir))
|
||||
|
||||
|
||||
def test_quarantine_catches_link_bloat_without_mtime_drift(tmp_path):
|
||||
palace = tmp_path / "palace"
|
||||
palace.mkdir()
|
||||
|
||||
db_path = palace / "chroma.sqlite3"
|
||||
db_path.write_text("sqlite placeholder")
|
||||
|
||||
seg_dir = palace / "11111111-2222-3333-4444-555555555555"
|
||||
_write_segment(
|
||||
seg_dir,
|
||||
data_size=100,
|
||||
link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
|
||||
write_metadata=True,
|
||||
)
|
||||
|
||||
# Make sqlite and HNSW mtimes identical. The old mtime-only gate would
|
||||
# skip this segment even though the payload is structurally corrupt.
|
||||
same_time = 1_700_000_000
|
||||
os.utime(db_path, (same_time, same_time))
|
||||
os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
|
||||
|
||||
moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
|
||||
|
||||
assert len(moved) == 1
|
||||
assert not seg_dir.exists()
|
||||
|
||||
moved_path = Path(moved[0])
|
||||
assert moved_path.exists()
|
||||
assert moved_path.name.startswith("11111111-2222-3333-4444-555555555555.drift-")
|
||||
|
||||
|
||||
def test_quarantine_leaves_reasonable_payload_in_place(tmp_path):
|
||||
palace = tmp_path / "palace"
|
||||
palace.mkdir()
|
||||
|
||||
db_path = palace / "chroma.sqlite3"
|
||||
db_path.write_text("sqlite placeholder")
|
||||
|
||||
seg_dir = palace / "11111111-2222-3333-4444-555555555555"
|
||||
_write_segment(
|
||||
seg_dir,
|
||||
data_size=100,
|
||||
link_size=100,
|
||||
write_metadata=True,
|
||||
)
|
||||
|
||||
same_time = 1_700_000_000
|
||||
os.utime(db_path, (same_time, same_time))
|
||||
os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
|
||||
|
||||
moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
|
||||
|
||||
assert moved == []
|
||||
assert seg_dir.exists()
|
||||
Reference in New Issue
Block a user