Merge pull request #1339 from fatkobra/fix/1218-hnsw-link-payload-health
fix(storage): quarantine bloated HNSW link payloads (#1218)
This commit is contained in:
@@ -32,6 +32,51 @@ _REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$c
|
|||||||
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
|
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
|
||||||
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
|
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
|
||||||
|
|
||||||
|
# A healthy HNSW payload should keep link_lists.bin proportional to
|
||||||
|
# data_level0.bin. When link_lists.bin grows orders of magnitude larger than
|
||||||
|
# data_level0.bin, Chroma/HNSW can segfault while opening the segment even if
|
||||||
|
# index_metadata.pickle is structurally valid.
|
||||||
|
#
|
||||||
|
# The report in #1218 showed ratios above 300x, while healthy snapshots were far below 1x.
|
||||||
|
# Treat only >10x as corruption so normal flush lag or small segments do not get
|
||||||
|
# quarantined.
|
||||||
|
_HNSW_LINK_TO_DATA_MAX_RATIO = 10.0
|
||||||
|
|
||||||
|
|
||||||
|
def _hnsw_link_to_data_ratio(seg_dir: str) -> Optional[float]:
|
||||||
|
"""Return link_lists.bin / data_level0.bin size ratio for a segment.
|
||||||
|
|
||||||
|
``None`` means the ratio is not meaningful, usually because one file is
|
||||||
|
missing or data_level0.bin is empty. ``float("inf")`` means the files were
|
||||||
|
present but could not be statted safely, which should be treated as
|
||||||
|
suspicious by callers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
link_path = os.path.join(seg_dir, "link_lists.bin")
|
||||||
|
data_path = os.path.join(seg_dir, "data_level0.bin")
|
||||||
|
|
||||||
|
if not (os.path.isfile(link_path) and os.path.isfile(data_path)):
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_size = os.path.getsize(data_path)
|
||||||
|
link_size = os.path.getsize(link_path)
|
||||||
|
except OSError:
|
||||||
|
return float("inf")
|
||||||
|
|
||||||
|
if data_size <= 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return link_size / data_size
|
||||||
|
|
||||||
|
|
||||||
|
def _hnsw_payload_appears_sane(seg_dir: str) -> bool:
|
||||||
|
"""Return False when HNSW payload files are structurally implausible."""
|
||||||
|
|
||||||
|
ratio = _hnsw_link_to_data_ratio(seg_dir)
|
||||||
|
return ratio is None or ratio <= _HNSW_LINK_TO_DATA_MAX_RATIO
|
||||||
|
|
||||||
|
|
||||||
# HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
|
# HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
|
||||||
#
|
#
|
||||||
# With default params (batch_size=100, sync_threshold=1000, initial capacity
|
# With default params (batch_size=100, sync_threshold=1000, initial capacity
|
||||||
@@ -109,6 +154,9 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
|
|||||||
files and quarantine_stale_hnsw would conservatively rename them
|
files and quarantine_stale_hnsw would conservatively rename them
|
||||||
out of the way (lazy rebuild on next open recovers).
|
out of the way (lazy rebuild on next open recovers).
|
||||||
"""
|
"""
|
||||||
|
if not _hnsw_payload_appears_sane(seg_dir):
|
||||||
|
return False
|
||||||
|
|
||||||
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
|
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
|
||||||
if not os.path.isfile(meta_path):
|
if not os.path.isfile(meta_path):
|
||||||
# No metadata file yet — segment hasn't flushed (fresh / empty).
|
# No metadata file yet — segment hasn't flushed (fresh / empty).
|
||||||
@@ -130,64 +178,35 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
|
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
|
||||||
"""Rename HNSW segment dirs that are both stale-by-mtime AND fail an
|
"""Rename HNSW segment dirs that look unsafe to open.
|
||||||
integrity sniff-test.
|
|
||||||
|
|
||||||
Catches the segfault failure mode from #823 (semantic search stale
|
This catches two classes of HNSW corruption before ChromaDB opens the
|
||||||
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
|
native segment reader:
|
||||||
``count()`` with chromadb 1.5.5), and acknowledged as by-design at
|
|
||||||
chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
|
|
||||||
rebuild lazily on next open instead of segfaulting.
|
|
||||||
|
|
||||||
Two-stage check:
|
1. stale-by-mtime segments whose ``index_metadata.pickle`` fails the
|
||||||
|
existing format sniff-test;
|
||||||
|
2. structurally impossible HNSW payloads where ``link_lists.bin`` is much
|
||||||
|
larger than ``data_level0.bin``.
|
||||||
|
|
||||||
1. **mtime gate.** If ``chroma.sqlite3`` is less than
|
The second check is intentionally not gated by mtime. A segment with a
|
||||||
``stale_seconds`` newer than the segment's ``data_level0.bin``,
|
300x link/data ratio is unsafe regardless of whether its mtime is recent;
|
||||||
skip — chromadb is in normal write-path territory.
|
letting Chroma open it can SIGSEGV before Python fallback code runs.
|
||||||
|
|
||||||
2. **Integrity gate** (``_segment_appears_healthy``). Even when the
|
The original directory is renamed, not deleted, so recovery remains
|
||||||
mtime gap exceeds the threshold, a segment whose
|
possible if the heuristic ever misfires.
|
||||||
``index_metadata.pickle`` passes a format sniff-test is healthy:
|
|
||||||
chromadb 1.5.x flushes HNSW state asynchronously and a clean
|
|
||||||
shutdown does NOT force-flush, so the on-disk HNSW is *always*
|
|
||||||
somewhat older than ``chroma.sqlite3``. Production observation
|
|
||||||
(2026-04-26 disks daemon): three of three segments quarantined
|
|
||||||
on every cold start, with 538-557s gaps, leaving the 151K-drawer
|
|
||||||
palace with vector_ranked=0 until rebuild. Renaming a healthy
|
|
||||||
segment based on mtime alone destroys a valid index — chromadb
|
|
||||||
creates an empty replacement, orphaning every drawer in sqlite
|
|
||||||
from vector recall until the operator runs ``mempalace repair
|
|
||||||
--mode rebuild`` (15+ min on a 151K palace).
|
|
||||||
|
|
||||||
Only segments that pass stage 1 (suspiciously stale) AND fail stage
|
|
||||||
2 (metadata file truncated, zero-filled, or absent-with-data) are
|
|
||||||
renamed to ``<uuid>.drift-<timestamp>``. The original directory is
|
|
||||||
renamed, not deleted, so recovery remains possible if the heuristic
|
|
||||||
misfires.
|
|
||||||
|
|
||||||
The default threshold (5 min) is advisory under daemon-strict; the
|
|
||||||
integrity gate is what actually distinguishes corruption from flush
|
|
||||||
lag. The threshold still matters for the cross-machine replication
|
|
||||||
case (#823), where it bounds how stale a Syncthing-replicated
|
|
||||||
segment can be before we look harder at it.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
palace_path: path to the palace directory containing ``chroma.sqlite3``
|
|
||||||
stale_seconds: minimum mtime gap to *consider* a segment for quarantine
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of paths that were quarantined (empty if nothing actually
|
|
||||||
looked corrupt).
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||||
if not os.path.isfile(db_path):
|
if not os.path.isfile(db_path):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sqlite_mtime = os.path.getmtime(db_path)
|
sqlite_mtime = os.path.getmtime(db_path)
|
||||||
except OSError:
|
except OSError:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
moved: list[str] = []
|
moved: list[str] = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
entries = os.listdir(palace_path)
|
entries = os.listdir(palace_path)
|
||||||
except OSError:
|
except OSError:
|
||||||
@@ -196,29 +215,34 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
|
|||||||
for name in entries:
|
for name in entries:
|
||||||
if "-" not in name or name.startswith(".") or ".drift-" in name:
|
if "-" not in name or name.startswith(".") or ".drift-" in name:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
seg_dir = os.path.join(palace_path, name)
|
seg_dir = os.path.join(palace_path, name)
|
||||||
if not os.path.isdir(seg_dir):
|
if not os.path.isdir(seg_dir):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
|
hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
|
||||||
if not os.path.isfile(hnsw_bin):
|
if not os.path.isfile(hnsw_bin):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
hnsw_mtime = os.path.getmtime(hnsw_bin)
|
hnsw_mtime = os.path.getmtime(hnsw_bin)
|
||||||
except OSError:
|
except OSError:
|
||||||
continue
|
continue
|
||||||
if sqlite_mtime - hnsw_mtime < stale_seconds:
|
|
||||||
|
payload_ratio = _hnsw_link_to_data_ratio(seg_dir)
|
||||||
|
payload_corrupt = payload_ratio is not None and payload_ratio > _HNSW_LINK_TO_DATA_MAX_RATIO
|
||||||
|
|
||||||
|
if not payload_corrupt and sqlite_mtime - hnsw_mtime < stale_seconds:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Stage 2: integrity gate. mtime drift is necessary but not
|
# Stage 2: integrity gate. Mtime drift alone is not corruption because
|
||||||
# sufficient — chromadb's async flush makes drift the steady-
|
# Chroma flushes HNSW asynchronously. A healthy metadata file proves the
|
||||||
# state condition. A healthy segment metadata file proves
|
# ordinary stale-by-mtime case is just flush lag.
|
||||||
# chromadb can open the segment without segfault; don't
|
if not payload_corrupt and _segment_appears_healthy(seg_dir):
|
||||||
# quarantine a healthy index.
|
|
||||||
if _segment_appears_healthy(seg_dir):
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"HNSW mtime gap %.0fs on %s exceeds threshold but segment "
|
"HNSW mtime gap %.0fs on %s exceeds threshold but segment "
|
||||||
"metadata file is intact — flush-lag, not corruption. "
|
"metadata and payload size are intact — flush-lag, not "
|
||||||
"Leaving in place.",
|
"corruption. Leaving in place.",
|
||||||
sqlite_mtime - hnsw_mtime,
|
sqlite_mtime - hnsw_mtime,
|
||||||
seg_dir,
|
seg_dir,
|
||||||
)
|
)
|
||||||
@@ -226,17 +250,30 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
|
|||||||
|
|
||||||
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
|
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||||
target = f"{seg_dir}.drift-{stamp}"
|
target = f"{seg_dir}.drift-{stamp}"
|
||||||
|
|
||||||
|
if payload_corrupt:
|
||||||
|
reason = (
|
||||||
|
f"link_lists.bin/data_level0.bin ratio {payload_ratio:.1f}x "
|
||||||
|
f"exceeds {_HNSW_LINK_TO_DATA_MAX_RATIO:.1f}x"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
reason = (
|
||||||
|
f"sqlite {sqlite_mtime - hnsw_mtime:.0f}s newer than HNSW "
|
||||||
|
"and integrity check failed"
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
os.rename(seg_dir, target)
|
os.rename(seg_dir, target)
|
||||||
moved.append(target)
|
moved.append(target)
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
|
"Quarantined corrupt HNSW segment %s (%s); renamed to %s",
|
||||||
seg_dir,
|
seg_dir,
|
||||||
sqlite_mtime - hnsw_mtime,
|
reason,
|
||||||
target,
|
target,
|
||||||
)
|
)
|
||||||
except OSError:
|
except OSError:
|
||||||
logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
|
logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
|
||||||
|
|
||||||
return moved
|
return moved
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,113 @@
|
|||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from mempalace.backends.chroma import (
|
||||||
|
_HNSW_LINK_TO_DATA_MAX_RATIO,
|
||||||
|
_hnsw_link_to_data_ratio,
|
||||||
|
_segment_appears_healthy,
|
||||||
|
quarantine_stale_hnsw,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _write_segment(
|
||||||
|
seg_dir: Path,
|
||||||
|
*,
|
||||||
|
data_size: int = 100,
|
||||||
|
link_size: int = 100,
|
||||||
|
write_metadata: bool = True,
|
||||||
|
) -> None:
|
||||||
|
seg_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
(seg_dir / "data_level0.bin").write_bytes(b"\0" * data_size)
|
||||||
|
(seg_dir / "link_lists.bin").write_bytes(b"\0" * link_size)
|
||||||
|
|
||||||
|
if write_metadata:
|
||||||
|
# Enough bytes to pass the existing pickle envelope sniff-test:
|
||||||
|
# starts with pickle protocol marker 0x80 and ends with STOP 0x2e.
|
||||||
|
(seg_dir / "index_metadata.pickle").write_bytes(b"\x80" + b"x" * 16 + b"\x2e")
|
||||||
|
|
||||||
|
|
||||||
|
def test_hnsw_link_to_data_ratio_reports_payload_size_ratio(tmp_path):
|
||||||
|
seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
|
||||||
|
_write_segment(seg_dir, data_size=100, link_size=250)
|
||||||
|
|
||||||
|
assert _hnsw_link_to_data_ratio(str(seg_dir)) == 2.5
|
||||||
|
|
||||||
|
|
||||||
|
def test_segment_health_rejects_exploded_link_lists_even_with_valid_pickle(tmp_path):
|
||||||
|
seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
|
||||||
|
_write_segment(
|
||||||
|
seg_dir,
|
||||||
|
data_size=100,
|
||||||
|
link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
|
||||||
|
write_metadata=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert not _segment_appears_healthy(str(seg_dir))
|
||||||
|
|
||||||
|
|
||||||
|
def test_segment_health_keeps_reasonable_payload_with_valid_pickle(tmp_path):
|
||||||
|
seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
|
||||||
|
_write_segment(
|
||||||
|
seg_dir,
|
||||||
|
data_size=100,
|
||||||
|
link_size=int(100 * _HNSW_LINK_TO_DATA_MAX_RATIO),
|
||||||
|
write_metadata=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert _segment_appears_healthy(str(seg_dir))
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_catches_link_bloat_without_mtime_drift(tmp_path):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
|
||||||
|
db_path = palace / "chroma.sqlite3"
|
||||||
|
db_path.write_text("sqlite placeholder")
|
||||||
|
|
||||||
|
seg_dir = palace / "11111111-2222-3333-4444-555555555555"
|
||||||
|
_write_segment(
|
||||||
|
seg_dir,
|
||||||
|
data_size=100,
|
||||||
|
link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
|
||||||
|
write_metadata=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Make sqlite and HNSW mtimes identical. The old mtime-only gate would
|
||||||
|
# skip this segment even though the payload is structurally corrupt.
|
||||||
|
same_time = 1_700_000_000
|
||||||
|
os.utime(db_path, (same_time, same_time))
|
||||||
|
os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
|
||||||
|
|
||||||
|
moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
|
||||||
|
|
||||||
|
assert len(moved) == 1
|
||||||
|
assert not seg_dir.exists()
|
||||||
|
|
||||||
|
moved_path = Path(moved[0])
|
||||||
|
assert moved_path.exists()
|
||||||
|
assert moved_path.name.startswith("11111111-2222-3333-4444-555555555555.drift-")
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_leaves_reasonable_payload_in_place(tmp_path):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
|
||||||
|
db_path = palace / "chroma.sqlite3"
|
||||||
|
db_path.write_text("sqlite placeholder")
|
||||||
|
|
||||||
|
seg_dir = palace / "11111111-2222-3333-4444-555555555555"
|
||||||
|
_write_segment(
|
||||||
|
seg_dir,
|
||||||
|
data_size=100,
|
||||||
|
link_size=100,
|
||||||
|
write_metadata=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
same_time = 1_700_000_000
|
||||||
|
os.utime(db_path, (same_time, same_time))
|
||||||
|
os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
|
||||||
|
|
||||||
|
moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
|
||||||
|
|
||||||
|
assert moved == []
|
||||||
|
assert seg_dir.exists()
|
||||||
Reference in New Issue
Block a user