Merge pull request #1342 from fatkobra/fix/1274-missing-hnsw-metadata-gate

fix(storage): quarantine partial HNSW flush without metadata (#1274)
This commit is contained in:
Igor Lins e Silva
2026-05-07 10:42:20 -03:00
committed by GitHub
2 changed files with 77 additions and 15 deletions
+26 -12
View File
@@ -102,6 +102,13 @@ _HNSW_BLOAT_GUARD = {
"hnsw:sync_threshold": 50_000, "hnsw:sync_threshold": 50_000,
} }
# Missing index_metadata.pickle is normal only while a segment is still fresh
# or effectively empty. Once data_level0.bin has non-trivial payload, a
# missing metadata pickle means the segment was interrupted after writing HNSW
# data but before writing its metadata. Letting Chroma open that shape can
# segfault or hang in native HNSW code.
_HNSW_MISSING_METADATA_DATA_FLOOR = 1024
def _validate_where(where: Optional[dict]) -> None: def _validate_where(where: Optional[dict]) -> None:
"""Scan a where-clause for unknown operators and raise ``UnsupportedFilterError``. """Scan a where-clause for unknown operators and raise ``UnsupportedFilterError``.
@@ -132,16 +139,13 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
parsing it. ChromaDB writes that file after a successful HNSW flush; parsing it. ChromaDB writes that file after a successful HNSW flush;
a complete write starts with byte ``0x80`` and ends with byte a complete write starts with byte ``0x80`` and ends with byte
``0x2e`` (the protocol/terminator byte sequence chromadb serializes ``0x2e`` (the protocol/terminator byte sequence chromadb serializes
with). If both bytes are present and the file is non-trivially sized, with).
chromadb will load the segment cleanly even when its on-disk mtime
trails ``chroma.sqlite3`` — which is the *steady state* under
chromadb 1.5.x's async batched flush, not corruption.
A missing metadata file is treated as "fresh / never-flushed" and Missing metadata is healthy only while the segment still looks fresh or
considered healthy. Renaming an empty dir orphans nothing, and a empty. If ``data_level0.bin`` already has non-trivial payload but
real corruption case manifests as a present-but-malformed file or a ``index_metadata.pickle`` is missing, the segment is partially flushed:
chromadb load error caught downstream by palace-daemon's Chroma wrote vector data without the metadata it needs to reopen the
``_auto_repair`` retry path. HNSW reader safely.
Deliberately format-sniffs only; never deserializes. Deserialization Deliberately format-sniffs only; never deserializes. Deserialization
can execute arbitrary code, and the byte-sniff is sufficient to can execute arbitrary code, and the byte-sniff is sufficient to
@@ -152,16 +156,26 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
chromadb writes today; if a future chromadb version emits protocol chromadb writes today; if a future chromadb version emits protocol
0/1 segments, this check would start returning False on healthy 0/1 segments, this check would start returning False on healthy
files and quarantine_stale_hnsw would conservatively rename them files and quarantine_stale_hnsw would conservatively rename them
out of the way (lazy rebuild on next open recovers). out of the way.
""" """
if not _hnsw_payload_appears_sane(seg_dir): if not _hnsw_payload_appears_sane(seg_dir):
return False return False
meta_path = os.path.join(seg_dir, "index_metadata.pickle") meta_path = os.path.join(seg_dir, "index_metadata.pickle")
if not os.path.isfile(meta_path): if not os.path.isfile(meta_path):
# No metadata file yet — segment hasn't flushed (fresh / empty). data_path = os.path.join(seg_dir, "data_level0.bin")
# Renaming would orphan nothing; consider healthy. try:
if (
os.path.isfile(data_path)
and os.path.getsize(data_path) > _HNSW_MISSING_METADATA_DATA_FLOOR
):
return False
except OSError:
return False
# No metadata and no meaningful vector payload yet: fresh/empty segment.
return True return True
try: try:
size = os.path.getsize(meta_path) size = os.path.getsize(meta_path)
# A real chromadb metadata file is at least tens of bytes; a # A real chromadb metadata file is at least tens of bytes; a
+51 -3
View File
@@ -18,8 +18,10 @@ from mempalace.backends import (
from mempalace.backends.chroma import ( from mempalace.backends.chroma import (
ChromaBackend, ChromaBackend,
ChromaCollection, ChromaCollection,
_HNSW_MISSING_METADATA_DATA_FLOOR,
_fix_blob_seq_ids, _fix_blob_seq_ids,
_pin_hnsw_threads, _pin_hnsw_threads,
_segment_appears_healthy,
quarantine_invalid_hnsw_metadata, quarantine_invalid_hnsw_metadata,
quarantine_stale_hnsw, quarantine_stale_hnsw,
) )
@@ -685,9 +687,9 @@ def test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone(tmp_path)
assert seg.exists() assert seg.exists()
def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path): def test_quarantine_stale_hnsw_leaves_empty_segment_without_metadata_alone(tmp_path):
"""Segment with no metadata file is treated as fresh / never-flushed """Missing metadata is okay only when the segment has no meaningful data yet."""
and not quarantined — renaming an empty dir orphans nothing."""
now = 1_700_000_000.0 now = 1_700_000_000.0
palace, seg = _make_palace_with_segment( palace, seg = _make_palace_with_segment(
tmp_path, tmp_path,
@@ -695,11 +697,57 @@ def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path):
sqlite_mtime=now, sqlite_mtime=now,
meta_bytes=None, meta_bytes=None,
) )
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
assert moved == [] assert moved == []
assert seg.exists() assert seg.exists()
def test_segment_without_metadata_but_with_nontrivial_data_is_unhealthy(tmp_path):
"""Data without index_metadata.pickle is a partial flush, not a fresh segment."""
seg = tmp_path / "abcd-1234-5678"
seg.mkdir()
(seg / "data_level0.bin").write_bytes(b"\0" * (_HNSW_MISSING_METADATA_DATA_FLOOR + 1))
assert not _segment_appears_healthy(str(seg))
def test_segment_without_metadata_and_tiny_data_is_still_treated_as_fresh(tmp_path):
"""Tiny data payloads can occur before metadata has flushed; leave them alone."""
seg = tmp_path / "abcd-1234-5678"
seg.mkdir()
(seg / "data_level0.bin").write_bytes(b"\0" * _HNSW_MISSING_METADATA_DATA_FLOOR)
assert _segment_appears_healthy(str(seg))
def test_quarantine_stale_hnsw_renames_missing_metadata_with_nontrivial_data(tmp_path):
"""Regression for #1274: missing pickle + non-trivial data must quarantine."""
now = 1_700_000_000.0
palace, seg = _make_palace_with_segment(
tmp_path,
hnsw_mtime=now - 7200,
sqlite_mtime=now,
meta_bytes=None,
)
(seg / "data_level0.bin").write_bytes(b"\0" * (_HNSW_MISSING_METADATA_DATA_FLOOR + 1))
os.utime(seg / "data_level0.bin", (now - 7200, now - 7200))
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
assert len(moved) == 1
assert ".drift-" in moved[0]
assert not seg.exists()
drift_dirs = [p for p in palace.iterdir() if ".drift-" in p.name]
assert len(drift_dirs) == 1
assert (drift_dirs[0] / "data_level0.bin").exists()
def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path): def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path):
"""Segment with a truncated (under-floor-size) metadata file is """Segment with a truncated (under-floor-size) metadata file is
quarantined — shape of a partial-flush during process kill.""" quarantined — shape of a partial-flush during process kill."""