merge: develop into fix/1362-repair-sqlite-integrity-preflight (round 2)
#1357 (max_seq_id preflight) merged into develop while this branch was in CI, opening a fresh conflict between the two preflight helpers. mempalace/repair.py: - Kept both: this branch's sqlite_integrity_errors() / print_sqlite_integrity_abort() AND develop's maybe_repair_poisoned_max_seq_id_before_rebuild() from #1357. They check for distinct corruption classes and run as separate preflights. tests/test_repair.py: - Kept both this branch's sqlite_integrity_errors test group and develop's max_seq_id preflight test group; non-overlapping coverage. Local: 1623 tests pass, ruff lint+format clean against 0.4.x CI pin.
This commit is contained in:
@@ -102,6 +102,13 @@ _HNSW_BLOAT_GUARD = {
|
||||
"hnsw:sync_threshold": 50_000,
|
||||
}
|
||||
|
||||
# Missing index_metadata.pickle is normal only while a segment is still fresh
|
||||
# or effectively empty. Once data_level0.bin has non-trivial payload, a
|
||||
# missing metadata pickle means the segment was interrupted after writing HNSW
|
||||
# data but before writing its metadata. Letting Chroma open that shape can
|
||||
# segfault or hang in native HNSW code.
|
||||
_HNSW_MISSING_METADATA_DATA_FLOOR = 1024
|
||||
|
||||
|
||||
def _validate_where(where: Optional[dict]) -> None:
|
||||
"""Scan a where-clause for unknown operators and raise ``UnsupportedFilterError``.
|
||||
@@ -132,16 +139,13 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
|
||||
parsing it. ChromaDB writes that file after a successful HNSW flush;
|
||||
a complete write starts with byte ``0x80`` and ends with byte
|
||||
``0x2e`` (the protocol/terminator byte sequence chromadb serializes
|
||||
with). If both bytes are present and the file is non-trivially sized,
|
||||
chromadb will load the segment cleanly even when its on-disk mtime
|
||||
trails ``chroma.sqlite3`` — which is the *steady state* under
|
||||
chromadb 1.5.x's async batched flush, not corruption.
|
||||
with).
|
||||
|
||||
A missing metadata file is treated as "fresh / never-flushed" and
|
||||
considered healthy. Renaming an empty dir orphans nothing, and a
|
||||
real corruption case manifests as a present-but-malformed file or a
|
||||
chromadb load error caught downstream by palace-daemon's
|
||||
``_auto_repair`` retry path.
|
||||
Missing metadata is healthy only while the segment still looks fresh or
|
||||
empty. If ``data_level0.bin`` already has non-trivial payload but
|
||||
``index_metadata.pickle`` is missing, the segment is partially flushed:
|
||||
Chroma wrote vector data without the metadata it needs to reopen the
|
||||
HNSW reader safely.
|
||||
|
||||
Deliberately format-sniffs only; never deserializes. Deserialization
|
||||
can execute arbitrary code, and the byte-sniff is sufficient to
|
||||
@@ -152,16 +156,26 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
|
||||
chromadb writes today; if a future chromadb version emits protocol
|
||||
0/1 segments, this check would start returning False on healthy
|
||||
files and quarantine_stale_hnsw would conservatively rename them
|
||||
out of the way (lazy rebuild on next open recovers).
|
||||
out of the way.
|
||||
"""
|
||||
if not _hnsw_payload_appears_sane(seg_dir):
|
||||
return False
|
||||
|
||||
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
|
||||
if not os.path.isfile(meta_path):
|
||||
# No metadata file yet — segment hasn't flushed (fresh / empty).
|
||||
# Renaming would orphan nothing; consider healthy.
|
||||
data_path = os.path.join(seg_dir, "data_level0.bin")
|
||||
try:
|
||||
if (
|
||||
os.path.isfile(data_path)
|
||||
and os.path.getsize(data_path) > _HNSW_MISSING_METADATA_DATA_FLOOR
|
||||
):
|
||||
return False
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
# No metadata and no meaningful vector payload yet: fresh/empty segment.
|
||||
return True
|
||||
|
||||
try:
|
||||
size = os.path.getsize(meta_path)
|
||||
# A real chromadb metadata file is at least tens of bytes; a
|
||||
|
||||
+12
-2
@@ -661,6 +661,7 @@ def cmd_repair(args):
|
||||
_extract_drawers,
|
||||
_rebuild_collection_via_temp,
|
||||
check_extraction_safety,
|
||||
maybe_repair_poisoned_max_seq_id_before_rebuild,
|
||||
)
|
||||
|
||||
config = MempalaceConfig()
|
||||
@@ -739,11 +740,20 @@ def cmd_repair(args):
|
||||
print(f"\n No palace found at {palace_path}")
|
||||
return
|
||||
if not contains_palace_database(palace_path):
|
||||
print(f"\n No palace database found at {db_path}")
|
||||
print(f"\n No palace database found at {db_path}")
|
||||
return
|
||||
|
||||
preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
|
||||
palace_path,
|
||||
backup=getattr(args, "backup", True),
|
||||
dry_run=getattr(args, "dry_run", False),
|
||||
assume_yes=getattr(args, "yes", False),
|
||||
)
|
||||
if preflight is not None:
|
||||
return
|
||||
|
||||
print(f"\n{'=' * 55}")
|
||||
print(" MemPalace Repair")
|
||||
print(" MemPalace Repair")
|
||||
print(f"{'=' * 55}\n")
|
||||
print(f" Palace: {palace_path}")
|
||||
|
||||
|
||||
+60
-1
@@ -551,6 +551,58 @@ def print_sqlite_integrity_abort(palace_path: str, errors: list[str]) -> None:
|
||||
print(" 6. Re-run `mempalace repair --yes`.")
|
||||
|
||||
|
||||
def maybe_repair_poisoned_max_seq_id_before_rebuild(
    palace_path: str,
    *,
    backup: bool = True,
    dry_run: bool = False,
    assume_yes: bool = False,
) -> "dict | None":
    """Run non-destructive max_seq_id repair before a rebuild if needed.

    A poisoned ``max_seq_id`` row can make Chroma believe it has already
    consumed every row in ``embeddings_queue``. Writes then report success
    because they land in the queue, but they never become visible in
    ``embeddings``.

    If this precise corruption is present, do the narrow bookmark repair and
    stop instead of continuing into the legacy rebuild path. The rebuild path
    extracts only already-visible embeddings and can discard queued writes.

    Args:
        palace_path: Directory expected to contain ``chroma.sqlite3``.
        backup: Forwarded unchanged to ``repair_max_seq_id``.
        dry_run: Forwarded unchanged to ``repair_max_seq_id``.
        assume_yes: Forwarded unchanged to ``repair_max_seq_id``.

    Returns:
        ``None`` when the preflight did not apply: the database file is
        missing, detection raised, or no poisoned rows were found. Otherwise
        whatever ``repair_max_seq_id`` returns. Callers treat a non-``None``
        result as "repair handled here; skip the rebuild".
    """
    db_path = os.path.join(palace_path, "chroma.sqlite3")
    if not os.path.isfile(db_path):
        return None

    try:
        poisoned = _detect_poisoned_max_seq_ids(db_path)
    except Exception:
        # Best-effort preflight: a detection failure must not block the
        # caller's own repair flow, but it should not vanish silently either,
        # because the caller will now proceed into the rebuild path.
        import logging

        logging.getLogger(__name__).warning(
            "max_seq_id preflight could not inspect %s; continuing without it",
            db_path,
            exc_info=True,
        )
        return None

    if not poisoned:
        return None

    print("\n Detected poisoned max_seq_id rows before repair rebuild.")
    print(
        " This can make writes report success while embeddings_queue grows "
        "and embeddings stay static."
    )
    print(
        " Running the non-destructive max_seq_id repair instead of rebuilding "
        "the collection."
    )
    print(
        " Queued writes remain in chroma.sqlite3 for Chroma to drain after "
        "the bookmark is unpoisoned."
    )

    return repair_max_seq_id(
        palace_path,
        backup=backup,
        dry_run=dry_run,
        assume_yes=assume_yes,
    )
|
||||
|
||||
|
||||
def rebuild_index(
|
||||
palace_path=None,
|
||||
confirm_truncation_ok: bool = False,
|
||||
@@ -579,7 +631,14 @@ def rebuild_index(
|
||||
print(f"\n{'=' * 55}")
|
||||
print(" MemPalace Repair — Index Rebuild")
|
||||
print(f"{'=' * 55}\n")
|
||||
print(f" Palace: {palace_path}")
|
||||
print(f" Palace: {palace_path}")
|
||||
|
||||
preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
|
||||
palace_path,
|
||||
assume_yes=True,
|
||||
)
|
||||
if preflight is not None:
|
||||
return
|
||||
|
||||
backend = ChromaBackend()
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user