Merge pull request #1403 from MemPalace/fix/sqlite-preflight-order
fix(repair): run SQLite integrity preflight before chromadb open (follow-up to #1364)
This commit is contained in:
@@ -662,6 +662,8 @@ def cmd_repair(args):
|
||||
_rebuild_collection_via_temp,
|
||||
check_extraction_safety,
|
||||
maybe_repair_poisoned_max_seq_id_before_rebuild,
|
||||
print_sqlite_integrity_abort,
|
||||
sqlite_integrity_errors,
|
||||
)
|
||||
|
||||
config = MempalaceConfig()
|
||||
@@ -743,6 +745,18 @@ def cmd_repair(args):
|
||||
print(f"\n No palace database found at {db_path}")
|
||||
return
|
||||
|
||||
# Run the SQLite integrity preflight before any chromadb client open.
|
||||
# ChromaDB's rust binding raises pyo3_runtime.PanicException on a
|
||||
# malformed page, which is not a regular Exception subclass and
|
||||
# propagates past the try/except below — the user gets a 30-line
|
||||
# stack trace instead of the friendly abort message. Run quick_check
|
||||
# here so we can surface the clear recovery instructions and exit
|
||||
# cleanly before chromadb's compactor touches the disk.
|
||||
sqlite_errors = sqlite_integrity_errors(palace_path)
|
||||
if sqlite_errors:
|
||||
print_sqlite_integrity_abort(palace_path, sqlite_errors)
|
||||
sys.exit(1)
|
||||
|
||||
preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
|
||||
palace_path,
|
||||
backup=getattr(args, "backup", True),
|
||||
|
||||
+11
-5
@@ -633,6 +633,17 @@ def rebuild_index(
|
||||
print(f"{'=' * 55}\n")
|
||||
print(f" Palace: {palace_path}")
|
||||
|
||||
# Run the SQLite integrity preflight before any chromadb client open.
|
||||
# ChromaDB's rust binding raises pyo3_runtime.PanicException (which is
|
||||
# not a regular Exception subclass) on a malformed page, propagating
|
||||
# past the try/except around get_collection below. Catching the
|
||||
# corruption here lets us surface the clear recovery instructions and
|
||||
# exit cleanly before chromadb's compactor touches the disk.
|
||||
sqlite_errors = sqlite_integrity_errors(palace_path)
|
||||
if sqlite_errors:
|
||||
print_sqlite_integrity_abort(palace_path, sqlite_errors)
|
||||
return
|
||||
|
||||
preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
|
||||
palace_path,
|
||||
assume_yes=True,
|
||||
@@ -676,11 +687,6 @@ def rebuild_index(
|
||||
print(e.message)
|
||||
return
|
||||
|
||||
sqlite_errors = sqlite_integrity_errors(palace_path)
|
||||
if sqlite_errors:
|
||||
print_sqlite_integrity_abort(palace_path, sqlite_errors)
|
||||
return
|
||||
|
||||
# Back up ONLY the SQLite database, not the bloated HNSW files
|
||||
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||
backup_path = sqlite_path + ".backup"
|
||||
|
||||
+7
-6
@@ -2,6 +2,7 @@
|
||||
|
||||
import argparse
|
||||
import shlex
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, call, patch
|
||||
@@ -774,7 +775,7 @@ def test_cmd_repair_requires_palace_database(mock_config_cls, tmp_path, capsys):
|
||||
def test_cmd_repair_error_reading(mock_config_cls, tmp_path, capsys):
|
||||
palace_dir = tmp_path / "palace"
|
||||
palace_dir.mkdir()
|
||||
(palace_dir / "chroma.sqlite3").write_text("db")
|
||||
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
|
||||
mock_config_cls.return_value.palace_path = str(palace_dir)
|
||||
mock_config_cls.return_value.collection_name = "mempalace_drawers"
|
||||
args = argparse.Namespace(palace=None)
|
||||
@@ -790,7 +791,7 @@ def test_cmd_repair_error_reading(mock_config_cls, tmp_path, capsys):
|
||||
def test_cmd_repair_zero_drawers(mock_config_cls, tmp_path, capsys):
|
||||
palace_dir = tmp_path / "palace"
|
||||
palace_dir.mkdir()
|
||||
(palace_dir / "chroma.sqlite3").write_text("db")
|
||||
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
|
||||
mock_config_cls.return_value.palace_path = str(palace_dir)
|
||||
mock_config_cls.return_value.collection_name = "mempalace_drawers"
|
||||
args = argparse.Namespace(palace=None)
|
||||
@@ -807,7 +808,7 @@ def test_cmd_repair_zero_drawers(mock_config_cls, tmp_path, capsys):
|
||||
def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
|
||||
palace_dir = tmp_path / "palace"
|
||||
palace_dir.mkdir()
|
||||
(palace_dir / "chroma.sqlite3").write_text("db")
|
||||
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
|
||||
mock_config_cls.return_value.palace_path = str(palace_dir)
|
||||
mock_config_cls.return_value.collection_name = "mempalace_drawers"
|
||||
args = argparse.Namespace(palace=None, yes=True)
|
||||
@@ -843,7 +844,7 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
|
||||
def test_cmd_repair_uses_configured_collection(mock_config_cls, tmp_path, capsys):
|
||||
palace_dir = tmp_path / "palace"
|
||||
palace_dir.mkdir()
|
||||
(palace_dir / "chroma.sqlite3").write_text("db")
|
||||
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
|
||||
mock_config_cls.return_value.palace_path = str(palace_dir)
|
||||
mock_config_cls.return_value.collection_name = "custom_drawers"
|
||||
args = argparse.Namespace(palace=None, yes=True)
|
||||
@@ -882,7 +883,7 @@ def test_cmd_repair_uses_configured_collection(mock_config_cls, tmp_path, capsys
|
||||
def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp_path, capsys):
|
||||
palace_dir = tmp_path / "palace"
|
||||
palace_dir.mkdir()
|
||||
(palace_dir / "chroma.sqlite3").write_text("db")
|
||||
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
|
||||
mock_config_cls.return_value.palace_path = str(palace_dir)
|
||||
mock_config_cls.return_value.collection_name = "mempalace_drawers"
|
||||
args = argparse.Namespace(palace=None, yes=True)
|
||||
@@ -916,7 +917,7 @@ def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp
|
||||
def test_cmd_repair_aborts_without_confirmation(mock_config_cls, tmp_path, capsys):
|
||||
palace_dir = tmp_path / "palace"
|
||||
palace_dir.mkdir()
|
||||
(palace_dir / "chroma.sqlite3").write_text("db")
|
||||
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
|
||||
mock_config_cls.return_value.palace_path = str(palace_dir)
|
||||
mock_config_cls.return_value.collection_name = "mempalace_drawers"
|
||||
args = argparse.Namespace(palace=None)
|
||||
|
||||
@@ -1153,6 +1153,77 @@ def test_rebuild_index_aborts_on_sqlite_integrity_errors_before_delete_collectio
|
||||
mock_shutil.copy2.assert_not_called()
|
||||
|
||||
|
||||
def test_rebuild_index_runs_sqlite_preflight_before_chromadb_open(tmp_path, capsys):
|
||||
"""The SQLite integrity preflight must run BEFORE backend.get_collection.
|
||||
|
||||
chromadb's rust binding raises pyo3_runtime.PanicException (which is not
|
||||
a regular Exception subclass) on a malformed page, so any get_collection
|
||||
call against a corrupt SQLite propagates past `except Exception` handlers
|
||||
and produces a 30-line stack trace instead of the friendly abort message.
|
||||
Regression test for the ordering bug where the preflight was placed after
|
||||
the chromadb client open and therefore never reached on the cases it was
|
||||
designed to catch (#1364 follow-up).
|
||||
"""
|
||||
palace = tmp_path / "palace"
|
||||
palace.mkdir()
|
||||
|
||||
# Build a real chromadb palace with one drawer so chroma.sqlite3 exists
|
||||
# at full schema size, then mangle several middle pages so PRAGMA
|
||||
# quick_check fails with "disk image is malformed". This matches the
|
||||
# production failure mode users hit in #1362 / #1364.
|
||||
from mempalace.backends.chroma import ChromaBackend
|
||||
|
||||
backend = ChromaBackend()
|
||||
try:
|
||||
col = backend.create_collection(str(palace), "mempalace_drawers")
|
||||
col.upsert(
|
||||
ids=["d1"],
|
||||
documents=["doc"],
|
||||
metadatas=[{"wing": "w", "room": "r"}],
|
||||
)
|
||||
finally:
|
||||
backend.close()
|
||||
|
||||
sqlite_path = palace / "chroma.sqlite3"
|
||||
pre_size = sqlite_path.stat().st_size
|
||||
|
||||
# Compute a page-aligned corruption offset that's always inside the
|
||||
# existing file. SQLite uses 4 KB pages by default; we mangle 4 pages
|
||||
# somewhere in the middle, skipping at least the first 2 pages
|
||||
# (header + root) so the file still opens. Without clamping to the
|
||||
# actual file size, a seek past EOF on r+b mode would silently
|
||||
# extend the file with zero-padding and leave the original pages
|
||||
# intact — quick_check would still pass, and the regression guard
|
||||
# would skip the bug.
|
||||
PAGE = 4096
|
||||
CORRUPT_BYTES = 16384 # 4 pages
|
||||
HEADER_GUARD = PAGE * 2 # leave header + root pages intact
|
||||
assert (
|
||||
pre_size >= HEADER_GUARD + CORRUPT_BYTES
|
||||
), f"sqlite db too small to mangle without truncating: {pre_size} bytes"
|
||||
# Round (pre_size - CORRUPT_BYTES) down to a page boundary so we
|
||||
# mangle whole pages. Cap at offset 40960 (page 10) for stable
|
||||
# diagnostics across SQLite versions that may grow the file.
|
||||
max_offset = (pre_size - CORRUPT_BYTES) & ~(PAGE - 1)
|
||||
corrupt_offset = min(40960, max_offset)
|
||||
assert corrupt_offset >= HEADER_GUARD, f"corruption offset {corrupt_offset} too close to header"
|
||||
|
||||
with open(sqlite_path, "r+b") as f:
|
||||
f.seek(corrupt_offset)
|
||||
f.write(b"\xde\xad\xbe\xef" * (CORRUPT_BYTES // 4))
|
||||
|
||||
# No chromadb mocks: rebuild_index must reach sqlite_integrity_errors
|
||||
# before any code path that opens a chromadb client. If the preflight
|
||||
# comes too late, the test fails with pyo3_runtime.PanicException
|
||||
# instead of returning cleanly.
|
||||
repair.rebuild_index(palace_path=str(palace))
|
||||
|
||||
out = capsys.readouterr().out
|
||||
assert "SQLite-layer corruption detected before repair rebuild" in out
|
||||
assert "PRAGMA quick_check" in out
|
||||
assert "disk image is malformed" in out
|
||||
|
||||
|
||||
def test_max_seq_id_preflight_preserves_embeddings_queue(tmp_path):
|
||||
"""#1295: default repair preflight must not drop queued writes."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user