fix(repair): run SQLite integrity preflight before chromadb open
#1364 added the SQLite quick_check preflight to rebuild_index, but placed it AFTER backend.get_collection(...). On a SQLite-corrupt palace, chromadb's rust binding raises pyo3_runtime.PanicException — which is not a regular Exception subclass — so it propagates past the existing `except Exception` handlers and the user sees a 30-line stack trace instead of the friendly abort message #1364 was designed to deliver. Reproduced with `mempalace repair --yes` against a palace whose chroma.sqlite3 has 4 mangled pages: pre-fix, panic; post-fix, the clean abort message and exit code 1. Two changes: - mempalace/cli.py cmd_repair: run sqlite_integrity_errors() right after the basic palace-existence check, BEFORE the max_seq_id preflight (which itself opens sqlite3) and BEFORE backend = ChromaBackend(). Exit non-zero so unattended scripts and CI gates see the failure. - mempalace/repair.py rebuild_index: same move at the function level for direct callers (tests, MCP) that bypass cmd_repair. The new test test_rebuild_index_runs_sqlite_preflight_before_chromadb_open uses a real chromadb-built palace (no ChromaBackend mock) plus a real corrupt SQLite (16 KB of mangled pages) so the ordering is exercised end-to-end. The previously-shipping test for the abort path mocked both the backend and sqlite_integrity_errors, which is why the ordering bug shipped CI-green. Six existing test_cli.py cmd_repair tests used `(palace_dir / "chroma.sqlite3").write_text("db")` to fake the SQLite file. The new preflight correctly fails quick_check on those 2-byte stubs, so the tests now create empty real SQLite DBs the same way the test_repair.py fixtures already do.
This commit is contained in:
@@ -1153,6 +1153,57 @@ def test_rebuild_index_aborts_on_sqlite_integrity_errors_before_delete_collectio
|
||||
mock_shutil.copy2.assert_not_called()
|
||||
|
||||
|
||||
def test_rebuild_index_runs_sqlite_preflight_before_chromadb_open(tmp_path, capsys):
|
||||
"""The SQLite integrity preflight must run BEFORE backend.get_collection.
|
||||
|
||||
chromadb's rust binding raises pyo3_runtime.PanicException (which is not
|
||||
a regular Exception subclass) on a malformed page, so any get_collection
|
||||
call against a corrupt SQLite propagates past `except Exception` handlers
|
||||
and produces a 30-line stack trace instead of the friendly abort message.
|
||||
Regression test for the ordering bug where the preflight was placed after
|
||||
the chromadb client open and therefore never reached on the cases it was
|
||||
designed to catch (#1364 follow-up).
|
||||
"""
|
||||
palace = tmp_path / "palace"
|
||||
palace.mkdir()
|
||||
|
||||
# Build a real chromadb palace with one drawer so chroma.sqlite3 exists
|
||||
# at full schema size, then mangle several middle pages so PRAGMA
|
||||
# quick_check fails with "disk image is malformed". This matches the
|
||||
# production failure mode users hit in #1362 / #1364.
|
||||
from mempalace.backends.chroma import ChromaBackend
|
||||
|
||||
backend = ChromaBackend()
|
||||
try:
|
||||
col = backend.create_collection(str(palace), "mempalace_drawers")
|
||||
col.upsert(
|
||||
ids=["d1"],
|
||||
documents=["doc"],
|
||||
metadatas=[{"wing": "w", "room": "r"}],
|
||||
)
|
||||
finally:
|
||||
backend.close()
|
||||
|
||||
sqlite_path = palace / "chroma.sqlite3"
|
||||
pre_size = sqlite_path.stat().st_size
|
||||
assert pre_size > 16384, "need a multi-page sqlite db to mangle"
|
||||
|
||||
with open(sqlite_path, "r+b") as f:
|
||||
f.seek(40960) # page 10
|
||||
f.write(b"\xde\xad\xbe\xef" * 4096) # 16 KB of garbage
|
||||
|
||||
# No chromadb mocks: rebuild_index must reach sqlite_integrity_errors
|
||||
# before any code path that opens a chromadb client. If the preflight
|
||||
# comes too late, the test fails with pyo3_runtime.PanicException
|
||||
# instead of returning cleanly.
|
||||
repair.rebuild_index(palace_path=str(palace))
|
||||
|
||||
out = capsys.readouterr().out
|
||||
assert "SQLite-layer corruption detected before repair rebuild" in out
|
||||
assert "PRAGMA quick_check" in out
|
||||
assert "disk image is malformed" in out
|
||||
|
||||
|
||||
def test_max_seq_id_preflight_preserves_embeddings_queue(tmp_path):
|
||||
"""#1295: default repair preflight must not drop queued writes."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user