fix(repair): preflight SQLite integrity before rebuild

This commit is contained in:
fatkobra
2026-05-05 15:51:43 +00:00
parent 1888b671e2
commit 6b042982e8
2 changed files with 147 additions and 5 deletions
+70
View File
@@ -330,6 +330,71 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
return None return None
def sqlite_integrity_errors(palace_path: str) -> list[str]:
"""Return SQLite quick_check errors for chroma.sqlite3.
The repair rebuild path eventually calls Chroma's delete_collection().
If the SQLite layer has corrupt secondary indexes or FTS5 shadow pages,
Chroma can raise an opaque SQLITE_CORRUPT_INDEX / code 779 error before
repair reaches the HNSW rebuild.
Run a direct SQLite quick_check first so repair can fail with a clear,
actionable message before invoking Chroma's destructive collection-delete
path.
"""
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.exists(sqlite_path):
return []
try:
with sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True) as conn:
rows = conn.execute("PRAGMA quick_check").fetchall()
except sqlite3.Error as e:
return [f"PRAGMA quick_check failed: {e}"]
errors: list[str] = []
for row in rows:
if not row:
continue
message = str(row[0])
if message.lower() != "ok":
errors.append(message)
return errors
def print_sqlite_integrity_abort(palace_path: str, errors: list[str]) -> None:
"""Print a clear repair abort message for SQLite-layer corruption."""
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
preview = errors[:5]
print("\n ABORT: SQLite-layer corruption detected before repair rebuild.")
print(" `mempalace repair` will not call Chroma delete_collection() because")
print(" the SQLite database failed `PRAGMA quick_check`.")
print()
print(f" Database: {sqlite_path}")
print()
print(" quick_check output:")
for message in preview:
print(f" - {message}")
if len(errors) > len(preview):
print(f" ... and {len(errors) - len(preview)} more issue(s)")
print()
print(" This often means derived SQLite structures, such as secondary indexes")
print(" or FTS5 shadow tables, are corrupt while the underlying rows may still")
print(" be recoverable.")
print()
print(" Suggested recovery:")
print(" 1. Stop all MemPalace writers / MCP clients.")
print(" 2. Back up the entire palace directory.")
print(" 3. Recover chroma.sqlite3 offline with sqlite3 `.recover` or `.dump`.")
print(" 4. Recreate the FTS5 virtual table from intact embedding_metadata rows.")
print(" 5. Verify `PRAGMA integrity_check` returns `ok`.")
print(" 6. Re-run `mempalace repair --yes`.")
def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False): def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
"""Rebuild the HNSW index from scratch. """Rebuild the HNSW index from scratch.
@@ -397,6 +462,11 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
print(e.message) print(e.message)
return return
sqlite_errors = sqlite_integrity_errors(palace_path)
if sqlite_errors:
print_sqlite_integrity_abort(palace_path, sqlite_errors)
return
# Back up ONLY the SQLite database, not the bloated HNSW files # Back up ONLY the SQLite database, not the bloated HNSW files
sqlite_path = os.path.join(palace_path, "chroma.sqlite3") sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
backup_path = sqlite_path + ".backup" backup_path = sqlite_path + ".backup"
+77 -5
View File
@@ -216,9 +216,11 @@ def test_rebuild_index_empty_palace(mock_backend_cls, mock_shutil, tmp_path):
@patch("mempalace.repair.shutil") @patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend") @patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path): def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
# Create a fake sqlite file # Create a valid sqlite file so the repair preflight can run quick_check.
sqlite_path = tmp_path / "chroma.sqlite3" sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake") with sqlite3.connect(sqlite_path) as conn:
conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
conn.commit()
mock_col = MagicMock() mock_col = MagicMock()
mock_col.count.return_value = 2 mock_col.count.return_value = 2
@@ -234,15 +236,15 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
repair.rebuild_index(palace_path=str(tmp_path)) repair.rebuild_index(palace_path=str(tmp_path))
# Verify: backed up sqlite only (not copytree) # Verify: backed up sqlite only, not copytree.
mock_shutil.copy2.assert_called_once() mock_shutil.copy2.assert_called_once()
assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args) assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args)
# Verify: deleted and recreated (cosine is the backend default) # Verify: deleted and recreated.
mock_backend.delete_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers") mock_backend.delete_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
mock_backend.create_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers") mock_backend.create_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
# Verify: used upsert not add # Verify: used upsert, not add.
mock_new_col.upsert.assert_called_once() mock_new_col.upsert.assert_called_once()
mock_new_col.add.assert_not_called() mock_new_col.add.assert_not_called()
@@ -682,3 +684,73 @@ def test_max_seq_id_rollback_on_verification_failure(tmp_path, monkeypatch):
# A backup file is still present — caller can roll back from it. # A backup file is still present — caller can roll back from it.
leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn] leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn]
assert leftover assert leftover
def test_sqlite_integrity_errors_returns_empty_for_healthy_db(tmp_path):
palace = tmp_path / "palace"
palace.mkdir()
db_path = palace / "chroma.sqlite3"
with sqlite3.connect(db_path) as conn:
conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
conn.commit()
assert repair.sqlite_integrity_errors(str(palace)) == []
def test_sqlite_integrity_errors_reports_unreadable_sqlite_file(tmp_path):
palace = tmp_path / "palace"
palace.mkdir()
db_path = palace / "chroma.sqlite3"
db_path.write_bytes(b"not a sqlite database")
errors = repair.sqlite_integrity_errors(str(palace))
assert errors
assert "quick_check failed" in errors[0]
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_aborts_on_sqlite_integrity_errors_before_delete_collection(
mock_backend_cls,
mock_shutil,
tmp_path,
capsys,
):
"""Regression for #1362: fail before Chroma delete_collection on sqlite corruption."""
sqlite_path = tmp_path / "chroma.sqlite3"
with sqlite3.connect(sqlite_path) as conn:
conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
conn.commit()
mock_col = MagicMock()
mock_col.count.return_value = 2
mock_col.get.return_value = {
"ids": ["id1", "id2"],
"documents": ["doc1", "doc2"],
"metadatas": [{"wing": "a"}, {"wing": "b"}],
}
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
with patch(
"mempalace.repair.sqlite_integrity_errors",
return_value=[
"Page 4 of B-tree 12345: database disk image is malformed",
"Page 8 of B-tree 67890: database disk image is malformed",
],
):
repair.rebuild_index(palace_path=str(tmp_path))
out = capsys.readouterr().out
assert "SQLite-layer corruption detected before repair rebuild" in out
assert "PRAGMA quick_check" in out
assert "delete_collection" in out
assert "Page 4 of B-tree" in out
mock_backend.delete_collection.assert_not_called()
mock_backend.create_collection.assert_not_called()
mock_shutil.copy2.assert_not_called()