Merge pull request #1364 from fatkobra/fix/1362-repair-sqlite-integrity-preflight

fix(repair): preflight SQLite integrity before rebuild
This commit is contained in:
Igor Lins e Silva
2026-05-07 11:07:52 -03:00
committed by GitHub
2 changed files with 152 additions and 10 deletions
+70
View File
@@ -486,6 +486,71 @@ def sqlite_drawer_count(palace_path: str, collection_name: Optional[str] = None)
return None
def sqlite_integrity_errors(palace_path: str) -> list[str]:
"""Return SQLite quick_check errors for chroma.sqlite3.
The repair rebuild path eventually calls Chroma's delete_collection().
If the SQLite layer has corrupt secondary indexes or FTS5 shadow pages,
Chroma can raise an opaque SQLITE_CORRUPT_INDEX / code 779 error before
repair reaches the HNSW rebuild.
Run a direct SQLite quick_check first so repair can fail with a clear,
actionable message before invoking Chroma's destructive collection-delete
path.
"""
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.exists(sqlite_path):
return []
try:
with sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True) as conn:
rows = conn.execute("PRAGMA quick_check").fetchall()
except sqlite3.Error as e:
return [f"PRAGMA quick_check failed: {e}"]
errors: list[str] = []
for row in rows:
if not row:
continue
message = str(row[0])
if message.lower() != "ok":
errors.append(message)
return errors
def print_sqlite_integrity_abort(palace_path: str, errors: list[str]) -> None:
"""Print a clear repair abort message for SQLite-layer corruption."""
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
preview = errors[:5]
print("\n ABORT: SQLite-layer corruption detected before repair rebuild.")
print(" `mempalace repair` will not call Chroma delete_collection() because")
print(" the SQLite database failed `PRAGMA quick_check`.")
print()
print(f" Database: {sqlite_path}")
print()
print(" quick_check output:")
for message in preview:
print(f" - {message}")
if len(errors) > len(preview):
print(f" ... and {len(errors) - len(preview)} more issue(s)")
print()
print(" This often means derived SQLite structures, such as secondary indexes")
print(" or FTS5 shadow tables, are corrupt while the underlying rows may still")
print(" be recoverable.")
print()
print(" Suggested recovery:")
print(" 1. Stop all MemPalace writers / MCP clients.")
print(" 2. Back up the entire palace directory.")
print(" 3. Recover chroma.sqlite3 offline with sqlite3 `.recover` or `.dump`.")
print(" 4. Recreate the FTS5 virtual table from intact embedding_metadata rows.")
print(" 5. Verify `PRAGMA integrity_check` returns `ok`.")
print(" 6. Re-run `mempalace repair --yes`.")
def maybe_repair_poisoned_max_seq_id_before_rebuild(
palace_path: str,
*,
@@ -611,6 +676,11 @@ def rebuild_index(
print(e.message)
return
sqlite_errors = sqlite_integrity_errors(palace_path)
if sqlite_errors:
print_sqlite_integrity_abort(palace_path, sqlite_errors)
return
# Back up ONLY the SQLite database, not the bloated HNSW files
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
backup_path = sqlite_path + ".backup"
+82 -10
View File
@@ -226,9 +226,11 @@ def test_rebuild_index_empty_palace(mock_backend_cls, mock_shutil, tmp_path):
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
# Create a fake sqlite file
# Create a valid sqlite file so the repair preflight can run quick_check.
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
with sqlite3.connect(sqlite_path) as conn:
conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
conn.commit()
mock_col = MagicMock()
mock_col.count.return_value = 2
@@ -247,7 +249,7 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
repair.rebuild_index(palace_path=str(tmp_path))
# Verify: backed up sqlite only (not copytree)
# Verify: backed up sqlite only, not copytree.
mock_shutil.copy2.assert_called_once()
assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args)
@@ -274,7 +276,7 @@ def test_rebuild_index_ignores_missing_temp_collection_at_start(
mock_backend_cls, mock_shutil, tmp_path
):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
sqlite3.connect(str(sqlite_path)).close()
def _fake_copy2(src, dst):
with open(dst, "w") as handle:
@@ -413,7 +415,7 @@ def test_sqlite_drawer_count_returns_none_on_unreadable_schema(tmp_path):
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_default_uses_configured_collection(mock_backend_cls, mock_shutil, tmp_path):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
sqlite3.connect(str(sqlite_path)).close()
mock_col = MagicMock()
mock_col.count.return_value = 2
mock_col.get.return_value = {
@@ -543,7 +545,7 @@ def test_rebuild_index_stage_failure_leaves_live_collection_untouched(
mock_backend_cls, mock_shutil, tmp_path
):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
sqlite3.connect(str(sqlite_path)).close()
mock_col = MagicMock()
mock_col.count.return_value = 2
@@ -572,7 +574,7 @@ def test_rebuild_index_stage_failure_leaves_live_collection_untouched(
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_live_failure_restores_backup(mock_backend_cls, mock_shutil, tmp_path):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
sqlite3.connect(str(sqlite_path)).close()
def _fake_copy2(src, dst):
with open(dst, "w") as handle:
@@ -618,7 +620,7 @@ def test_rebuild_index_live_delete_missing_still_restores_backup(
mock_backend_cls, mock_shutil, tmp_path
):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
sqlite3.connect(str(sqlite_path)).close()
def _fake_copy2(src, dst):
with open(dst, "w") as handle:
@@ -663,7 +665,7 @@ def test_rebuild_index_restore_failure_preserves_original_error(
mock_backend_cls, mock_shutil, tmp_path, capsys
):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
sqlite3.connect(str(sqlite_path)).close()
def _copy2_side_effect(src, dst):
if str(src).endswith(".backup"):
@@ -738,7 +740,7 @@ def test_rebuild_index_ignores_temp_cleanup_failure_after_success(
mock_backend_cls, mock_shutil, tmp_path
):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
sqlite3.connect(str(sqlite_path)).close()
def _fake_copy2(src, dst):
with open(dst, "w") as handle:
@@ -1081,6 +1083,76 @@ def test_max_seq_id_rollback_on_verification_failure(tmp_path, monkeypatch):
assert leftover
def test_sqlite_integrity_errors_returns_empty_for_healthy_db(tmp_path):
palace = tmp_path / "palace"
palace.mkdir()
db_path = palace / "chroma.sqlite3"
with sqlite3.connect(db_path) as conn:
conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
conn.commit()
assert repair.sqlite_integrity_errors(str(palace)) == []
def test_sqlite_integrity_errors_reports_unreadable_sqlite_file(tmp_path):
palace = tmp_path / "palace"
palace.mkdir()
db_path = palace / "chroma.sqlite3"
db_path.write_bytes(b"not a sqlite database")
errors = repair.sqlite_integrity_errors(str(palace))
assert errors
assert "quick_check failed" in errors[0]
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_aborts_on_sqlite_integrity_errors_before_delete_collection(
mock_backend_cls,
mock_shutil,
tmp_path,
capsys,
):
"""Regression for #1362: fail before Chroma delete_collection on sqlite corruption."""
sqlite_path = tmp_path / "chroma.sqlite3"
with sqlite3.connect(sqlite_path) as conn:
conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
conn.commit()
mock_col = MagicMock()
mock_col.count.return_value = 2
mock_col.get.return_value = {
"ids": ["id1", "id2"],
"documents": ["doc1", "doc2"],
"metadatas": [{"wing": "a"}, {"wing": "b"}],
}
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
with patch(
"mempalace.repair.sqlite_integrity_errors",
return_value=[
"Page 4 of B-tree 12345: database disk image is malformed",
"Page 8 of B-tree 67890: database disk image is malformed",
],
):
repair.rebuild_index(palace_path=str(tmp_path))
out = capsys.readouterr().out
assert "SQLite-layer corruption detected before repair rebuild" in out
assert "PRAGMA quick_check" in out
assert "delete_collection" in out
assert "Page 4 of B-tree" in out
mock_backend.delete_collection.assert_not_called()
mock_backend.create_collection.assert_not_called()
mock_shutil.copy2.assert_not_called()
def test_max_seq_id_preflight_preserves_embeddings_queue(tmp_path):
"""#1295: default repair preflight must not drop queued writes."""