diff --git a/mempalace/repair.py b/mempalace/repair.py index 115ae54..6e170ef 100644 --- a/mempalace/repair.py +++ b/mempalace/repair.py @@ -486,6 +486,71 @@ def sqlite_drawer_count(palace_path: str, collection_name: Optional[str] = None) return None +def sqlite_integrity_errors(palace_path: str) -> list[str]: + """Return SQLite quick_check errors for chroma.sqlite3. + + The repair rebuild path eventually calls Chroma's delete_collection(). + If the SQLite layer has corrupt secondary indexes or FTS5 shadow pages, + Chroma can raise an opaque SQLITE_CORRUPT_INDEX / code 779 error before + repair reaches the HNSW rebuild. + + Run a direct SQLite quick_check first so repair can fail with a clear, + actionable message before invoking Chroma's destructive collection-delete + path. + """ + + sqlite_path = os.path.join(palace_path, "chroma.sqlite3") + if not os.path.exists(sqlite_path): + return [] + + try: + with sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True) as conn: + rows = conn.execute("PRAGMA quick_check").fetchall() + except sqlite3.Error as e: + return [f"PRAGMA quick_check failed: {e}"] + + errors: list[str] = [] + for row in rows: + if not row: + continue + message = str(row[0]) + if message.lower() != "ok": + errors.append(message) + + return errors + + +def print_sqlite_integrity_abort(palace_path: str, errors: list[str]) -> None: + """Print a clear repair abort message for SQLite-layer corruption.""" + + sqlite_path = os.path.join(palace_path, "chroma.sqlite3") + preview = errors[:5] + + print("\n ABORT: SQLite-layer corruption detected before repair rebuild.") + print(" `mempalace repair` will not call Chroma delete_collection() because") + print(" the SQLite database failed `PRAGMA quick_check`.") + print() + print(f" Database: {sqlite_path}") + print() + print(" quick_check output:") + for message in preview: + print(f" - {message}") + if len(errors) > len(preview): + print(f" ... and {len(errors) - len(preview)} more issue(s)") + print() + print(" This often means derived SQLite structures, such as secondary indexes") + print(" or FTS5 shadow tables, are corrupt while the underlying rows may still") + print(" be recoverable.") + print() + print(" Suggested recovery:") + print(" 1. Stop all MemPalace writers / MCP clients.") + print(" 2. Back up the entire palace directory.") + print(" 3. Recover chroma.sqlite3 offline with sqlite3 `.recover` or `.dump`.") + print(" 4. Recreate the FTS5 virtual table from intact embedding_metadata rows.") + print(" 5. Verify `PRAGMA integrity_check` returns `ok`.") + print(" 6. Re-run `mempalace repair --yes`.") + + def maybe_repair_poisoned_max_seq_id_before_rebuild( palace_path: str, *, @@ -611,6 +676,11 @@ def rebuild_index( print(e.message) return + sqlite_errors = sqlite_integrity_errors(palace_path) + if sqlite_errors: + print_sqlite_integrity_abort(palace_path, sqlite_errors) + return + # Back up ONLY the SQLite database, not the bloated HNSW files sqlite_path = os.path.join(palace_path, "chroma.sqlite3") backup_path = sqlite_path + ".backup" diff --git a/tests/test_repair.py b/tests/test_repair.py index 03d6ac8..dda83ec 100644 --- a/tests/test_repair.py +++ b/tests/test_repair.py @@ -226,9 +226,11 @@ def test_rebuild_index_empty_palace(mock_backend_cls, mock_shutil, tmp_path): @patch("mempalace.repair.shutil") @patch("mempalace.repair.ChromaBackend") def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path): - # Create a fake sqlite file + # Create a valid sqlite file so the repair preflight can run quick_check. sqlite_path = tmp_path / "chroma.sqlite3" - sqlite_path.write_text("fake") + with sqlite3.connect(sqlite_path) as conn: + conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)") + conn.commit() mock_col = MagicMock() mock_col.count.return_value = 2 @@ -247,7 +249,7 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path): repair.rebuild_index(palace_path=str(tmp_path)) - # Verify: backed up sqlite only (not copytree) + # Verify: backed up sqlite only, not copytree. mock_shutil.copy2.assert_called_once() assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args) @@ -274,7 +276,7 @@ def test_rebuild_index_ignores_missing_temp_collection_at_start( mock_backend_cls, mock_shutil, tmp_path ): sqlite_path = tmp_path / "chroma.sqlite3" - sqlite_path.write_text("fake") + sqlite3.connect(str(sqlite_path)).close() def _fake_copy2(src, dst): with open(dst, "w") as handle: @@ -413,7 +415,7 @@ def test_sqlite_drawer_count_returns_none_on_unreadable_schema(tmp_path): @patch("mempalace.repair.ChromaBackend") def test_rebuild_index_default_uses_configured_collection(mock_backend_cls, mock_shutil, tmp_path): sqlite_path = tmp_path / "chroma.sqlite3" - sqlite_path.write_text("fake") + sqlite3.connect(str(sqlite_path)).close() mock_col = MagicMock() mock_col.count.return_value = 2 mock_col.get.return_value = { @@ -543,7 +545,7 @@ def test_rebuild_index_stage_failure_leaves_live_collection_untouched( mock_backend_cls, mock_shutil, tmp_path ): sqlite_path = tmp_path / "chroma.sqlite3" - sqlite_path.write_text("fake") + sqlite3.connect(str(sqlite_path)).close() mock_col = MagicMock() mock_col.count.return_value = 2 @@ -572,7 +574,7 @@ def test_rebuild_index_stage_failure_leaves_live_collection_untouched( @patch("mempalace.repair.ChromaBackend") def test_rebuild_index_live_failure_restores_backup(mock_backend_cls, mock_shutil, tmp_path): sqlite_path = tmp_path / "chroma.sqlite3" - sqlite_path.write_text("fake") + sqlite3.connect(str(sqlite_path)).close() def _fake_copy2(src, dst): with open(dst, "w") as handle: @@ -618,7 +620,7 @@ def test_rebuild_index_live_delete_missing_still_restores_backup( mock_backend_cls, mock_shutil, tmp_path ): sqlite_path = tmp_path / "chroma.sqlite3" - sqlite_path.write_text("fake") + sqlite3.connect(str(sqlite_path)).close() def _fake_copy2(src, dst): with open(dst, "w") as handle: @@ -663,7 +665,7 @@ def test_rebuild_index_restore_failure_preserves_original_error( mock_backend_cls, mock_shutil, tmp_path, capsys ): sqlite_path = tmp_path / "chroma.sqlite3" - sqlite_path.write_text("fake") + sqlite3.connect(str(sqlite_path)).close() def _copy2_side_effect(src, dst): if str(src).endswith(".backup"): @@ -738,7 +740,7 @@ def test_rebuild_index_ignores_temp_cleanup_failure_after_success( mock_backend_cls, mock_shutil, tmp_path ): sqlite_path = tmp_path / "chroma.sqlite3" - sqlite_path.write_text("fake") + sqlite3.connect(str(sqlite_path)).close() def _fake_copy2(src, dst): with open(dst, "w") as handle: @@ -1081,6 +1083,76 @@ def test_max_seq_id_rollback_on_verification_failure(tmp_path, monkeypatch): assert leftover +def test_sqlite_integrity_errors_returns_empty_for_healthy_db(tmp_path): + palace = tmp_path / "palace" + palace.mkdir() + db_path = palace / "chroma.sqlite3" + + with sqlite3.connect(db_path) as conn: + conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)") + conn.commit() + + assert repair.sqlite_integrity_errors(str(palace)) == [] + + +def test_sqlite_integrity_errors_reports_unreadable_sqlite_file(tmp_path): + palace = tmp_path / "palace" + palace.mkdir() + db_path = palace / "chroma.sqlite3" + db_path.write_bytes(b"not a sqlite database") + + errors = repair.sqlite_integrity_errors(str(palace)) + + assert errors + assert "quick_check failed" in errors[0] + + +@patch("mempalace.repair.shutil") +@patch("mempalace.repair.ChromaBackend") +def test_rebuild_index_aborts_on_sqlite_integrity_errors_before_delete_collection( + mock_backend_cls, + mock_shutil, + tmp_path, + capsys, +): + """Regression for #1362: fail before Chroma delete_collection on sqlite corruption.""" + + sqlite_path = tmp_path / "chroma.sqlite3" + with sqlite3.connect(sqlite_path) as conn: + conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)") + conn.commit() + + mock_col = MagicMock() + mock_col.count.return_value = 2 + mock_col.get.return_value = { + "ids": ["id1", "id2"], + "documents": ["doc1", "doc2"], + "metadatas": [{"wing": "a"}, {"wing": "b"}], + } + + mock_backend = _install_mock_backend(mock_backend_cls, mock_col) + + with patch( + "mempalace.repair.sqlite_integrity_errors", + return_value=[ + "Page 4 of B-tree 12345: database disk image is malformed", + "Page 8 of B-tree 67890: database disk image is malformed", + ], + ): + repair.rebuild_index(palace_path=str(tmp_path)) + + out = capsys.readouterr().out + + assert "SQLite-layer corruption detected before repair rebuild" in out + assert "PRAGMA quick_check" in out + assert "delete_collection" in out + assert "Page 4 of B-tree" in out + + mock_backend.delete_collection.assert_not_called() + mock_backend.create_collection.assert_not_called() + mock_shutil.copy2.assert_not_called() + + def test_max_seq_id_preflight_preserves_embeddings_queue(tmp_path): """#1295: default repair preflight must not drop queued writes."""