Merge pull request #935 from shaun0927/fix/repair-crash-safety

fix: guard against data loss in repair, migrate, and CLI rebuild
This commit is contained in:
Igor Lins e Silva
2026-04-25 20:49:23 -03:00
committed by GitHub
4 changed files with 120 additions and 13 deletions
+3 -1
View File
@@ -458,10 +458,12 @@ def cmd_repair(args):
offset = 0 offset = 0
while offset < total: while offset < total:
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"]) batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
if not batch["ids"]:
break
all_ids.extend(batch["ids"]) all_ids.extend(batch["ids"])
all_docs.extend(batch["documents"]) all_docs.extend(batch["documents"])
all_metas.extend(batch["metadatas"]) all_metas.extend(batch["metadatas"])
offset += batch_size offset += len(batch["ids"])
print(f" Extracted {len(all_ids)} drawers") print(f" Extracted {len(all_ids)} drawers")
# Backup and rebuild # Backup and rebuild
+41 -3
View File
@@ -18,6 +18,7 @@ Usage:
mempalace migrate --dry-run # show what would be migrated mempalace migrate --dry-run # show what would be migrated
""" """
import errno
import os import os
import shutil import shutil
import sqlite3 import sqlite3
@@ -25,6 +26,26 @@ from collections import defaultdict
from datetime import datetime from datetime import datetime
def _restore_stale_palace(palace_path: str, stale_path: str) -> None:
"""Roll back a failed swap.
shutil.move() can partially create palace_path before raising, which
would make a bare os.replace(stale_path, palace_path) fail (dest exists).
Clear any partial destination first, then restore. Best-effort: if the
restore itself fails, log both paths so the operator can recover by hand.
"""
try:
if os.path.lexists(palace_path):
shutil.rmtree(palace_path, ignore_errors=True)
os.replace(stale_path, palace_path)
except Exception as err:
print(
f" CRITICAL: rollback failed — original palace at {stale_path}, "
f"partial migration data at {palace_path}. Restore manually. "
f"({err})"
)
def extract_drawers_from_sqlite(db_path: str) -> list: def extract_drawers_from_sqlite(db_path: str) -> list:
"""Read all drawers directly from ChromaDB's SQLite, bypassing the API. """Read all drawers directly from ChromaDB's SQLite, bypassing the API.
@@ -231,10 +252,27 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
del col del col
del fresh_backend del fresh_backend
# Swap: remove old palace, move new one into place # Swap: rename old palace aside, then move new one into place.
# This avoids a window where both old and new are missing.
print(" Swapping old palace for migrated version...") print(" Swapping old palace for migrated version...")
shutil.rmtree(palace_path) stale_path = palace_path + ".old"
shutil.move(temp_palace, palace_path) if os.path.exists(stale_path):
shutil.rmtree(stale_path)
os.replace(palace_path, stale_path)
try:
os.replace(temp_palace, palace_path)
except OSError as e:
# EXDEV = temp lives on a different filesystem; fall back to copy+delete.
# Anything else is a real error — don't mask it with shutil.move.
if getattr(e, "errno", None) != errno.EXDEV:
_restore_stale_palace(palace_path, stale_path)
raise
try:
shutil.move(temp_palace, palace_path)
except Exception:
_restore_stale_palace(palace_path, stale_path)
raise
shutil.rmtree(stale_path, ignore_errors=True)
print("\n Migration complete.") print("\n Migration complete.")
print(f" Drawers migrated: {final_count}") print(f" Drawers migrated: {final_count}")
+20 -8
View File
@@ -254,8 +254,8 @@ def rebuild_index(palace_path=None):
# Back up ONLY the SQLite database, not the bloated HNSW files # Back up ONLY the SQLite database, not the bloated HNSW files
sqlite_path = os.path.join(palace_path, "chroma.sqlite3") sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
backup_path = sqlite_path + ".backup"
if os.path.exists(sqlite_path): if os.path.exists(sqlite_path):
backup_path = sqlite_path + ".backup"
print(f" Backing up chroma.sqlite3 ({os.path.getsize(sqlite_path) / 1e6:.0f} MB)...") print(f" Backing up chroma.sqlite3 ({os.path.getsize(sqlite_path) / 1e6:.0f} MB)...")
shutil.copy2(sqlite_path, backup_path) shutil.copy2(sqlite_path, backup_path)
print(f" Backup: {backup_path}") print(f" Backup: {backup_path}")
@@ -266,13 +266,25 @@ def rebuild_index(palace_path=None):
new_col = backend.create_collection(palace_path, COLLECTION_NAME) new_col = backend.create_collection(palace_path, COLLECTION_NAME)
filed = 0 filed = 0
for i in range(0, len(all_ids), batch_size): try:
batch_ids = all_ids[i : i + batch_size] for i in range(0, len(all_ids), batch_size):
batch_docs = all_docs[i : i + batch_size] batch_ids = all_ids[i : i + batch_size]
batch_metas = all_metas[i : i + batch_size] batch_docs = all_docs[i : i + batch_size]
new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas) batch_metas = all_metas[i : i + batch_size]
filed += len(batch_ids) new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
print(f" Re-filed {filed}/{len(all_ids)} drawers...") filed += len(batch_ids)
print(f" Re-filed {filed}/{len(all_ids)} drawers...")
except Exception as e:
print(f"\n ERROR during rebuild: {e}")
print(f" Only {filed}/{len(all_ids)} drawers were re-filed.")
if os.path.exists(backup_path):
print(f" Restoring from backup: {backup_path}")
backend.delete_collection(palace_path, COLLECTION_NAME)
shutil.copy2(backup_path, sqlite_path)
print(" Backup restored. Palace is back to pre-repair state.")
else:
print(" No backup available. Re-mine from source files to recover.")
raise
print(f"\n Repair complete. {filed} drawers rebuilt.") print(f"\n Repair complete. {filed} drawers rebuilt.")
print(" HNSW index is now clean with cosine distance metric.") print(" HNSW index is now clean with cosine distance metric.")
+56 -1
View File
@@ -1,9 +1,10 @@
"""Tests for destructive-operation safety in mempalace.migrate.""" """Tests for destructive-operation safety in mempalace.migrate."""
import os
from types import SimpleNamespace from types import SimpleNamespace
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from mempalace.migrate import migrate from mempalace.migrate import _restore_stale_palace, migrate
def test_migrate_requires_palace_database(tmp_path, capsys): def test_migrate_requires_palace_database(tmp_path, capsys):
@@ -46,3 +47,57 @@ def test_migrate_aborts_without_confirmation(tmp_path, capsys):
assert "Aborted." in out assert "Aborted." in out
mock_copytree.assert_not_called() mock_copytree.assert_not_called()
mock_rmtree.assert_not_called() mock_rmtree.assert_not_called()
def test_restore_stale_palace_with_clean_destination(tmp_path):
"""Rollback when no partial copy exists at palace_path."""
palace_path = tmp_path / "palace"
stale_path = tmp_path / "palace.old"
stale_path.mkdir()
(stale_path / "chroma.sqlite3").write_bytes(b"original")
_restore_stale_palace(str(palace_path), str(stale_path))
assert palace_path.is_dir()
assert (palace_path / "chroma.sqlite3").read_bytes() == b"original"
assert not stale_path.exists()
def test_restore_stale_palace_clears_partial_copy(tmp_path):
"""Rollback must remove a partially-copied palace_path before restoring.
Simulates the Qodo-reported hazard: shutil.move() began creating
palace_path, then failed. A bare os.replace(stale, palace_path) would
trip on the existing destination; _restore_stale_palace must clear it.
"""
palace_path = tmp_path / "palace"
stale_path = tmp_path / "palace.old"
stale_path.mkdir()
(stale_path / "chroma.sqlite3").write_bytes(b"original")
palace_path.mkdir()
(palace_path / "half-copied.bin").write_bytes(b"garbage")
_restore_stale_palace(str(palace_path), str(stale_path))
assert palace_path.is_dir()
assert (palace_path / "chroma.sqlite3").read_bytes() == b"original"
assert not (palace_path / "half-copied.bin").exists()
assert not stale_path.exists()
def test_restore_stale_palace_logs_and_swallows_on_failure(tmp_path, capsys):
"""If restore itself fails, log both paths — don't raise from rollback."""
palace_path = tmp_path / "palace"
stale_path = tmp_path / "palace.old"
stale_path.mkdir()
# Force os.replace to fail deterministically.
with patch("mempalace.migrate.os.replace", side_effect=OSError("boom")):
_restore_stale_palace(str(palace_path), str(stale_path))
out = capsys.readouterr().out
assert "CRITICAL" in out
assert os.fspath(palace_path) in out
assert os.fspath(stale_path) in out