Merge pull request #935 from shaun0927/fix/repair-crash-safety
fix: guard against data loss in repair, migrate, and CLI rebuild
This commit is contained in:
+3
-1
@@ -458,10 +458,12 @@ def cmd_repair(args):
|
|||||||
offset = 0
|
offset = 0
|
||||||
while offset < total:
|
while offset < total:
|
||||||
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
|
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
|
||||||
|
if not batch["ids"]:
|
||||||
|
break
|
||||||
all_ids.extend(batch["ids"])
|
all_ids.extend(batch["ids"])
|
||||||
all_docs.extend(batch["documents"])
|
all_docs.extend(batch["documents"])
|
||||||
all_metas.extend(batch["metadatas"])
|
all_metas.extend(batch["metadatas"])
|
||||||
offset += batch_size
|
offset += len(batch["ids"])
|
||||||
print(f" Extracted {len(all_ids)} drawers")
|
print(f" Extracted {len(all_ids)} drawers")
|
||||||
|
|
||||||
# Backup and rebuild
|
# Backup and rebuild
|
||||||
|
|||||||
+41
-3
@@ -18,6 +18,7 @@ Usage:
|
|||||||
mempalace migrate --dry-run # show what would be migrated
|
mempalace migrate --dry-run # show what would be migrated
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import errno
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import sqlite3
|
import sqlite3
|
||||||
@@ -25,6 +26,26 @@ from collections import defaultdict
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
|
def _restore_stale_palace(palace_path: str, stale_path: str) -> None:
|
||||||
|
"""Roll back a failed swap.
|
||||||
|
|
||||||
|
shutil.move() can partially create palace_path before raising, which
|
||||||
|
would make a bare os.replace(stale_path, palace_path) fail (dest exists).
|
||||||
|
Clear any partial destination first, then restore. Best-effort: if the
|
||||||
|
restore itself fails, log both paths so the operator can recover by hand.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
if os.path.lexists(palace_path):
|
||||||
|
shutil.rmtree(palace_path, ignore_errors=True)
|
||||||
|
os.replace(stale_path, palace_path)
|
||||||
|
except Exception as err:
|
||||||
|
print(
|
||||||
|
f" CRITICAL: rollback failed — original palace at {stale_path}, "
|
||||||
|
f"partial migration data at {palace_path}. Restore manually. "
|
||||||
|
f"({err})"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def extract_drawers_from_sqlite(db_path: str) -> list:
|
def extract_drawers_from_sqlite(db_path: str) -> list:
|
||||||
"""Read all drawers directly from ChromaDB's SQLite, bypassing the API.
|
"""Read all drawers directly from ChromaDB's SQLite, bypassing the API.
|
||||||
|
|
||||||
@@ -231,10 +252,27 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
|
|||||||
del col
|
del col
|
||||||
del fresh_backend
|
del fresh_backend
|
||||||
|
|
||||||
# Swap: remove old palace, move new one into place
|
# Swap: rename old palace aside, then move new one into place.
|
||||||
|
# This avoids a window where both old and new are missing.
|
||||||
print(" Swapping old palace for migrated version...")
|
print(" Swapping old palace for migrated version...")
|
||||||
shutil.rmtree(palace_path)
|
stale_path = palace_path + ".old"
|
||||||
shutil.move(temp_palace, palace_path)
|
if os.path.exists(stale_path):
|
||||||
|
shutil.rmtree(stale_path)
|
||||||
|
os.replace(palace_path, stale_path)
|
||||||
|
try:
|
||||||
|
os.replace(temp_palace, palace_path)
|
||||||
|
except OSError as e:
|
||||||
|
# EXDEV = temp lives on a different filesystem; fall back to copy+delete.
|
||||||
|
# Anything else is a real error — don't mask it with shutil.move.
|
||||||
|
if getattr(e, "errno", None) != errno.EXDEV:
|
||||||
|
_restore_stale_palace(palace_path, stale_path)
|
||||||
|
raise
|
||||||
|
try:
|
||||||
|
shutil.move(temp_palace, palace_path)
|
||||||
|
except Exception:
|
||||||
|
_restore_stale_palace(palace_path, stale_path)
|
||||||
|
raise
|
||||||
|
shutil.rmtree(stale_path, ignore_errors=True)
|
||||||
|
|
||||||
print("\n Migration complete.")
|
print("\n Migration complete.")
|
||||||
print(f" Drawers migrated: {final_count}")
|
print(f" Drawers migrated: {final_count}")
|
||||||
|
|||||||
+20
-8
@@ -254,8 +254,8 @@ def rebuild_index(palace_path=None):
|
|||||||
|
|
||||||
# Back up ONLY the SQLite database, not the bloated HNSW files
|
# Back up ONLY the SQLite database, not the bloated HNSW files
|
||||||
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||||
|
backup_path = sqlite_path + ".backup"
|
||||||
if os.path.exists(sqlite_path):
|
if os.path.exists(sqlite_path):
|
||||||
backup_path = sqlite_path + ".backup"
|
|
||||||
print(f" Backing up chroma.sqlite3 ({os.path.getsize(sqlite_path) / 1e6:.0f} MB)...")
|
print(f" Backing up chroma.sqlite3 ({os.path.getsize(sqlite_path) / 1e6:.0f} MB)...")
|
||||||
shutil.copy2(sqlite_path, backup_path)
|
shutil.copy2(sqlite_path, backup_path)
|
||||||
print(f" Backup: {backup_path}")
|
print(f" Backup: {backup_path}")
|
||||||
@@ -266,13 +266,25 @@ def rebuild_index(palace_path=None):
|
|||||||
new_col = backend.create_collection(palace_path, COLLECTION_NAME)
|
new_col = backend.create_collection(palace_path, COLLECTION_NAME)
|
||||||
|
|
||||||
filed = 0
|
filed = 0
|
||||||
for i in range(0, len(all_ids), batch_size):
|
try:
|
||||||
batch_ids = all_ids[i : i + batch_size]
|
for i in range(0, len(all_ids), batch_size):
|
||||||
batch_docs = all_docs[i : i + batch_size]
|
batch_ids = all_ids[i : i + batch_size]
|
||||||
batch_metas = all_metas[i : i + batch_size]
|
batch_docs = all_docs[i : i + batch_size]
|
||||||
new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
|
batch_metas = all_metas[i : i + batch_size]
|
||||||
filed += len(batch_ids)
|
new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
|
||||||
print(f" Re-filed {filed}/{len(all_ids)} drawers...")
|
filed += len(batch_ids)
|
||||||
|
print(f" Re-filed {filed}/{len(all_ids)} drawers...")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"\n ERROR during rebuild: {e}")
|
||||||
|
print(f" Only {filed}/{len(all_ids)} drawers were re-filed.")
|
||||||
|
if os.path.exists(backup_path):
|
||||||
|
print(f" Restoring from backup: {backup_path}")
|
||||||
|
backend.delete_collection(palace_path, COLLECTION_NAME)
|
||||||
|
shutil.copy2(backup_path, sqlite_path)
|
||||||
|
print(" Backup restored. Palace is back to pre-repair state.")
|
||||||
|
else:
|
||||||
|
print(" No backup available. Re-mine from source files to recover.")
|
||||||
|
raise
|
||||||
|
|
||||||
print(f"\n Repair complete. {filed} drawers rebuilt.")
|
print(f"\n Repair complete. {filed} drawers rebuilt.")
|
||||||
print(" HNSW index is now clean with cosine distance metric.")
|
print(" HNSW index is now clean with cosine distance metric.")
|
||||||
|
|||||||
+56
-1
@@ -1,9 +1,10 @@
|
|||||||
"""Tests for destructive-operation safety in mempalace.migrate."""
|
"""Tests for destructive-operation safety in mempalace.migrate."""
|
||||||
|
|
||||||
|
import os
|
||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
from mempalace.migrate import migrate
|
from mempalace.migrate import _restore_stale_palace, migrate
|
||||||
|
|
||||||
|
|
||||||
def test_migrate_requires_palace_database(tmp_path, capsys):
|
def test_migrate_requires_palace_database(tmp_path, capsys):
|
||||||
@@ -46,3 +47,57 @@ def test_migrate_aborts_without_confirmation(tmp_path, capsys):
|
|||||||
assert "Aborted." in out
|
assert "Aborted." in out
|
||||||
mock_copytree.assert_not_called()
|
mock_copytree.assert_not_called()
|
||||||
mock_rmtree.assert_not_called()
|
mock_rmtree.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
def test_restore_stale_palace_with_clean_destination(tmp_path):
|
||||||
|
"""Rollback when no partial copy exists at palace_path."""
|
||||||
|
palace_path = tmp_path / "palace"
|
||||||
|
stale_path = tmp_path / "palace.old"
|
||||||
|
stale_path.mkdir()
|
||||||
|
(stale_path / "chroma.sqlite3").write_bytes(b"original")
|
||||||
|
|
||||||
|
_restore_stale_palace(str(palace_path), str(stale_path))
|
||||||
|
|
||||||
|
assert palace_path.is_dir()
|
||||||
|
assert (palace_path / "chroma.sqlite3").read_bytes() == b"original"
|
||||||
|
assert not stale_path.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_restore_stale_palace_clears_partial_copy(tmp_path):
|
||||||
|
"""Rollback must remove a partially-copied palace_path before restoring.
|
||||||
|
|
||||||
|
Simulates the Qodo-reported hazard: shutil.move() began creating
|
||||||
|
palace_path, then failed. A bare os.replace(stale, palace_path) would
|
||||||
|
trip on the existing destination; _restore_stale_palace must clear it.
|
||||||
|
"""
|
||||||
|
palace_path = tmp_path / "palace"
|
||||||
|
stale_path = tmp_path / "palace.old"
|
||||||
|
|
||||||
|
stale_path.mkdir()
|
||||||
|
(stale_path / "chroma.sqlite3").write_bytes(b"original")
|
||||||
|
|
||||||
|
palace_path.mkdir()
|
||||||
|
(palace_path / "half-copied.bin").write_bytes(b"garbage")
|
||||||
|
|
||||||
|
_restore_stale_palace(str(palace_path), str(stale_path))
|
||||||
|
|
||||||
|
assert palace_path.is_dir()
|
||||||
|
assert (palace_path / "chroma.sqlite3").read_bytes() == b"original"
|
||||||
|
assert not (palace_path / "half-copied.bin").exists()
|
||||||
|
assert not stale_path.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_restore_stale_palace_logs_and_swallows_on_failure(tmp_path, capsys):
|
||||||
|
"""If restore itself fails, log both paths — don't raise from rollback."""
|
||||||
|
palace_path = tmp_path / "palace"
|
||||||
|
stale_path = tmp_path / "palace.old"
|
||||||
|
stale_path.mkdir()
|
||||||
|
|
||||||
|
# Force os.replace to fail deterministically.
|
||||||
|
with patch("mempalace.migrate.os.replace", side_effect=OSError("boom")):
|
||||||
|
_restore_stale_palace(str(palace_path), str(stale_path))
|
||||||
|
|
||||||
|
out = capsys.readouterr().out
|
||||||
|
assert "CRITICAL" in out
|
||||||
|
assert os.fspath(palace_path) in out
|
||||||
|
assert os.fspath(stale_path) in out
|
||||||
|
|||||||
Reference in New Issue
Block a user