fix(migrate): harden swap rollback against partial cross-device copy
shutil.move() can partially create palace_path before raising, which would trip a bare os.replace(stale_path, palace_path) rollback (dest exists). - Switch the primary swap to os.replace so same-filesystem moves stay atomic - Branch on errno.EXDEV before falling back to shutil.move, so real errors (permissions, EIO) surface instead of silently attempting a slow copy - Extract rollback into _restore_stale_palace which clears any partial destination and, if the restore itself fails, logs both stale_path and palace_path so the operator can recover by hand Adds three regression tests covering clean rollback, partial-copy cleanup, and logged failure on rollback-failure. Flagged by the Qodo reviewer on #935.
This commit is contained in:
+30
-4
@@ -16,6 +16,7 @@ Usage:
|
||||
mempalace migrate --dry-run # show what would be migrated
|
||||
"""
|
||||
|
||||
import errno
|
||||
import os
|
||||
import shutil
|
||||
import sqlite3
|
||||
@@ -23,6 +24,26 @@ from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def _restore_stale_palace(palace_path: str, stale_path: str) -> None:
|
||||
"""Roll back a failed swap.
|
||||
|
||||
shutil.move() can partially create palace_path before raising, which
|
||||
would make a bare os.replace(stale_path, palace_path) fail (dest exists).
|
||||
Clear any partial destination first, then restore. Best-effort: if the
|
||||
restore itself fails, log both paths so the operator can recover by hand.
|
||||
"""
|
||||
try:
|
||||
if os.path.lexists(palace_path):
|
||||
shutil.rmtree(palace_path, ignore_errors=True)
|
||||
os.replace(stale_path, palace_path)
|
||||
except Exception as err:
|
||||
print(
|
||||
f" CRITICAL: rollback failed — original palace at {stale_path}, "
|
||||
f"partial migration data at {palace_path}. Restore manually. "
|
||||
f"({err})"
|
||||
)
|
||||
|
||||
|
||||
def extract_drawers_from_sqlite(db_path: str) -> list:
|
||||
"""Read all drawers directly from ChromaDB's SQLite, bypassing the API.
|
||||
|
||||
@@ -235,14 +256,19 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
|
||||
stale_path = palace_path + ".old"
|
||||
if os.path.exists(stale_path):
|
||||
shutil.rmtree(stale_path)
|
||||
os.rename(palace_path, stale_path)
|
||||
os.replace(palace_path, stale_path)
|
||||
try:
|
||||
os.rename(temp_palace, palace_path)
|
||||
except OSError:
|
||||
os.replace(temp_palace, palace_path)
|
||||
except OSError as e:
|
||||
# EXDEV = temp lives on a different filesystem; fall back to copy+delete.
|
||||
# Anything else is a real error — don't mask it with shutil.move.
|
||||
if getattr(e, "errno", None) != errno.EXDEV:
|
||||
_restore_stale_palace(palace_path, stale_path)
|
||||
raise
|
||||
try:
|
||||
shutil.move(temp_palace, palace_path)
|
||||
except Exception:
|
||||
os.rename(stale_path, palace_path)
|
||||
_restore_stale_palace(palace_path, stale_path)
|
||||
raise
|
||||
shutil.rmtree(stale_path, ignore_errors=True)
|
||||
|
||||
|
||||
+56
-1
@@ -1,9 +1,10 @@
|
||||
"""Tests for destructive-operation safety in mempalace.migrate."""
|
||||
|
||||
import os
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from mempalace.migrate import migrate
|
||||
from mempalace.migrate import _restore_stale_palace, migrate
|
||||
|
||||
|
||||
def test_migrate_requires_palace_database(tmp_path, capsys):
|
||||
@@ -46,3 +47,57 @@ def test_migrate_aborts_without_confirmation(tmp_path, capsys):
|
||||
assert "Aborted." in out
|
||||
mock_copytree.assert_not_called()
|
||||
mock_rmtree.assert_not_called()
|
||||
|
||||
|
||||
def test_restore_stale_palace_with_clean_destination(tmp_path):
|
||||
"""Rollback when no partial copy exists at palace_path."""
|
||||
palace_path = tmp_path / "palace"
|
||||
stale_path = tmp_path / "palace.old"
|
||||
stale_path.mkdir()
|
||||
(stale_path / "chroma.sqlite3").write_bytes(b"original")
|
||||
|
||||
_restore_stale_palace(str(palace_path), str(stale_path))
|
||||
|
||||
assert palace_path.is_dir()
|
||||
assert (palace_path / "chroma.sqlite3").read_bytes() == b"original"
|
||||
assert not stale_path.exists()
|
||||
|
||||
|
||||
def test_restore_stale_palace_clears_partial_copy(tmp_path):
|
||||
"""Rollback must remove a partially-copied palace_path before restoring.
|
||||
|
||||
Simulates the Qodo-reported hazard: shutil.move() began creating
|
||||
palace_path, then failed. A bare os.replace(stale, palace_path) would
|
||||
trip on the existing destination; _restore_stale_palace must clear it.
|
||||
"""
|
||||
palace_path = tmp_path / "palace"
|
||||
stale_path = tmp_path / "palace.old"
|
||||
|
||||
stale_path.mkdir()
|
||||
(stale_path / "chroma.sqlite3").write_bytes(b"original")
|
||||
|
||||
palace_path.mkdir()
|
||||
(palace_path / "half-copied.bin").write_bytes(b"garbage")
|
||||
|
||||
_restore_stale_palace(str(palace_path), str(stale_path))
|
||||
|
||||
assert palace_path.is_dir()
|
||||
assert (palace_path / "chroma.sqlite3").read_bytes() == b"original"
|
||||
assert not (palace_path / "half-copied.bin").exists()
|
||||
assert not stale_path.exists()
|
||||
|
||||
|
||||
def test_restore_stale_palace_logs_and_swallows_on_failure(tmp_path, capsys):
|
||||
"""If restore itself fails, log both paths — don't raise from rollback."""
|
||||
palace_path = tmp_path / "palace"
|
||||
stale_path = tmp_path / "palace.old"
|
||||
stale_path.mkdir()
|
||||
|
||||
# Force os.replace to fail deterministically.
|
||||
with patch("mempalace.migrate.os.replace", side_effect=OSError("boom")):
|
||||
_restore_stale_palace(str(palace_path), str(stale_path))
|
||||
|
||||
out = capsys.readouterr().out
|
||||
assert "CRITICAL" in out
|
||||
assert os.fspath(palace_path) in out
|
||||
assert os.fspath(stale_path) in out
|
||||
|
||||
Reference in New Issue
Block a user