fix(migrate): harden swap rollback against partial cross-device copy

shutil.move() can partially create palace_path before raising, which would
trip a bare os.replace(stale_path, palace_path) rollback (dest exists).

- Switch the primary swap to os.replace so same-filesystem moves stay atomic
- Branch on errno.EXDEV before falling back to shutil.move, so real errors
  (permissions, EIO) surface instead of silently attempting a slow copy
- Extract rollback into _restore_stale_palace which clears any partial
  destination and, if the restore itself fails, logs both stale_path and
  palace_path so the operator can recover by hand

Adds three regression tests covering clean rollback, partial-copy cleanup,
and logged failure on rollback-failure.

Flagged by the Qodo reviewer on #935.
This commit is contained in:
shaun0927
2026-04-24 13:12:10 +09:00
parent fb1cf53919
commit 659cb815ea
2 changed files with 86 additions and 5 deletions
+30 -4
View File
@@ -16,6 +16,7 @@ Usage:
mempalace migrate --dry-run # show what would be migrated
"""
import errno
import os
import shutil
import sqlite3
@@ -23,6 +24,26 @@ from collections import defaultdict
from datetime import datetime
def _restore_stale_palace(palace_path: str, stale_path: str) -> None:
"""Roll back a failed swap.
shutil.move() can partially create palace_path before raising, which
would make a bare os.replace(stale_path, palace_path) fail (dest exists).
Clear any partial destination first, then restore. Best-effort: if the
restore itself fails, log both paths so the operator can recover by hand.
"""
try:
if os.path.lexists(palace_path):
shutil.rmtree(palace_path, ignore_errors=True)
os.replace(stale_path, palace_path)
except Exception as err:
print(
f" CRITICAL: rollback failed — original palace at {stale_path}, "
f"partial migration data at {palace_path}. Restore manually. "
f"({err})"
)
def extract_drawers_from_sqlite(db_path: str) -> list:
"""Read all drawers directly from ChromaDB's SQLite, bypassing the API.
@@ -235,14 +256,19 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
stale_path = palace_path + ".old"
if os.path.exists(stale_path):
shutil.rmtree(stale_path)
os.rename(palace_path, stale_path)
os.replace(palace_path, stale_path)
try:
os.rename(temp_palace, palace_path)
except OSError:
os.replace(temp_palace, palace_path)
except OSError as e:
# EXDEV = temp lives on a different filesystem; fall back to copy+delete.
# Anything else is a real error — don't mask it with shutil.move.
if getattr(e, "errno", None) != errno.EXDEV:
_restore_stale_palace(palace_path, stale_path)
raise
try:
shutil.move(temp_palace, palace_path)
except Exception:
os.rename(stale_path, palace_path)
_restore_stale_palace(palace_path, stale_path)
raise
shutil.rmtree(stale_path, ignore_errors=True)