From 5dfe8531543f674022b30722209733c7b0ce309e Mon Sep 17 00:00:00 2001 From: JunghwanNA <70629228+shaun0927@users.noreply.github.com> Date: Thu, 16 Apr 2026 12:11:18 +0900 Subject: [PATCH 1/3] fix: guard against data loss in repair, migrate, and CLI rebuild - repair.py: wrap upsert loop in try/except; restore from backup on failure instead of leaving a partially rebuilt collection - migrate.py: replace non-atomic rmtree+move with rename-aside swap so a crash between the two calls does not destroy both copies - cli.py: use offset += len(batch["ids"]) with empty-batch guard instead of fixed offset += batch_size to prevent skipping drawers --- mempalace/cli.py | 4 +++- mempalace/migrate.py | 15 ++++++++++++--- mempalace/repair.py | 26 +++++++++++++++++++------- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/mempalace/cli.py b/mempalace/cli.py index fb2f0ae..d185f61 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -270,10 +270,12 @@ def cmd_repair(args): offset = 0 while offset < total: batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"]) + if not batch["ids"]: + break all_ids.extend(batch["ids"]) all_docs.extend(batch["documents"]) all_metas.extend(batch["metadatas"]) - offset += batch_size + offset += len(batch["ids"]) print(f" Extracted {len(all_ids)} drawers") # Backup and rebuild diff --git a/mempalace/migrate.py b/mempalace/migrate.py index 2eebb61..c804211 100644 --- a/mempalace/migrate.py +++ b/mempalace/migrate.py @@ -229,10 +229,19 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False): del col del fresh_backend - # Swap: remove old palace, move new one into place + # Swap: rename old palace aside, then move new one into place. + # This avoids a window where both old and new are missing. print(" Swapping old palace for migrated version...") - shutil.rmtree(palace_path) - shutil.move(temp_palace, palace_path) + stale_path = palace_path + ".old" + if os.path.exists(stale_path): + shutil.rmtree(stale_path) + os.rename(palace_path, stale_path) + try: + os.rename(temp_palace, palace_path) + except OSError: + # os.rename fails across filesystems; fall back to move + shutil.move(temp_palace, palace_path) + shutil.rmtree(stale_path, ignore_errors=True) print("\n Migration complete.") print(f" Drawers migrated: {final_count}") diff --git a/mempalace/repair.py b/mempalace/repair.py index 9a9aa88..1b4f271 100644 --- a/mempalace/repair.py +++ b/mempalace/repair.py @@ -266,13 +266,25 @@ def rebuild_index(palace_path=None): new_col = backend.create_collection(palace_path, COLLECTION_NAME) filed = 0 - for i in range(0, len(all_ids), batch_size): - batch_ids = all_ids[i : i + batch_size] - batch_docs = all_docs[i : i + batch_size] - batch_metas = all_metas[i : i + batch_size] - new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas) - filed += len(batch_ids) - print(f" Re-filed {filed}/{len(all_ids)} drawers...") + try: + for i in range(0, len(all_ids), batch_size): + batch_ids = all_ids[i : i + batch_size] + batch_docs = all_docs[i : i + batch_size] + batch_metas = all_metas[i : i + batch_size] + new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas) + filed += len(batch_ids) + print(f" Re-filed {filed}/{len(all_ids)} drawers...") + except Exception as e: + print(f"\n ERROR during rebuild: {e}") + print(f" Only {filed}/{len(all_ids)} drawers were re-filed.") + if os.path.exists(backup_path): + print(f" Restoring from backup: {backup_path}") + backend.delete_collection(palace_path, COLLECTION_NAME) + shutil.copy2(backup_path, sqlite_path) + print(" Backup restored. Palace is back to pre-repair state.") + else: + print(" No backup available. Re-mine from source files to recover.") + raise print(f"\n Repair complete. {filed} drawers rebuilt.") print(" HNSW index is now clean with cosine distance metric.") From fb1cf53919d70603b464b00383ea0175efca68d1 Mon Sep 17 00:00:00 2001 From: JunghwanNA <70629228+shaun0927@users.noreply.github.com> Date: Thu, 16 Apr 2026 14:04:26 +0900 Subject: [PATCH 2/3] fix: harden repair backup scope and migrate swap rollback - repair.py: define backup_path before the conditional block so it is always in scope when the except handler references it - migrate.py: restore old palace from .old if both os.rename and shutil.move fail during the swap step --- mempalace/migrate.py | 7 +++++-- mempalace/repair.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mempalace/migrate.py b/mempalace/migrate.py index c804211..d56cc13 100644 --- a/mempalace/migrate.py +++ b/mempalace/migrate.py @@ -239,8 +239,11 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False): try: os.rename(temp_palace, palace_path) except OSError: - # os.rename fails across filesystems; fall back to move - shutil.move(temp_palace, palace_path) + try: + shutil.move(temp_palace, palace_path) + except Exception: + os.rename(stale_path, palace_path) + raise shutil.rmtree(stale_path, ignore_errors=True) print("\n Migration complete.") diff --git a/mempalace/repair.py b/mempalace/repair.py index 1b4f271..d391d38 100644 --- a/mempalace/repair.py +++ b/mempalace/repair.py @@ -254,8 +254,8 @@ def rebuild_index(palace_path=None): # Back up ONLY the SQLite database, not the bloated HNSW files sqlite_path = os.path.join(palace_path, "chroma.sqlite3") + backup_path = sqlite_path + ".backup" if os.path.exists(sqlite_path): - backup_path = sqlite_path + ".backup" print(f" Backing up chroma.sqlite3 ({os.path.getsize(sqlite_path) / 1e6:.0f} MB)...") shutil.copy2(sqlite_path, backup_path) print(f" Backup: {backup_path}") From 659cb815ea185b140fd98ef49ef18b3341918cb2 Mon Sep 17 00:00:00 2001 From: shaun0927 <70629228+shaun0927@users.noreply.github.com> Date: Fri, 24 Apr 2026 13:12:10 +0900 Subject: [PATCH 3/3] fix(migrate): harden swap rollback against partial cross-device copy shutil.move() can partially create palace_path before raising, which would trip a bare os.replace(stale_path, palace_path) rollback (dest exists). - Switch the primary swap to os.replace so same-filesystem moves stay atomic - Branch on errno.EXDEV before falling back to shutil.move, so real errors (permissions, EIO) surface instead of silently attempting a slow copy - Extract rollback into _restore_stale_palace which clears any partial destination and, if the restore itself fails, logs both stale_path and palace_path so the operator can recover by hand Adds three regression tests covering clean rollback, partial-copy cleanup, and logged failure on rollback-failure. Flagged by the Qodo reviewer on #935. --- mempalace/migrate.py | 34 +++++++++++++++++++++++--- tests/test_migrate.py | 57 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 5 deletions(-) diff --git a/mempalace/migrate.py b/mempalace/migrate.py index d56cc13..5ad1a5b 100644 --- a/mempalace/migrate.py +++ b/mempalace/migrate.py @@ -16,6 +16,7 @@ Usage: mempalace migrate --dry-run # show what would be migrated """ +import errno import os import shutil import sqlite3 @@ -23,6 +24,26 @@ from collections import defaultdict from datetime import datetime +def _restore_stale_palace(palace_path: str, stale_path: str) -> None: + """Roll back a failed swap. + + shutil.move() can partially create palace_path before raising, which + would make a bare os.replace(stale_path, palace_path) fail (dest exists). + Clear any partial destination first, then restore. Best-effort: if the + restore itself fails, log both paths so the operator can recover by hand. + """ + try: + if os.path.lexists(palace_path): + shutil.rmtree(palace_path, ignore_errors=True) + os.replace(stale_path, palace_path) + except Exception as err: + print( + f" CRITICAL: rollback failed — original palace at {stale_path}, " + f"partial migration data at {palace_path}. Restore manually. " + f"({err})" + ) + + def extract_drawers_from_sqlite(db_path: str) -> list: """Read all drawers directly from ChromaDB's SQLite, bypassing the API. @@ -235,14 +256,19 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False): stale_path = palace_path + ".old" if os.path.exists(stale_path): shutil.rmtree(stale_path) - os.rename(palace_path, stale_path) + os.replace(palace_path, stale_path) try: - os.rename(temp_palace, palace_path) - except OSError: + os.replace(temp_palace, palace_path) + except OSError as e: + # EXDEV = temp lives on a different filesystem; fall back to copy+delete. + # Anything else is a real error — don't mask it with shutil.move. + if getattr(e, "errno", None) != errno.EXDEV: + _restore_stale_palace(palace_path, stale_path) + raise try: shutil.move(temp_palace, palace_path) except Exception: - os.rename(stale_path, palace_path) + _restore_stale_palace(palace_path, stale_path) raise shutil.rmtree(stale_path, ignore_errors=True) diff --git a/tests/test_migrate.py b/tests/test_migrate.py index f7e7d7e..4701048 100644 --- a/tests/test_migrate.py +++ b/tests/test_migrate.py @@ -1,9 +1,10 @@ """Tests for destructive-operation safety in mempalace.migrate.""" +import os from types import SimpleNamespace from unittest.mock import MagicMock, patch -from mempalace.migrate import migrate +from mempalace.migrate import _restore_stale_palace, migrate def test_migrate_requires_palace_database(tmp_path, capsys): @@ -46,3 +47,57 @@ def test_migrate_aborts_without_confirmation(tmp_path, capsys): assert "Aborted." in out mock_copytree.assert_not_called() mock_rmtree.assert_not_called() + + +def test_restore_stale_palace_with_clean_destination(tmp_path): + """Rollback when no partial copy exists at palace_path.""" + palace_path = tmp_path / "palace" + stale_path = tmp_path / "palace.old" + stale_path.mkdir() + (stale_path / "chroma.sqlite3").write_bytes(b"original") + + _restore_stale_palace(str(palace_path), str(stale_path)) + + assert palace_path.is_dir() + assert (palace_path / "chroma.sqlite3").read_bytes() == b"original" + assert not stale_path.exists() + + +def test_restore_stale_palace_clears_partial_copy(tmp_path): + """Rollback must remove a partially-copied palace_path before restoring. + + Simulates the Qodo-reported hazard: shutil.move() began creating + palace_path, then failed. A bare os.replace(stale, palace_path) would + trip on the existing destination; _restore_stale_palace must clear it. + """ + palace_path = tmp_path / "palace" + stale_path = tmp_path / "palace.old" + + stale_path.mkdir() + (stale_path / "chroma.sqlite3").write_bytes(b"original") + + palace_path.mkdir() + (palace_path / "half-copied.bin").write_bytes(b"garbage") + + _restore_stale_palace(str(palace_path), str(stale_path)) + + assert palace_path.is_dir() + assert (palace_path / "chroma.sqlite3").read_bytes() == b"original" + assert not (palace_path / "half-copied.bin").exists() + assert not stale_path.exists() + + +def test_restore_stale_palace_logs_and_swallows_on_failure(tmp_path, capsys): + """If restore itself fails, log both paths — don't raise from rollback.""" + palace_path = tmp_path / "palace" + stale_path = tmp_path / "palace.old" + stale_path.mkdir() + + # Force os.replace to fail deterministically. + with patch("mempalace.migrate.os.replace", side_effect=OSError("boom")): + _restore_stale_palace(str(palace_path), str(stale_path)) + + out = capsys.readouterr().out + assert "CRITICAL" in out + assert os.fspath(palace_path) in out + assert os.fspath(stale_path) in out