fix: use configured collection in recovery paths

2026-05-02 00:16:29 -06:00
parent 88493acd0d
commit ec6d2dde01
12 changed files with 369 additions and 53 deletions
@@ -181,10 +181,12 @@ def _rebuild_collection_via_temp(
    all_docs,
    all_metas,
    batch_size: int,
+    collection_name: Optional[str] = None,
    progress=print,
 ) -> int:
    expected = len(all_ids)
-    temp_name = REPAIR_TEMP_COLLECTION
+    collection_name = collection_name or _drawers_collection_name()
+    temp_name = f"{collection_name}__repair_tmp"
    live_replaced = False

    try:
@@ -203,9 +205,9 @@ def _rebuild_collection_via_temp(
        _verify_collection_count(temp_col, expected, "temporary rebuild")

        progress("  Rebuilding live collection...")
-        backend.delete_collection(palace_path, COLLECTION_NAME)
+        backend.delete_collection(palace_path, collection_name)
        live_replaced = True
-        new_col = backend.create_collection(palace_path, COLLECTION_NAME)
+        new_col = backend.create_collection(palace_path, collection_name)

        rebuilt = 0
        for i in range(0, expected, batch_size):
@@ -230,7 +232,7 @@ def _rebuild_collection_via_temp(
        raise RebuildCollectionError(str(exc), live_replaced=live_replaced) from exc


-def scan_palace(palace_path=None, only_wing=None):
+def scan_palace(palace_path=None, only_wing=None, collection_name: Optional[str] = None):
    """Scan the palace for corrupt/unfetchable IDs.

    Probes in batches of 100, falls back to per-ID on failure.
@@ -239,14 +241,15 @@ def scan_palace(palace_path=None, only_wing=None):
    Returns (good_set, bad_set).
    """
    palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
    print(f"\n  Palace: {palace_path}")
    print("  Loading...")

-    col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
+    col = ChromaBackend().get_collection(palace_path, collection_name)

    where = {"wing": only_wing} if only_wing else None
    total = col.count()
-    print(f"  Collection: {COLLECTION_NAME}, total: {total:,}")
+    print(f"  Collection: {collection_name}, total: {total:,}")
    if only_wing:
        print(f"  Scanning wing: {only_wing}")

@@ -307,9 +310,10 @@ def scan_palace(palace_path=None, only_wing=None):
    return good_set, bad_set


-def prune_corrupt(palace_path=None, confirm=False):
+def prune_corrupt(palace_path=None, confirm=False, collection_name: Optional[str] = None):
    """Delete corrupt IDs listed in corrupt_ids.txt."""
    palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
    bad_file = os.path.join(palace_path, "corrupt_ids.txt")

    if not os.path.exists(bad_file):
@@ -325,7 +329,7 @@ def prune_corrupt(palace_path=None, confirm=False):
        print("  Re-run with --confirm to actually delete.")
        return

-    col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
+    col = ChromaBackend().get_collection(palace_path, collection_name)
    before = col.count()
    print(f"  Collection size before: {before:,}")

@@ -379,7 +383,10 @@ class TruncationDetected(Exception):


 def check_extraction_safety(
-    palace_path: str, extracted: int, confirm_truncation_ok: bool = False
+    palace_path: str,
+    extracted: int,
+    confirm_truncation_ok: bool = False,
+    collection_name: Optional[str] = None,
 ) -> None:
    """Cross-check that ``extracted`` matches the SQLite ground truth.

@@ -401,7 +408,8 @@ def check_extraction_safety(
    if confirm_truncation_ok:
        return

-    sqlite_count = sqlite_drawer_count(palace_path)
+    collection_name = collection_name or _drawers_collection_name()
+    sqlite_count = sqlite_drawer_count(palace_path, collection_name)
    cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT

    if sqlite_count is not None and sqlite_count > extracted:
@@ -437,7 +445,7 @@ def check_extraction_safety(
        raise TruncationDetected(message, sqlite_count, extracted)


-def sqlite_drawer_count(palace_path: str) -> "int | None":
+def sqlite_drawer_count(palace_path: str, collection_name: Optional[str] = None) -> "int | None":
    """Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection.

    Used as an independent ground-truth check against the chromadb
@@ -449,6 +457,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
    drift, missing tables, locked file). Callers treat ``None`` as
    "unknown" and fall back to the cap-detection check.
    """
+    collection_name = collection_name or _drawers_collection_name()
    sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
    if not os.path.exists(sqlite_path):
        return None
@@ -465,7 +474,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
                JOIN collections c ON s.collection = c.id
                WHERE c.name = ?
                """,
-                (COLLECTION_NAME,),
+                (collection_name,),
            ).fetchone()
            return int(row[0]) if row and row[0] is not None else None
        finally:
@@ -477,7 +486,11 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
        return None


-def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
+def rebuild_index(
+    palace_path=None,
+    confirm_truncation_ok: bool = False,
+    collection_name: Optional[str] = None,
+):
    """Rebuild the HNSW index from scratch.

    1. Extract all drawers via ChromaDB get()
@@ -492,6 +505,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
    (typically only a concern for palaces sized at exactly 10 000 rows).
    """
    palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()

    if not os.path.isdir(palace_path):
        print(f"\n  No palace found at {palace_path}")
@@ -504,7 +518,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):

    backend = ChromaBackend()
    try:
-        col = backend.get_collection(palace_path, COLLECTION_NAME)
+        col = backend.get_collection(palace_path, collection_name)
        total = col.count()
    except Exception as e:
        print(f"  Error reading palace: {e}")
@@ -528,7 +542,12 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
    # short of the SQLite ground truth (or when extraction == chromadb
    # default get() cap and the SQLite check couldn't run).
    try:
-        check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok)
+        check_extraction_safety(
+            palace_path,
+            len(all_ids),
+            confirm_truncation_ok,
+            collection_name=collection_name,
+        )
    except TruncationDetected as e:
        print(e.message)
        return
@@ -551,6 +570,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
            all_docs,
            all_metas,
            batch_size,
+            collection_name=collection_name,
            progress=print,
        )
    except RebuildCollectionError as e:
@@ -560,7 +580,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
            print(f"  Restoring from backup: {backup_path}")
            try:
                _close_chroma_handles(palace_path, backend=backend)
-                _delete_collection_if_exists(backend, palace_path, COLLECTION_NAME)
+                _delete_collection_if_exists(backend, palace_path, collection_name)
                shutil.copy2(backup_path, sqlite_path)
                print("  Backup restored. Palace is back to pre-repair state.")
            except Exception as restore_error:
@@ -950,7 +970,7 @@ def rebuild_from_sqlite(
        backend.close()


-def status(palace_path=None) -> dict:
+def status(palace_path=None, collection_name: Optional[str] = None) -> dict:
    """Read-only health check: compare sqlite vs HNSW element counts.

    Catches the #1222 failure mode where chromadb's HNSW segment freezes
@@ -968,6 +988,7 @@ def status(palace_path=None) -> dict:
    ``status="unknown"`` when no palace exists at the given path.
    """
    palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
    print(f"\n{'=' * 55}")
    print("  MemPalace Repair — Status")
    print(f"{'=' * 55}\n")
@@ -977,8 +998,8 @@ def status(palace_path=None) -> dict:
        print("  No palace found.\n")
        return {"status": "unknown", "message": "no palace at path"}

-    drawers = hnsw_capacity_status(palace_path, "mempalace_drawers")
-    closets = hnsw_capacity_status(palace_path, "mempalace_closets")
+    drawers = hnsw_capacity_status(palace_path, collection_name)
+    closets = hnsw_capacity_status(palace_path, CLOSETS_COLLECTION_NAME)

    for label, info in (("drawers", drawers), ("closets", closets)):
        print(f"\n  [{label}]")