fix: use configured collection in recovery paths

This commit is contained in:
Mika Cohen
2026-05-02 00:16:29 -06:00
committed by Igor Lins e Silva
parent 88493acd0d
commit ec6d2dde01
12 changed files with 369 additions and 53 deletions
+40 -19
View File
@@ -181,10 +181,12 @@ def _rebuild_collection_via_temp(
all_docs,
all_metas,
batch_size: int,
collection_name: Optional[str] = None,
progress=print,
) -> int:
expected = len(all_ids)
temp_name = REPAIR_TEMP_COLLECTION
collection_name = collection_name or _drawers_collection_name()
temp_name = f"{collection_name}__repair_tmp"
live_replaced = False
try:
@@ -203,9 +205,9 @@ def _rebuild_collection_via_temp(
_verify_collection_count(temp_col, expected, "temporary rebuild")
progress(" Rebuilding live collection...")
backend.delete_collection(palace_path, COLLECTION_NAME)
backend.delete_collection(palace_path, collection_name)
live_replaced = True
new_col = backend.create_collection(palace_path, COLLECTION_NAME)
new_col = backend.create_collection(palace_path, collection_name)
rebuilt = 0
for i in range(0, expected, batch_size):
@@ -230,7 +232,7 @@ def _rebuild_collection_via_temp(
raise RebuildCollectionError(str(exc), live_replaced=live_replaced) from exc
def scan_palace(palace_path=None, only_wing=None):
def scan_palace(palace_path=None, only_wing=None, collection_name: Optional[str] = None):
"""Scan the palace for corrupt/unfetchable IDs.
Probes in batches of 100, falls back to per-ID on failure.
@@ -239,14 +241,15 @@ def scan_palace(palace_path=None, only_wing=None):
Returns (good_set, bad_set).
"""
palace_path = palace_path or _get_palace_path()
collection_name = collection_name or _drawers_collection_name()
print(f"\n Palace: {palace_path}")
print(" Loading...")
col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
col = ChromaBackend().get_collection(palace_path, collection_name)
where = {"wing": only_wing} if only_wing else None
total = col.count()
print(f" Collection: {COLLECTION_NAME}, total: {total:,}")
print(f" Collection: {collection_name}, total: {total:,}")
if only_wing:
print(f" Scanning wing: {only_wing}")
@@ -307,9 +310,10 @@ def scan_palace(palace_path=None, only_wing=None):
return good_set, bad_set
def prune_corrupt(palace_path=None, confirm=False):
def prune_corrupt(palace_path=None, confirm=False, collection_name: Optional[str] = None):
"""Delete corrupt IDs listed in corrupt_ids.txt."""
palace_path = palace_path or _get_palace_path()
collection_name = collection_name or _drawers_collection_name()
bad_file = os.path.join(palace_path, "corrupt_ids.txt")
if not os.path.exists(bad_file):
@@ -325,7 +329,7 @@ def prune_corrupt(palace_path=None, confirm=False):
print(" Re-run with --confirm to actually delete.")
return
col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
col = ChromaBackend().get_collection(palace_path, collection_name)
before = col.count()
print(f" Collection size before: {before:,}")
@@ -379,7 +383,10 @@ class TruncationDetected(Exception):
def check_extraction_safety(
palace_path: str, extracted: int, confirm_truncation_ok: bool = False
palace_path: str,
extracted: int,
confirm_truncation_ok: bool = False,
collection_name: Optional[str] = None,
) -> None:
"""Cross-check that ``extracted`` matches the SQLite ground truth.
@@ -401,7 +408,8 @@ def check_extraction_safety(
if confirm_truncation_ok:
return
sqlite_count = sqlite_drawer_count(palace_path)
collection_name = collection_name or _drawers_collection_name()
sqlite_count = sqlite_drawer_count(palace_path, collection_name)
cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT
if sqlite_count is not None and sqlite_count > extracted:
@@ -437,7 +445,7 @@ def check_extraction_safety(
raise TruncationDetected(message, sqlite_count, extracted)
def sqlite_drawer_count(palace_path: str) -> "int | None":
def sqlite_drawer_count(palace_path: str, collection_name: Optional[str] = None) -> "int | None":
"""Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection.
Used as an independent ground-truth check against the chromadb
@@ -449,6 +457,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
drift, missing tables, locked file). Callers treat ``None`` as
"unknown" and fall back to the cap-detection check.
"""
collection_name = collection_name or _drawers_collection_name()
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.exists(sqlite_path):
return None
@@ -465,7 +474,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
JOIN collections c ON s.collection = c.id
WHERE c.name = ?
""",
(COLLECTION_NAME,),
(collection_name,),
).fetchone()
return int(row[0]) if row and row[0] is not None else None
finally:
@@ -477,7 +486,11 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
return None
def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
def rebuild_index(
palace_path=None,
confirm_truncation_ok: bool = False,
collection_name: Optional[str] = None,
):
"""Rebuild the HNSW index from scratch.
1. Extract all drawers via ChromaDB get()
@@ -492,6 +505,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
(typically only a concern for palaces sized at exactly 10 000 rows).
"""
palace_path = palace_path or _get_palace_path()
collection_name = collection_name or _drawers_collection_name()
if not os.path.isdir(palace_path):
print(f"\n No palace found at {palace_path}")
@@ -504,7 +518,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
backend = ChromaBackend()
try:
col = backend.get_collection(palace_path, COLLECTION_NAME)
col = backend.get_collection(palace_path, collection_name)
total = col.count()
except Exception as e:
print(f" Error reading palace: {e}")
@@ -528,7 +542,12 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
# short of the SQLite ground truth (or when extraction == chromadb
# default get() cap and the SQLite check couldn't run).
try:
check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok)
check_extraction_safety(
palace_path,
len(all_ids),
confirm_truncation_ok,
collection_name=collection_name,
)
except TruncationDetected as e:
print(e.message)
return
@@ -551,6 +570,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
all_docs,
all_metas,
batch_size,
collection_name=collection_name,
progress=print,
)
except RebuildCollectionError as e:
@@ -560,7 +580,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
print(f" Restoring from backup: {backup_path}")
try:
_close_chroma_handles(palace_path, backend=backend)
_delete_collection_if_exists(backend, palace_path, COLLECTION_NAME)
_delete_collection_if_exists(backend, palace_path, collection_name)
shutil.copy2(backup_path, sqlite_path)
print(" Backup restored. Palace is back to pre-repair state.")
except Exception as restore_error:
@@ -950,7 +970,7 @@ def rebuild_from_sqlite(
backend.close()
def status(palace_path=None) -> dict:
def status(palace_path=None, collection_name: Optional[str] = None) -> dict:
"""Read-only health check: compare sqlite vs HNSW element counts.
Catches the #1222 failure mode where chromadb's HNSW segment freezes
@@ -968,6 +988,7 @@ def status(palace_path=None) -> dict:
``status="unknown"`` when no palace exists at the given path.
"""
palace_path = palace_path or _get_palace_path()
collection_name = collection_name or _drawers_collection_name()
print(f"\n{'=' * 55}")
print(" MemPalace Repair — Status")
print(f"{'=' * 55}\n")
@@ -977,8 +998,8 @@ def status(palace_path=None) -> dict:
print(" No palace found.\n")
return {"status": "unknown", "message": "no palace at path"}
drawers = hnsw_capacity_status(palace_path, "mempalace_drawers")
closets = hnsw_capacity_status(palace_path, "mempalace_closets")
drawers = hnsw_capacity_status(palace_path, collection_name)
closets = hnsw_capacity_status(palace_path, CLOSETS_COLLECTION_NAME)
for label, info in (("drawers", drawers), ("closets", closets)):
print(f"\n [{label}]")