fix: use configured collection in recovery paths

2026-05-02 00:16:29 -06:00
parent 88493acd0d
commit ec6d2dde01
12 changed files with 369 additions and 53 deletions
@@ -663,8 +663,10 @@ def cmd_repair(args):
        check_extraction_safety,
    )

+    config = MempalaceConfig()
+    collection_name = config.collection_name
    palace_path = os.path.abspath(
-        os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
+        os.path.expanduser(args.palace) if args.palace else config.palace_path
    )

    if getattr(args, "mode", "legacy") == "max-seq-id":
@@ -749,7 +751,7 @@ def cmd_repair(args):

    # Try to read existing drawers
    try:
-        col = backend.get_collection(palace_path, "mempalace_drawers")
+        col = backend.get_collection(palace_path, collection_name)
        total = col.count()
        print(f"  Drawers found: {total}")
    except Exception as e:
@@ -784,6 +786,7 @@ def cmd_repair(args):
            palace_path,
            len(all_ids),
            confirm_truncation_ok=getattr(args, "confirm_truncation_ok", False),
+            collection_name=collection_name,
        )
    except TruncationDetected as e:
        print(e.message)
@@ -810,6 +813,7 @@ def cmd_repair(args):
            all_docs,
            all_metas,
            batch_size,
+            collection_name=collection_name,
            progress=print,
        )
    except RebuildCollectionError as e:
@@ -7,6 +7,7 @@ Priority: env vars > config file (~/.mempalace/config.json) > defaults
 import json
 import os
 import re
+from functools import lru_cache
 from pathlib import Path


@@ -127,6 +128,13 @@ def sanitize_content(value: str, max_length: int = 100_000) -> str:
 DEFAULT_PALACE_PATH = os.path.expanduser("~/.mempalace/palace")
 DEFAULT_COLLECTION_NAME = "mempalace_drawers"

+
+@lru_cache(maxsize=1)
+def get_configured_collection_name() -> str:
+    """Return the configured drawer collection name, memoized after the first lookup."""
+    return MempalaceConfig().collection_name
+
+
 DEFAULT_TOPIC_WINGS = [
    "emotions",
    "consciousness",
@@ -193,7 +193,7 @@ def _refresh_vector_disabled_flag() -> None:
    """
    global _vector_disabled, _vector_disabled_reason, _vector_capacity_status
    try:
-        info = hnsw_capacity_status(_config.palace_path, "mempalace_drawers")
+        info = hnsw_capacity_status(_config.palace_path, _config.collection_name)
    except Exception:
        logger.debug("HNSW capacity probe raised", exc_info=True)
        return
@@ -490,6 +490,7 @@ def _tool_status_via_sqlite() -> dict:
    db_path = os.path.join(_config.palace_path, "chroma.sqlite3")
    if not os.path.isfile(db_path):
        return _no_palace()
+    collection_name = _config.collection_name

    wings: dict = {}
    rooms: dict = {}
@@ -503,8 +504,9 @@ def _tool_status_via_sqlite() -> dict:
                FROM embeddings e
                JOIN segments s ON e.segment_id = s.id
                JOIN collections c ON s.collection = c.id
-                WHERE c.name = 'mempalace_drawers'
-                """
+                WHERE c.name = ?
+                """,
+                (collection_name,),
            ).fetchone()
            total = int(row[0]) if row and row[0] is not None else 0
            for key, target in (("wing", wings), ("room", rooms)):
@@ -515,12 +517,12 @@ def _tool_status_via_sqlite() -> dict:
                    JOIN embeddings e ON em.id = e.id
                    JOIN segments s ON e.segment_id = s.id
                    JOIN collections c ON s.collection = c.id
-                    WHERE c.name = 'mempalace_drawers'
+                    WHERE c.name = ?
                      AND em.key = ?
                      AND em.string_value IS NOT NULL
                    GROUP BY em.string_value
                    """,
-                    (key,),
+                    (collection_name, key),
                ):
                    target[value] = count
        finally:
@@ -720,6 +722,7 @@ def tool_search(
        n_results=limit,
        max_distance=dist,
        vector_disabled=_vector_disabled,
+        collection_name=_config.collection_name,
    )
    if _vector_disabled:
        result["vector_disabled"] = True
@@ -922,8 +925,8 @@ def tool_add_drawer(

    # Idempotency: if the deterministic ID already exists, return success as a no-op.
    try:
-        existing = col.get(ids=[drawer_id])
-        if existing and existing["ids"]:
+        existing = col.get(ids=[drawer_id], include=[])
+        if existing.ids:
            return {"success": True, "reason": "already_exists", "drawer_id": drawer_id}
    except Exception:
        logger.debug("Idempotency pre-check failed for %s", drawer_id, exc_info=True)
@@ -943,6 +946,12 @@ def tool_add_drawer(
                }
            ],
        )
+        inserted = col.get(ids=[drawer_id], include=[])
+        if not inserted.ids:
+            raise RuntimeError(
+                "Drawer write was acknowledged but the new ID is not readable. "
+                "The palace index may be stale; run reconnect or repair."
+            )
        _metadata_cache = None
        logger.info(f"Filed drawer: {drawer_id} → {wing}/{room}")
        return {"success": True, "drawer_id": drawer_id, "wing": wing, "room": room}
@@ -1506,6 +1515,30 @@ def tool_reconnect():
        _palace_db_mtime, \
        _vector_disabled, \
        _vector_disabled_reason
+    from . import palace as palace_module
+
+    close_errors = []
+    try:
+        palace_module._DEFAULT_BACKEND.close_palace(_config.palace_path)
+    except Exception as exc:
+        logger.debug("Failed to close shared palace backend during reconnect", exc_info=True)
+        close_errors.append(f"backend close_palace failed: {exc}")
+    try:
+        from chromadb.api.client import SharedSystemClient
+
+        clear_system_cache = getattr(SharedSystemClient, "clear_system_cache", None)
+        if callable(clear_system_cache):
+            clear_system_cache()
+        else:
+            logger.debug(
+                "SharedSystemClient.clear_system_cache is unavailable; skipping shared Chroma cache clear during reconnect"
+            )
+    except Exception as exc:
+        logger.debug(
+            "Failed to clear Chroma shared system cache during reconnect",
+            exc_info=True,
+        )
+        close_errors.append(f"shared Chroma cache clear failed: {exc}")
    _client_cache = None
    _collection_cache = None
    _palace_db_inode = 0
@@ -1527,12 +1560,24 @@ def tool_reconnect():
    try:
        col = _get_collection()
        if col is None:
-            return {
+            result = {
                "success": False,
                "message": "No palace found after reconnect",
                "drawers": 0,
                "vector_disabled": _vector_disabled,
            }
+            if close_errors:
+                result["error"] = "; ".join(close_errors)
+            return result
+        if close_errors:
+            return {
+                "success": False,
+                "message": "Reconnect reopened the palace but failed to fully reset cached handles",
+                "drawers": col.count(),
+                "vector_disabled": _vector_disabled,
+                "vector_disabled_reason": _vector_disabled_reason,
+                "error": "; ".join(close_errors),
+            }
        return {
            "success": True,
            "message": "Reconnected to palace",
@@ -10,6 +10,7 @@ import logging
 import os
 import re
 import threading
+from typing import Optional

 from .backends.chroma import ChromaBackend

@@ -56,10 +57,14 @@ NORMALIZE_VERSION = 2

 def get_collection(
    palace_path: str,
-    collection_name: str = "mempalace_drawers",
+    collection_name: Optional[str] = None,
    create: bool = True,
 ):
    """Get the palace collection through the backend layer."""
+    if collection_name is None:
+        from .config import get_configured_collection_name
+
+        collection_name = get_configured_collection_name()
    return _DEFAULT_BACKEND.get_collection(
        palace_path,
        collection_name=collection_name,
@@ -181,10 +181,12 @@ def _rebuild_collection_via_temp(
    all_docs,
    all_metas,
    batch_size: int,
+    collection_name: Optional[str] = None,
    progress=print,
 ) -> int:
    expected = len(all_ids)
-    temp_name = REPAIR_TEMP_COLLECTION
+    collection_name = collection_name or _drawers_collection_name()
+    temp_name = f"{collection_name}__repair_tmp"
    live_replaced = False

    try:
@@ -203,9 +205,9 @@ def _rebuild_collection_via_temp(
        _verify_collection_count(temp_col, expected, "temporary rebuild")

        progress("  Rebuilding live collection...")
-        backend.delete_collection(palace_path, COLLECTION_NAME)
+        backend.delete_collection(palace_path, collection_name)
        live_replaced = True
-        new_col = backend.create_collection(palace_path, COLLECTION_NAME)
+        new_col = backend.create_collection(palace_path, collection_name)

        rebuilt = 0
        for i in range(0, expected, batch_size):
@@ -230,7 +232,7 @@ def _rebuild_collection_via_temp(
        raise RebuildCollectionError(str(exc), live_replaced=live_replaced) from exc


-def scan_palace(palace_path=None, only_wing=None):
+def scan_palace(palace_path=None, only_wing=None, collection_name: Optional[str] = None):
    """Scan the palace for corrupt/unfetchable IDs.

    Probes in batches of 100, falls back to per-ID on failure.
@@ -239,14 +241,15 @@ def scan_palace(palace_path=None, only_wing=None):
    Returns (good_set, bad_set).
    """
    palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
    print(f"\n  Palace: {palace_path}")
    print("  Loading...")

-    col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
+    col = ChromaBackend().get_collection(palace_path, collection_name)

    where = {"wing": only_wing} if only_wing else None
    total = col.count()
-    print(f"  Collection: {COLLECTION_NAME}, total: {total:,}")
+    print(f"  Collection: {collection_name}, total: {total:,}")
    if only_wing:
        print(f"  Scanning wing: {only_wing}")

@@ -307,9 +310,10 @@ def scan_palace(palace_path=None, only_wing=None):
    return good_set, bad_set


-def prune_corrupt(palace_path=None, confirm=False):
+def prune_corrupt(palace_path=None, confirm=False, collection_name: Optional[str] = None):
    """Delete corrupt IDs listed in corrupt_ids.txt."""
    palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
    bad_file = os.path.join(palace_path, "corrupt_ids.txt")

    if not os.path.exists(bad_file):
@@ -325,7 +329,7 @@ def prune_corrupt(palace_path=None, confirm=False):
        print("  Re-run with --confirm to actually delete.")
        return

-    col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
+    col = ChromaBackend().get_collection(palace_path, collection_name)
    before = col.count()
    print(f"  Collection size before: {before:,}")

@@ -379,7 +383,10 @@ class TruncationDetected(Exception):


 def check_extraction_safety(
-    palace_path: str, extracted: int, confirm_truncation_ok: bool = False
+    palace_path: str,
+    extracted: int,
+    confirm_truncation_ok: bool = False,
+    collection_name: Optional[str] = None,
 ) -> None:
    """Cross-check that ``extracted`` matches the SQLite ground truth.

@@ -401,7 +408,8 @@ def check_extraction_safety(
    if confirm_truncation_ok:
        return

-    sqlite_count = sqlite_drawer_count(palace_path)
+    collection_name = collection_name or _drawers_collection_name()
+    sqlite_count = sqlite_drawer_count(palace_path, collection_name)
    cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT

    if sqlite_count is not None and sqlite_count > extracted:
@@ -437,7 +445,7 @@ def check_extraction_safety(
        raise TruncationDetected(message, sqlite_count, extracted)


-def sqlite_drawer_count(palace_path: str) -> "int | None":
+def sqlite_drawer_count(palace_path: str, collection_name: Optional[str] = None) -> "int | None":
    """Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection.

    Used as an independent ground-truth check against the chromadb
@@ -449,6 +457,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
    drift, missing tables, locked file). Callers treat ``None`` as
    "unknown" and fall back to the cap-detection check.
    """
+    collection_name = collection_name or _drawers_collection_name()
    sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
    if not os.path.exists(sqlite_path):
        return None
@@ -465,7 +474,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
                JOIN collections c ON s.collection = c.id
                WHERE c.name = ?
                """,
-                (COLLECTION_NAME,),
+                (collection_name,),
            ).fetchone()
            return int(row[0]) if row and row[0] is not None else None
        finally:
@@ -477,7 +486,11 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
        return None


-def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
+def rebuild_index(
+    palace_path=None,
+    confirm_truncation_ok: bool = False,
+    collection_name: Optional[str] = None,
+):
    """Rebuild the HNSW index from scratch.

    1. Extract all drawers via ChromaDB get()
@@ -492,6 +505,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
    (typically only a concern for palaces sized at exactly 10 000 rows).
    """
    palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()

    if not os.path.isdir(palace_path):
        print(f"\n  No palace found at {palace_path}")
@@ -504,7 +518,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):

    backend = ChromaBackend()
    try:
-        col = backend.get_collection(palace_path, COLLECTION_NAME)
+        col = backend.get_collection(palace_path, collection_name)
        total = col.count()
    except Exception as e:
        print(f"  Error reading palace: {e}")
@@ -528,7 +542,12 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
    # short of the SQLite ground truth (or when extraction == chromadb
    # default get() cap and the SQLite check couldn't run).
    try:
-        check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok)
+        check_extraction_safety(
+            palace_path,
+            len(all_ids),
+            confirm_truncation_ok,
+            collection_name=collection_name,
+        )
    except TruncationDetected as e:
        print(e.message)
        return
@@ -551,6 +570,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
            all_docs,
            all_metas,
            batch_size,
+            collection_name=collection_name,
            progress=print,
        )
    except RebuildCollectionError as e:
@@ -560,7 +580,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
            print(f"  Restoring from backup: {backup_path}")
            try:
                _close_chroma_handles(palace_path, backend=backend)
-                _delete_collection_if_exists(backend, palace_path, COLLECTION_NAME)
+                _delete_collection_if_exists(backend, palace_path, collection_name)
                shutil.copy2(backup_path, sqlite_path)
                print("  Backup restored. Palace is back to pre-repair state.")
            except Exception as restore_error:
@@ -950,7 +970,7 @@ def rebuild_from_sqlite(
        backend.close()


-def status(palace_path=None) -> dict:
+def status(palace_path=None, collection_name: Optional[str] = None) -> dict:
    """Read-only health check: compare sqlite vs HNSW element counts.

    Catches the #1222 failure mode where chromadb's HNSW segment freezes
@@ -968,6 +988,7 @@ def status(palace_path=None) -> dict:
    ``status="unknown"`` when no palace exists at the given path.
    """
    palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
    print(f"\n{'=' * 55}")
    print("  MemPalace Repair — Status")
    print(f"{'=' * 55}\n")
@@ -977,8 +998,8 @@ def status(palace_path=None) -> dict:
        print("  No palace found.\n")
        return {"status": "unknown", "message": "no palace at path"}

-    drawers = hnsw_capacity_status(palace_path, "mempalace_drawers")
-    closets = hnsw_capacity_status(palace_path, "mempalace_closets")
+    drawers = hnsw_capacity_status(palace_path, collection_name)
+    closets = hnsw_capacity_status(palace_path, CLOSETS_COLLECTION_NAME)

    for label, info in (("drawers", drawers), ("closets", closets)):
        print(f"\n  [{label}]")
@@ -382,6 +382,7 @@ def _bm25_only_via_sqlite(
    n_results: int = 5,
    max_candidates: int = 500,
    _include_internal: bool = False,
+    collection_name: str = None,
 ) -> dict:
    """BM25-only search reading drawers directly from chroma.sqlite3.

@@ -405,6 +406,10 @@ def _bm25_only_via_sqlite(
            "error": "No palace found",
            "hint": "Run: mempalace init <dir> && mempalace mine <dir>",
        }
+    if collection_name is None:
+        from .config import get_configured_collection_name
+
+        collection_name = get_configured_collection_name()

    def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
        clauses = []
@@ -441,35 +446,43 @@ def _bm25_only_via_sqlite(
        # shorter than 3 chars (trigram tokenizer can't match them).
        tokens = [t for t in _tokenize(query) if len(t) >= 3]
        candidate_ids: list[int] = []
+        use_recency_fallback = not tokens
        if tokens:
            fts_query = " OR ".join(tokens)
            filter_sql, filter_params = _metadata_filter_sql("embedding_fulltext_search.rowid")
            try:
                rows = conn.execute(
                    f"""
-                    SELECT rowid
+                    SELECT embedding_fulltext_search.rowid
                    FROM embedding_fulltext_search
+                    JOIN embeddings e ON e.id = embedding_fulltext_search.rowid
+                    JOIN segments s ON e.segment_id = s.id
+                    JOIN collections c ON s.collection = c.id
                    WHERE embedding_fulltext_search MATCH ?
+                      AND c.name = ?
                    {filter_sql}
                    LIMIT ?
                    """,
-                    (fts_query, *filter_params, max_candidates),
+                    (fts_query, collection_name, *filter_params, max_candidates),
                ).fetchall()
                candidate_ids = [r[0] for r in rows]
            except sqlite3.Error:
                # FTS5 tokenizer mismatch or syntax error — fall through
                # to the recency-window selector below.
                logger.debug("FTS5 MATCH failed; using recency fallback", exc_info=True)
+                use_recency_fallback = True

-        if not candidate_ids:
-            # No FTS hits (or no usable tokens) — pull the most recent
-            # rows for the drawers segment so we can BM25-rank something
-            # rather than return empty-handed. Wrapped in try/except
-            # because the schema may differ on legacy palaces (older
-            # chromadb without ``created_at``, missing ``segments``
-            # rows after partial restore, etc.); on schema mismatch we
-            # fall back to ordering by primary-key id and finally to an
-            # empty result rather than letting search raise.
+        if not candidate_ids and use_recency_fallback:
+            # No usable FTS tokens, or FTS itself failed — pull the most
+            # recent rows for the drawers segment so we can BM25-rank
+            # something rather than return empty-handed. A clean FTS miss
+            # must stay empty, especially after wing/room filtering, because
+            # recency fallback would return unrelated scoped drawers.
+            # Wrapped in try/except because the schema may differ on legacy
+            # palaces (older chromadb without ``created_at``, missing
+            # ``segments`` rows after partial restore, etc.); on schema
+            # mismatch we fall back to ordering by primary-key id and finally
+            # to an empty result rather than letting search raise.
            try:
                filter_sql, filter_params = _metadata_filter_sql("e.id")
                rows = conn.execute(
@@ -478,12 +491,12 @@ def _bm25_only_via_sqlite(
                    FROM embeddings e
                    JOIN segments s ON e.segment_id = s.id
                    JOIN collections c ON s.collection = c.id
-                    WHERE c.name = 'mempalace_drawers'
+                    WHERE c.name = ?
                    {filter_sql}
                    ORDER BY e.created_at DESC
                    LIMIT ?
                    """,
-                    (*filter_params, max_candidates),
+                    (collection_name, *filter_params, max_candidates),
                ).fetchall()
                candidate_ids = [r[0] for r in rows]
            except sqlite3.Error:
@@ -499,12 +512,12 @@ def _bm25_only_via_sqlite(
                        FROM embeddings e
                        JOIN segments s ON e.segment_id = s.id
                        JOIN collections c ON s.collection = c.id
-                        WHERE c.name = 'mempalace_drawers'
+                        WHERE c.name = ?
                        {filter_sql}
                        ORDER BY e.id DESC
                        LIMIT ?
                        """,
-                        (*filter_params, max_candidates),
+                        (collection_name, *filter_params, max_candidates),
                    ).fetchall()
                    candidate_ids = [r[0] for r in rows]
                except sqlite3.Error:
@@ -720,6 +733,7 @@ def search_memories(
    max_distance: float = 0.0,
    vector_disabled: bool = False,
    candidate_strategy: str = "vector",
+    collection_name: str = None,
 ) -> dict:
    """Programmatic search — returns a dict instead of printing.

@@ -770,10 +784,11 @@ def search_memories(
            wing=wing,
            room=room,
            n_results=n_results,
+            collection_name=collection_name,
        )

    try:
-        drawers_col = get_collection(palace_path, create=False)
+        drawers_col = get_collection(palace_path, collection_name=collection_name, create=False)
    except Exception as e:
        logger.error("No palace found at %s: %s", palace_path, e)
        return {