Merge pull request #1285 from mjc/hnsw-repair
fix: harden Chroma repair preflight and rollback recovery
This commit is contained in:
+133
-32
@@ -4,7 +4,9 @@ import contextlib
|
|||||||
import datetime as _dt
|
import datetime as _dt
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import pickle
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
from numbers import Integral
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
@@ -490,22 +492,17 @@ def hnsw_capacity_status(palace_path: str, collection_name: str = "mempalace_dra
|
|||||||
divergence_floor = max(_HNSW_DIVERGENCE_FALLBACK_FLOOR, 2 * sync_threshold)
|
divergence_floor = max(_HNSW_DIVERGENCE_FALLBACK_FLOOR, 2 * sync_threshold)
|
||||||
|
|
||||||
if hnsw_count is None:
|
if hnsw_count is None:
|
||||||
# No pickle yet — segment hasn't persisted metadata. Could be
|
# No pickle yet, so this probe cannot measure HNSW capacity.
|
||||||
# fresh-but-unflushed (normal) or interrupted-mid-flush (bad).
|
# Chroma 1.5.x can have binary HNSW files without a flushed
|
||||||
# We can't distinguish without the pickle, so only flag
|
# metadata pickle; absence of the pickle alone is not proof that
|
||||||
# divergence when sqlite holds clearly more than two flush
|
# vector search is unusable or dangerous. Keep the status unknown
|
||||||
# windows worth — same threshold as the with-pickle path.
|
# so MCP does not globally disable vectors on an inconclusive
|
||||||
if sqlite_count > divergence_floor:
|
# signal. Corrupt/invalid metadata, when present, is handled by
|
||||||
out["status"] = "diverged"
|
# quarantine_invalid_hnsw_metadata before Chroma opens.
|
||||||
out["diverged"] = True
|
|
||||||
out["divergence"] = sqlite_count
|
|
||||||
out["message"] = (
|
out["message"] = (
|
||||||
f"sqlite holds {sqlite_count:,} embeddings but the HNSW segment "
|
"HNSW capacity unavailable: metadata has not been flushed; "
|
||||||
"has never flushed metadata — vector search will return nothing "
|
"leaving vector search enabled"
|
||||||
"until the segment is rebuilt. Run `mempalace repair`."
|
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
out["message"] = "HNSW segment metadata not yet flushed; skipping"
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
divergence = sqlite_count - hnsw_count
|
divergence = sqlite_count - hnsw_count
|
||||||
@@ -592,6 +589,97 @@ def _pin_hnsw_threads(collection) -> None:
|
|||||||
_BLOB_FIX_MARKER = ".blob_seq_ids_migrated"
|
_BLOB_FIX_MARKER = ".blob_seq_ids_migrated"
|
||||||
|
|
||||||
|
|
||||||
|
def _valid_dimensionality(value: object) -> bool:
|
||||||
|
return isinstance(value, Integral) and not isinstance(value, bool) and int(value) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def _persisted_metadata_fields(obj: object) -> tuple[object, object]:
|
||||||
|
if isinstance(obj, dict):
|
||||||
|
return obj.get("dimensionality"), obj.get("id_to_label")
|
||||||
|
return getattr(obj, "dimensionality", None), getattr(obj, "id_to_label", None)
|
||||||
|
|
||||||
|
|
||||||
|
def _invalid_hnsw_metadata_reason(persisted: object) -> Optional[str]:
    """Explain why a loaded ``index_metadata.pickle`` payload is unusable.

    Returns a human-readable reason, or ``None`` when the payload passes
    every check.
    """
    recognized = (
        isinstance(persisted, dict)
        or hasattr(persisted, "dimensionality")
        or hasattr(persisted, "id_to_label")
    )
    if not recognized:
        return f"unrecognized index_metadata.pickle payload: {type(persisted).__name__}"

    dimensionality, id_to_label = _persisted_metadata_fields(persisted)
    if id_to_label is not None and not isinstance(id_to_label, dict):
        return f"invalid id_to_label type {type(id_to_label).__name__}"

    if _valid_dimensionality(dimensionality):
        return None
    if id_to_label:
        # Labels without a usable dimensionality is the crash-prone shape.
        return (
            "labels present but dimensionality is missing or invalid "
            f"({dimensionality!r})"
        )
    if dimensionality is not None:
        return f"invalid dimensionality {dimensionality!r}"
    return None


def quarantine_invalid_hnsw_metadata(palace_path: str) -> list[str]:
    """Rename aside segment dirs whose ``index_metadata.pickle`` is unreadable or invalid.

    Chroma's persisted HNSW metadata is untrusted disk state. If a segment has
    labels but no valid positive dimensionality, current Chroma versions can
    accept the pickle and crash later in the Rust loader. The whole segment
    directory is therefore renamed out of the way before ``PersistentClient``
    opens, so Chroma can rebuild cleanly instead of touching known-bad
    metadata.

    Only entries of ``palace_path`` whose name contains ``-``, does not start
    with ``.`` and does not already carry a ``.drift-`` / ``.corrupt-`` marker
    are inspected, and only when they are directories holding an
    ``index_metadata.pickle`` file. ``EOFError`` / ``OSError`` and
    truncation-style ``UnpicklingError`` during the load are treated as a
    transient read and the segment is left alone.

    Args:
        palace_path: Directory that holds the segment directories.

    Returns:
        The ``<segment>.corrupt-<YYYYmmdd-HHMMSS>`` paths that were renamed
        successfully; an empty list when ``palace_path`` cannot be listed.
    """
    try:
        names = os.listdir(palace_path)
    except OSError:
        return []

    transient_msg = "Skipping invalid-HNSW quarantine for transient metadata read in %s"
    quarantined: list[str] = []

    for name in names:
        looks_like_segment = "-" in name and not name.startswith(".")
        already_set_aside = ".drift-" in name or ".corrupt-" in name
        if not looks_like_segment or already_set_aside:
            continue

        seg_dir = os.path.join(palace_path, name)
        meta_path = os.path.join(seg_dir, "index_metadata.pickle")
        if not (os.path.isdir(seg_dir) and os.path.isfile(meta_path)):
            continue

        try:
            persisted = _SafePersistentDataUnpickler.load(meta_path)
        except (EOFError, OSError):
            logger.debug(transient_msg, meta_path, exc_info=True)
            continue
        except pickle.UnpicklingError as exc:
            detail = str(exc).lower()
            if "truncated" in detail or "ran out of input" in detail:
                logger.debug(transient_msg, meta_path, exc_info=True)
                continue
            reason = f"invalid index_metadata.pickle: {exc}"
        except Exception as exc:
            reason = f"invalid index_metadata.pickle: {exc}"
        else:
            reason = _invalid_hnsw_metadata_reason(persisted)

        if reason is None:
            continue

        suffix = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
        target = f"{seg_dir}.corrupt-{suffix}"
        try:
            os.rename(seg_dir, target)
            quarantined.append(target)
            logger.warning("Quarantined invalid HNSW metadata in %s: %s", seg_dir, reason)
        except OSError:
            logger.exception("Failed to quarantine invalid HNSW metadata in %s", seg_dir)

    return quarantined
|
||||||
|
|
||||||
|
|
||||||
def _fix_blob_seq_ids(palace_path: str) -> None:
|
def _fix_blob_seq_ids(palace_path: str) -> None:
|
||||||
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
|
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
|
||||||
|
|
||||||
@@ -1045,6 +1133,13 @@ class ChromaBackend(BaseBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if cached is None or inode_changed or mtime_changed or mtime_appeared:
|
if cached is None or inode_changed or mtime_changed or mtime_appeared:
|
||||||
|
# An inode swap means we are reopening a different physical DB
|
||||||
|
# (post-restore, fresh palace at the same path, etc.); drop the
|
||||||
|
# per-process gate so the quarantine pre-checks run again
|
||||||
|
# against the new disk state instead of trusting cached "we
|
||||||
|
# already cleaned this path" credit from the prior inode.
|
||||||
|
if inode_changed:
|
||||||
|
ChromaBackend._quarantined_paths.discard(palace_path)
|
||||||
ChromaBackend._prepare_palace_for_open(palace_path)
|
ChromaBackend._prepare_palace_for_open(palace_path)
|
||||||
cached = chromadb.PersistentClient(path=palace_path)
|
cached = chromadb.PersistentClient(path=palace_path)
|
||||||
self._clients[palace_path] = cached
|
self._clients[palace_path] = cached
|
||||||
@@ -1058,26 +1153,27 @@ class ChromaBackend(BaseBackend):
|
|||||||
# Public static helpers (legacy; prefer :meth:`get_collection`)
|
# Public static helpers (legacy; prefer :meth:`get_collection`)
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
# Per-process record of palaces that have already had quarantine_stale_hnsw
|
# Per-process record of palaces that have already had the cold-start
|
||||||
# invoked at least once. The proactive drift check is a *cold-start*
|
# quarantine invoked at least once. The proactive HNSW checks are a
|
||||||
# protection — it catches HNSW segments that arrived stale relative to
|
# *cold-start* protection — they catch segments that arrive stale relative
|
||||||
# ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore,
|
# to ``chroma.sqlite3`` or invalid on disk (e.g. cross-machine replication,
|
||||||
# crashed-mid-write). Once a long-running process has opened the palace
|
# partial restore, crashed-mid-write). Once a long-running process has
|
||||||
# cleanly, re-firing on every reconnect is a *runtime thrash*: the
|
# opened the palace cleanly, re-firing the stale check on every reconnect
|
||||||
# daemon's own writes bump sqlite mtime but HNSW flushes batch on
|
# is a *runtime thrash*: the daemon's own writes bump sqlite mtime but HNSW
|
||||||
# chromadb's internal cadence, so the mtime gap naturally exceeds the
|
# flushes batch on chromadb's internal cadence, so the mtime gap naturally
|
||||||
# threshold under steady write load even though nothing is corrupt.
|
# exceeds the threshold under steady write load even though nothing is
|
||||||
|
# corrupt.
|
||||||
# Real runtime drift is still handled — palace-daemon's ``_auto_repair``
|
# Real runtime drift is still handled — palace-daemon's ``_auto_repair``
|
||||||
# calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
|
# calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
|
||||||
# which bypasses this gate.
|
# which bypasses this gate.
|
||||||
#
|
#
|
||||||
# Thread-safety: this set is mutated without a lock. Two concurrent
|
# Thread-safety: this set is mutated without a lock. Two concurrent
|
||||||
# ``make_client()`` calls for the same palace can both pass the
|
# ``make_client()`` calls for the same palace can both pass the
|
||||||
# membership check and both invoke ``quarantine_stale_hnsw``. That's
|
# membership check and both invoke the cold-start quarantine. That's
|
||||||
# safe because the function is idempotent (mtime check + timestamped
|
# safe because the functions are idempotent (mtime checks + timestamped
|
||||||
# rename of distinct directories), so the worst-case race produces
|
# rename of distinct directories), so the worst-case race produces one
|
||||||
# one redundant rename attempt that no-ops. Idempotency is the
|
# redundant rename attempt that no-ops. Idempotency is the safety
|
||||||
# safety property; locking would add cost without correctness gain.
|
# property; locking would add cost without correctness gain.
|
||||||
_quarantined_paths: set[str] = set()
|
_quarantined_paths: set[str] = set()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -1085,12 +1181,16 @@ class ChromaBackend(BaseBackend):
|
|||||||
"""Run the pre-open safety pass shared by :meth:`make_client` and
|
"""Run the pre-open safety pass shared by :meth:`make_client` and
|
||||||
:meth:`_client`.
|
:meth:`_client`.
|
||||||
|
|
||||||
Two steps, both required before constructing a ``PersistentClient``:
|
Three steps, all required before constructing a ``PersistentClient``:
|
||||||
|
|
||||||
1. ``_fix_blob_seq_ids`` — repairs the BLOB seq_id quirk that bites
|
1. ``_fix_blob_seq_ids`` — repairs the BLOB seq_id quirk that bites
|
||||||
certain chromadb migrations.
|
certain chromadb migrations.
|
||||||
2. ``quarantine_stale_hnsw`` — gated by :attr:`_quarantined_paths` so
|
2. ``quarantine_invalid_hnsw_metadata`` — renames aside any HNSW segment
|
||||||
it fires once per palace per process. This is the SIGSEGV
|
directory whose ``index_metadata.pickle`` is unreadable or invalid,
|
||||||
|
so chromadb rebuilds the index instead of crashing on the bad
|
||||||
|
metadata (#1266 / PR #1285).
|
||||||
|
3. ``quarantine_stale_hnsw`` — also gated by :attr:`_quarantined_paths`
|
||||||
|
so it fires once per palace per process. This is the SIGSEGV
|
||||||
prevention path for stale HNSW segments (see #1121, #1132, #1263);
|
prevention path for stale HNSW segments (see #1121, #1132, #1263);
|
||||||
wiring it through this helper means CLI mining, search, repair,
|
wiring it through this helper means CLI mining, search, repair,
|
||||||
and status all benefit, not just the legacy ``make_client``
|
and status all benefit, not just the legacy ``make_client``
|
||||||
@@ -1102,6 +1202,7 @@ class ChromaBackend(BaseBackend):
|
|||||||
"""
|
"""
|
||||||
_fix_blob_seq_ids(palace_path)
|
_fix_blob_seq_ids(palace_path)
|
||||||
if palace_path not in ChromaBackend._quarantined_paths:
|
if palace_path not in ChromaBackend._quarantined_paths:
|
||||||
|
quarantine_invalid_hnsw_metadata(palace_path)
|
||||||
quarantine_stale_hnsw(palace_path)
|
quarantine_stale_hnsw(palace_path)
|
||||||
ChromaBackend._quarantined_paths.add(palace_path)
|
ChromaBackend._quarantined_paths.add(palace_path)
|
||||||
|
|
||||||
@@ -1113,7 +1214,7 @@ class ChromaBackend(BaseBackend):
|
|||||||
own client cache. New code should obtain a collection through
|
own client cache. New code should obtain a collection through
|
||||||
:meth:`get_collection` which manages caching internally.
|
:meth:`get_collection` which manages caching internally.
|
||||||
|
|
||||||
Quarantines stale HNSW segments **once per palace per process**. See
|
Quarantines HNSW segments **once per palace per process**. See
|
||||||
:attr:`_quarantined_paths` for the rationale (cold-start protection
|
:attr:`_quarantined_paths` for the rationale (cold-start protection
|
||||||
vs. runtime thrash on steady-write daemons).
|
vs. runtime thrash on steady-write daemons).
|
||||||
"""
|
"""
|
||||||
|
|||||||
+36
-26
@@ -654,7 +654,14 @@ def cmd_repair(args):
|
|||||||
import shutil
|
import shutil
|
||||||
from .backends.chroma import ChromaBackend
|
from .backends.chroma import ChromaBackend
|
||||||
from .migrate import confirm_destructive_action, contains_palace_database
|
from .migrate import confirm_destructive_action, contains_palace_database
|
||||||
from .repair import TruncationDetected, check_extraction_safety
|
from .repair import (
|
||||||
|
RebuildCollectionError,
|
||||||
|
TruncationDetected,
|
||||||
|
_close_chroma_handles,
|
||||||
|
_extract_drawers,
|
||||||
|
_rebuild_collection_via_temp,
|
||||||
|
check_extraction_safety,
|
||||||
|
)
|
||||||
|
|
||||||
palace_path = os.path.abspath(
|
palace_path = os.path.abspath(
|
||||||
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
|
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
|
||||||
@@ -762,18 +769,7 @@ def cmd_repair(args):
|
|||||||
# Extract all drawers in batches
|
# Extract all drawers in batches
|
||||||
print("\n Extracting drawers...")
|
print("\n Extracting drawers...")
|
||||||
batch_size = 5000
|
batch_size = 5000
|
||||||
all_ids = []
|
all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
|
||||||
all_docs = []
|
|
||||||
all_metas = []
|
|
||||||
offset = 0
|
|
||||||
while offset < total:
|
|
||||||
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
|
|
||||||
if not batch["ids"]:
|
|
||||||
break
|
|
||||||
all_ids.extend(batch["ids"])
|
|
||||||
all_docs.extend(batch["documents"])
|
|
||||||
all_metas.extend(batch["metadatas"])
|
|
||||||
offset += len(batch["ids"])
|
|
||||||
print(f" Extracted {len(all_ids)} drawers")
|
print(f" Extracted {len(all_ids)} drawers")
|
||||||
|
|
||||||
# ── #1208 guard ──────────────────────────────────────────────────
|
# ── #1208 guard ──────────────────────────────────────────────────
|
||||||
@@ -793,7 +789,6 @@ def cmd_repair(args):
|
|||||||
print(e.message)
|
print(e.message)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Backup and rebuild
|
|
||||||
palace_path = os.path.normpath(palace_path)
|
palace_path = os.path.normpath(palace_path)
|
||||||
backup_path = palace_path + ".backup"
|
backup_path = palace_path + ".backup"
|
||||||
if os.path.exists(backup_path):
|
if os.path.exists(backup_path):
|
||||||
@@ -807,18 +802,33 @@ def cmd_repair(args):
|
|||||||
print(f" Backing up to {backup_path}...")
|
print(f" Backing up to {backup_path}...")
|
||||||
shutil.copytree(palace_path, backup_path)
|
shutil.copytree(palace_path, backup_path)
|
||||||
|
|
||||||
print(" Rebuilding collection...")
|
try:
|
||||||
backend.delete_collection(palace_path, "mempalace_drawers")
|
filed = _rebuild_collection_via_temp(
|
||||||
new_col = backend.create_collection(palace_path, "mempalace_drawers")
|
backend,
|
||||||
|
palace_path,
|
||||||
filed = 0
|
all_ids,
|
||||||
for i in range(0, len(all_ids), batch_size):
|
all_docs,
|
||||||
batch_ids = all_ids[i : i + batch_size]
|
all_metas,
|
||||||
batch_docs = all_docs[i : i + batch_size]
|
batch_size,
|
||||||
batch_metas = all_metas[i : i + batch_size]
|
progress=print,
|
||||||
new_col.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
|
)
|
||||||
filed += len(batch_ids)
|
except RebuildCollectionError as e:
|
||||||
print(f" Re-filed {filed}/{len(all_ids)} drawers...")
|
print(f" Repair failed: {e}")
|
||||||
|
if getattr(e, "live_replaced", False):
|
||||||
|
print(" Live collection was already replaced; restoring from backup...")
|
||||||
|
try:
|
||||||
|
_close_chroma_handles(palace_path, backend=backend)
|
||||||
|
if os.path.exists(palace_path):
|
||||||
|
shutil.rmtree(palace_path)
|
||||||
|
shutil.copytree(backup_path, palace_path)
|
||||||
|
print(f" Restore complete from backup: {backup_path}")
|
||||||
|
except Exception as restore_error:
|
||||||
|
print(f" Automatic restore failed: {restore_error}")
|
||||||
|
print(" Manual recovery required:")
|
||||||
|
print(f" 1. Remove or rename the broken directory: {palace_path}")
|
||||||
|
print(f" 2. Restore the backup directory to: {palace_path}")
|
||||||
|
print(f" Backup location: {backup_path}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
print(f"\n Repair complete. {filed} drawers rebuilt.")
|
print(f"\n Repair complete. {filed} drawers rebuilt.")
|
||||||
print(f" Backup saved at {backup_path}")
|
print(f" Backup saved at {backup_path}")
|
||||||
|
|||||||
+136
-31
@@ -38,10 +38,13 @@ from collections import defaultdict
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Iterator, Optional
|
from typing import Iterator, Optional
|
||||||
|
|
||||||
|
from chromadb.errors import NotFoundError as ChromaNotFoundError
|
||||||
|
|
||||||
from .backends.chroma import ChromaBackend, hnsw_capacity_status
|
from .backends.chroma import ChromaBackend, hnsw_capacity_status
|
||||||
|
|
||||||
|
|
||||||
COLLECTION_NAME = "mempalace_drawers"
|
COLLECTION_NAME = "mempalace_drawers"
|
||||||
|
REPAIR_TEMP_COLLECTION = f"{COLLECTION_NAME}__repair_tmp"
|
||||||
|
|
||||||
# The closets collection (AAAK index layer) is intentionally fixed —
|
# The closets collection (AAAK index layer) is intentionally fixed —
|
||||||
# closets reference drawer IDs by string and live alongside drawers in the
|
# closets reference drawer IDs by string and live alongside drawers in the
|
||||||
@@ -125,6 +128,108 @@ def _paginate_ids(col, where=None):
|
|||||||
return ids
|
return ids
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_drawers(col, total: int, batch_size: int):
|
||||||
|
all_ids = []
|
||||||
|
all_docs = []
|
||||||
|
all_metas = []
|
||||||
|
offset = 0
|
||||||
|
while offset < total:
|
||||||
|
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
|
||||||
|
if not batch["ids"]:
|
||||||
|
break
|
||||||
|
all_ids.extend(batch["ids"])
|
||||||
|
all_docs.extend(batch["documents"])
|
||||||
|
all_metas.extend(batch["metadatas"])
|
||||||
|
offset += len(batch["ids"])
|
||||||
|
return all_ids, all_docs, all_metas
|
||||||
|
|
||||||
|
|
||||||
|
def _verify_collection_count(col, expected: int, label: str) -> None:
|
||||||
|
actual = col.count()
|
||||||
|
if actual != expected:
|
||||||
|
raise RuntimeError(f"{label} count mismatch: expected {expected}, got {actual}")
|
||||||
|
|
||||||
|
|
||||||
|
def _is_missing_collection_value_error(exc: ValueError) -> bool:
|
||||||
|
message = str(exc).lower()
|
||||||
|
return "does not exist" in message or "not found" in message
|
||||||
|
|
||||||
|
|
||||||
|
def _delete_collection_if_exists(backend, palace_path: str, collection_name: str) -> None:
|
||||||
|
try:
|
||||||
|
backend.delete_collection(palace_path, collection_name)
|
||||||
|
except ValueError as exc:
|
||||||
|
if _is_missing_collection_value_error(exc):
|
||||||
|
return
|
||||||
|
raise
|
||||||
|
except (FileNotFoundError, ChromaNotFoundError):
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
class RebuildCollectionError(RuntimeError):
    """Failure of a temp-collection rebuild.

    The ``live_replaced`` attribute carries whether the live swap had already
    happened when the rebuild failed.
    """

    def __init__(self, message: str, *, live_replaced: bool) -> None:
        # ``live_replaced`` is keyword-only so call sites stay explicit.
        self.live_replaced = live_replaced
        super().__init__(message)
|
||||||
|
|
||||||
|
|
||||||
|
def _rebuild_collection_via_temp(
    backend,
    palace_path: str,
    all_ids,
    all_docs,
    all_metas,
    batch_size: int,
    progress=print,
) -> int:
    """Rebuild the live drawers collection after a staged dry run.

    The extracted rows are first upserted into ``REPAIR_TEMP_COLLECTION`` and
    its count is verified. Only then is ``COLLECTION_NAME`` deleted,
    recreated, refilled from the same rows and verified. The temporary
    collection is removed on both the success and the failure path; errors
    from that cleanup are ignored.

    Args:
        backend: Object exposing ``delete_collection`` / ``create_collection``.
        palace_path: Palace directory passed through to the backend.
        all_ids: Drawer ids to write.
        all_docs: Documents, index-aligned with ``all_ids``.
        all_metas: Metadatas, index-aligned with ``all_ids``.
        batch_size: Number of rows per ``upsert`` call.
        progress: Callable that receives human-readable progress lines.

    Returns:
        The number of drawers written to the rebuilt live collection.

    Raises:
        RebuildCollectionError: Wraps any ``Exception`` raised along the way;
            its ``live_replaced`` flag is ``True`` once the live collection's
            ``delete_collection`` call has returned.
    """
    expected = len(all_ids)
    temp_name = REPAIR_TEMP_COLLECTION
    live_replaced = False

    def _upsert_in_batches(col, report) -> int:
        # Write every row in ``batch_size`` slices, reporting after each one.
        written = 0
        for start in range(0, expected, batch_size):
            stop = start + batch_size
            chunk_ids = all_ids[start:stop]
            chunk_docs = all_docs[start:stop]
            chunk_metas = all_metas[start:stop]
            col.upsert(documents=chunk_docs, ids=chunk_ids, metadatas=chunk_metas)
            written += len(chunk_ids)
            report(written)
        return written

    def _drop_temp_quietly() -> None:
        # Best-effort cleanup: a leftover temp collection must not mask the
        # real outcome of the rebuild.
        try:
            _delete_collection_if_exists(backend, palace_path, temp_name)
        except Exception:
            pass

    try:
        _delete_collection_if_exists(backend, palace_path, temp_name)

        progress(f" Building temporary collection: {temp_name}")
        temp_col = backend.create_collection(palace_path, temp_name)
        _upsert_in_batches(
            temp_col,
            lambda staged: progress(f" Staged {staged}/{expected} drawers..."),
        )
        _verify_collection_count(temp_col, expected, "temporary rebuild")

        progress(" Rebuilding live collection...")
        backend.delete_collection(palace_path, COLLECTION_NAME)
        # From here on the original live collection is gone.
        live_replaced = True
        new_col = backend.create_collection(palace_path, COLLECTION_NAME)
        rebuilt = _upsert_in_batches(
            new_col,
            lambda refiled: progress(f" Re-filed {refiled}/{expected} drawers..."),
        )
        _verify_collection_count(new_col, expected, "rebuilt live collection")
    except Exception as exc:
        _drop_temp_quietly()
        raise RebuildCollectionError(str(exc), live_replaced=live_replaced) from exc

    _drop_temp_quietly()
    return rebuilt
|
||||||
|
|
||||||
|
|
||||||
def scan_palace(palace_path=None, only_wing=None):
|
def scan_palace(palace_path=None, only_wing=None):
|
||||||
"""Scan the palace for corrupt/unfetchable IDs.
|
"""Scan the palace for corrupt/unfetchable IDs.
|
||||||
|
|
||||||
@@ -415,18 +520,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
|||||||
# Extract all drawers in batches
|
# Extract all drawers in batches
|
||||||
print("\n Extracting drawers...")
|
print("\n Extracting drawers...")
|
||||||
batch_size = 5000
|
batch_size = 5000
|
||||||
all_ids = []
|
all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
|
||||||
all_docs = []
|
|
||||||
all_metas = []
|
|
||||||
offset = 0
|
|
||||||
while offset < total:
|
|
||||||
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
|
|
||||||
if not batch["ids"]:
|
|
||||||
break
|
|
||||||
all_ids.extend(batch["ids"])
|
|
||||||
all_docs.extend(batch["documents"])
|
|
||||||
all_metas.extend(batch["metadatas"])
|
|
||||||
offset += len(batch["ids"])
|
|
||||||
print(f" Extracted {len(all_ids)} drawers")
|
print(f" Extracted {len(all_ids)} drawers")
|
||||||
|
|
||||||
# ── #1208 guard ──────────────────────────────────────────────────
|
# ── #1208 guard ──────────────────────────────────────────────────
|
||||||
@@ -449,28 +543,33 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
|||||||
|
|
||||||
# Rebuild with correct HNSW settings
|
# Rebuild with correct HNSW settings
|
||||||
print(" Rebuilding collection with hnsw:space=cosine...")
|
print(" Rebuilding collection with hnsw:space=cosine...")
|
||||||
backend.delete_collection(palace_path, COLLECTION_NAME)
|
|
||||||
new_col = backend.create_collection(palace_path, COLLECTION_NAME)
|
|
||||||
|
|
||||||
filed = 0
|
|
||||||
try:
|
try:
|
||||||
for i in range(0, len(all_ids), batch_size):
|
filed = _rebuild_collection_via_temp(
|
||||||
batch_ids = all_ids[i : i + batch_size]
|
backend,
|
||||||
batch_docs = all_docs[i : i + batch_size]
|
palace_path,
|
||||||
batch_metas = all_metas[i : i + batch_size]
|
all_ids,
|
||||||
new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
|
all_docs,
|
||||||
filed += len(batch_ids)
|
all_metas,
|
||||||
print(f" Re-filed {filed}/{len(all_ids)} drawers...")
|
batch_size,
|
||||||
except Exception as e:
|
progress=print,
|
||||||
|
)
|
||||||
|
except RebuildCollectionError as e:
|
||||||
print(f"\n ERROR during rebuild: {e}")
|
print(f"\n ERROR during rebuild: {e}")
|
||||||
print(f" Only {filed}/{len(all_ids)} drawers were re-filed.")
|
print(" Rebuild aborted before completion.")
|
||||||
if os.path.exists(backup_path):
|
if e.live_replaced and os.path.exists(backup_path):
|
||||||
print(f" Restoring from backup: {backup_path}")
|
print(f" Restoring from backup: {backup_path}")
|
||||||
backend.delete_collection(palace_path, COLLECTION_NAME)
|
try:
|
||||||
|
_close_chroma_handles(palace_path, backend=backend)
|
||||||
|
_delete_collection_if_exists(backend, palace_path, COLLECTION_NAME)
|
||||||
shutil.copy2(backup_path, sqlite_path)
|
shutil.copy2(backup_path, sqlite_path)
|
||||||
print(" Backup restored. Palace is back to pre-repair state.")
|
print(" Backup restored. Palace is back to pre-repair state.")
|
||||||
else:
|
except Exception as restore_error:
|
||||||
|
print(f" Backup restore failed: {restore_error}")
|
||||||
|
print(f" Manual restore required from: {backup_path}")
|
||||||
|
elif e.live_replaced:
|
||||||
print(" No backup available. Re-mine from source files to recover.")
|
print(" No backup available. Re-mine from source files to recover.")
|
||||||
|
else:
|
||||||
|
print(" Live collection was not replaced; leaving the original palace untouched.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
print(f"\n Repair complete. {filed} drawers rebuilt.")
|
print(f"\n Repair complete. {filed} drawers rebuilt.")
|
||||||
@@ -909,12 +1008,18 @@ def status(palace_path=None) -> dict:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def _close_chroma_handles(palace_path: str) -> None:
|
def _close_chroma_handles(palace_path: str, backend: "ChromaBackend | None" = None) -> None:
|
||||||
"""Drop ChromaBackend + chromadb singleton caches so OS mmap handles release."""
|
"""Drop ChromaBackend + chromadb singleton caches so OS mmap handles release.
|
||||||
|
|
||||||
|
When ``backend`` is provided, close the live instance so rollback/restore
|
||||||
|
releases the handles it was already using. Otherwise fall back to a
|
||||||
|
transient backend instance for the max-seq-id repair path.
|
||||||
|
"""
|
||||||
import gc
|
import gc
|
||||||
|
|
||||||
try:
|
try:
|
||||||
ChromaBackend().close_palace(palace_path)
|
closer = backend if backend is not None else ChromaBackend()
|
||||||
|
closer.close_palace(palace_path)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
|
|||||||
+37
-6
@@ -406,6 +406,31 @@ def _bm25_only_via_sqlite(
|
|||||||
"hint": "Run: mempalace init <dir> && mempalace mine <dir>",
|
"hint": "Run: mempalace init <dir> && mempalace mine <dir>",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
    """Build the ``wing`` / ``room`` metadata filter for a candidate query.

    Args:
        row_id_expr: SQL expression naming the candidate row id that each
            ``EXISTS`` sub-query correlates on. It is interpolated into the
            SQL text, so it must be a trusted, code-supplied expression.

    Returns:
        ``(sql, params)``: ``sql`` holds one ``AND EXISTS (...)`` clause per
        non-empty filter (an empty string when neither is set), and
        ``params`` holds the matching ``key, value`` bind values in order.
    """
    # ``wing`` and ``room`` come from the enclosing scope; falsy values are skipped.
    active = [(key, value) for key, value in (("wing", wing), ("room", room)) if value]
    # The clause text is the same for every filter; only the bound values differ.
    exists_clause = f"""
        AND EXISTS (
        SELECT 1
        FROM embedding_metadata mf
        WHERE mf.id = {row_id_expr}
        AND mf.key = ?
        AND COALESCE(
        mf.string_value,
        CAST(mf.int_value AS TEXT),
        CAST(mf.float_value AS TEXT),
        CAST(mf.bool_value AS TEXT)
        ) = ?
        )
        """
    params = [item for pair in active for item in pair]
    return exists_clause * len(active), params
|
||||||
|
|
||||||
try:
|
try:
|
||||||
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
|
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
|
||||||
except sqlite3.Error as e:
|
except sqlite3.Error as e:
|
||||||
@@ -418,15 +443,17 @@ def _bm25_only_via_sqlite(
|
|||||||
candidate_ids: list[int] = []
|
candidate_ids: list[int] = []
|
||||||
if tokens:
|
if tokens:
|
||||||
fts_query = " OR ".join(tokens)
|
fts_query = " OR ".join(tokens)
|
||||||
|
filter_sql, filter_params = _metadata_filter_sql("embedding_fulltext_search.rowid")
|
||||||
try:
|
try:
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
"""
|
f"""
|
||||||
SELECT rowid
|
SELECT rowid
|
||||||
FROM embedding_fulltext_search
|
FROM embedding_fulltext_search
|
||||||
WHERE embedding_fulltext_search MATCH ?
|
WHERE embedding_fulltext_search MATCH ?
|
||||||
|
{filter_sql}
|
||||||
LIMIT ?
|
LIMIT ?
|
||||||
""",
|
""",
|
||||||
(fts_query, max_candidates),
|
(fts_query, *filter_params, max_candidates),
|
||||||
).fetchall()
|
).fetchall()
|
||||||
candidate_ids = [r[0] for r in rows]
|
candidate_ids = [r[0] for r in rows]
|
||||||
except sqlite3.Error:
|
except sqlite3.Error:
|
||||||
@@ -444,17 +471,19 @@ def _bm25_only_via_sqlite(
|
|||||||
# fall back to ordering by primary-key id and finally to an
|
# fall back to ordering by primary-key id and finally to an
|
||||||
# empty result rather than letting search raise.
|
# empty result rather than letting search raise.
|
||||||
try:
|
try:
|
||||||
|
filter_sql, filter_params = _metadata_filter_sql("e.id")
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
"""
|
f"""
|
||||||
SELECT e.id
|
SELECT e.id
|
||||||
FROM embeddings e
|
FROM embeddings e
|
||||||
JOIN segments s ON e.segment_id = s.id
|
JOIN segments s ON e.segment_id = s.id
|
||||||
JOIN collections c ON s.collection = c.id
|
JOIN collections c ON s.collection = c.id
|
||||||
WHERE c.name = 'mempalace_drawers'
|
WHERE c.name = 'mempalace_drawers'
|
||||||
|
{filter_sql}
|
||||||
ORDER BY e.created_at DESC
|
ORDER BY e.created_at DESC
|
||||||
LIMIT ?
|
LIMIT ?
|
||||||
""",
|
""",
|
||||||
(max_candidates,),
|
(*filter_params, max_candidates),
|
||||||
).fetchall()
|
).fetchall()
|
||||||
candidate_ids = [r[0] for r in rows]
|
candidate_ids = [r[0] for r in rows]
|
||||||
except sqlite3.Error:
|
except sqlite3.Error:
|
||||||
@@ -463,17 +492,19 @@ def _bm25_only_via_sqlite(
|
|||||||
exc_info=True,
|
exc_info=True,
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
|
filter_sql, filter_params = _metadata_filter_sql("e.id")
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
"""
|
f"""
|
||||||
SELECT e.id
|
SELECT e.id
|
||||||
FROM embeddings e
|
FROM embeddings e
|
||||||
JOIN segments s ON e.segment_id = s.id
|
JOIN segments s ON e.segment_id = s.id
|
||||||
JOIN collections c ON s.collection = c.id
|
JOIN collections c ON s.collection = c.id
|
||||||
WHERE c.name = 'mempalace_drawers'
|
WHERE c.name = 'mempalace_drawers'
|
||||||
|
{filter_sql}
|
||||||
ORDER BY e.id DESC
|
ORDER BY e.id DESC
|
||||||
LIMIT ?
|
LIMIT ?
|
||||||
""",
|
""",
|
||||||
(max_candidates,),
|
(*filter_params, max_candidates),
|
||||||
).fetchall()
|
).fetchall()
|
||||||
candidate_ids = [r[0] for r in rows]
|
candidate_ids = [r[0] for r in rows]
|
||||||
except sqlite3.Error:
|
except sqlite3.Error:
|
||||||
|
|||||||
+276
-1
@@ -1,4 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
import pickle
|
||||||
import shutil
|
import shutil
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -19,6 +20,7 @@ from mempalace.backends.chroma import (
|
|||||||
ChromaCollection,
|
ChromaCollection,
|
||||||
_fix_blob_seq_ids,
|
_fix_blob_seq_ids,
|
||||||
_pin_hnsw_threads,
|
_pin_hnsw_threads,
|
||||||
|
quarantine_invalid_hnsw_metadata,
|
||||||
quarantine_stale_hnsw,
|
quarantine_stale_hnsw,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -755,7 +757,10 @@ def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeyp
|
|||||||
"""Quarantine fires on first ``make_client()`` for a palace, then is
|
"""Quarantine fires on first ``make_client()`` for a palace, then is
|
||||||
skipped on subsequent calls — prevents runtime thrash where a daemon's
|
skipped on subsequent calls — prevents runtime thrash where a daemon's
|
||||||
own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes,
|
own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes,
|
||||||
making the mtime heuristic falsely trigger every reconnect."""
|
making the mtime heuristic falsely trigger every reconnect.
|
||||||
|
|
||||||
|
Invalid metadata quarantine shares the same cold-start gate here; the
|
||||||
|
more aggressive refresh path lives in ``_client()``."""
|
||||||
from mempalace.backends.chroma import ChromaBackend
|
from mempalace.backends.chroma import ChromaBackend
|
||||||
|
|
||||||
palace_path = str(tmp_path / "palace")
|
palace_path = str(tmp_path / "palace")
|
||||||
@@ -782,6 +787,34 @@ def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeyp
|
|||||||
], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
|
], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
|
||||||
|
|
||||||
|
|
||||||
|
def test_make_client_gates_invalid_metadata_on_first_call(tmp_path, monkeypatch):
|
||||||
|
"""Invalid metadata quarantine is gated on the first make_client() call."""
|
||||||
|
from mempalace.backends.chroma import ChromaBackend
|
||||||
|
|
||||||
|
palace_path = str(tmp_path / "palace")
|
||||||
|
os.makedirs(palace_path, exist_ok=True)
|
||||||
|
(Path(palace_path) / "chroma.sqlite3").write_text("")
|
||||||
|
|
||||||
|
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
|
||||||
|
|
||||||
|
calls: list[str] = []
|
||||||
|
|
||||||
|
def _invalid(path, *args, **kwargs):
|
||||||
|
calls.append(path)
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _stale(path, stale_seconds=300.0):
|
||||||
|
return []
|
||||||
|
|
||||||
|
monkeypatch.setattr("mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _invalid)
|
||||||
|
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _stale)
|
||||||
|
|
||||||
|
ChromaBackend.make_client(palace_path)
|
||||||
|
ChromaBackend.make_client(palace_path)
|
||||||
|
|
||||||
|
assert calls == [palace_path]
|
||||||
|
|
||||||
|
|
||||||
def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
|
def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
|
||||||
"""Two distinct palaces each get one quarantine attempt — the gate is
|
"""Two distinct palaces each get one quarantine attempt — the gate is
|
||||||
keyed by palace path, not global."""
|
keyed by palace path, not global."""
|
||||||
@@ -919,3 +952,245 @@ def test_get_collection_applies_retrofit_on_existing_palace(tmp_path):
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1
|
assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_invalid_hnsw_metadata_renames_missing_dimensionality(tmp_path):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
seg = palace / "abcd-1234-5678"
|
||||||
|
seg.mkdir()
|
||||||
|
with open(seg / "index_metadata.pickle", "wb") as f:
|
||||||
|
pickle.dump({"dimensionality": None, "id_to_label": {"a": 1}}, f)
|
||||||
|
|
||||||
|
moved = quarantine_invalid_hnsw_metadata(str(palace))
|
||||||
|
|
||||||
|
assert len(moved) == 1
|
||||||
|
assert ".corrupt-" in moved[0]
|
||||||
|
assert not seg.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_invalid_hnsw_metadata_allows_uninitialized_segment(tmp_path):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
seg = palace / "abcd-1234-5678"
|
||||||
|
seg.mkdir()
|
||||||
|
with open(seg / "index_metadata.pickle", "wb") as f:
|
||||||
|
pickle.dump({"dimensionality": None, "id_to_label": {}}, f)
|
||||||
|
|
||||||
|
moved = quarantine_invalid_hnsw_metadata(str(palace))
|
||||||
|
|
||||||
|
assert moved == []
|
||||||
|
assert seg.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_invalid_hnsw_metadata_rejects_non_dict_id_to_label(tmp_path):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
seg = palace / "abcd-1234-5678"
|
||||||
|
seg.mkdir()
|
||||||
|
with open(seg / "index_metadata.pickle", "wb") as f:
|
||||||
|
pickle.dump({"dimensionality": 8, "id_to_label": ["a", "b"]}, f)
|
||||||
|
|
||||||
|
moved = quarantine_invalid_hnsw_metadata(str(palace))
|
||||||
|
|
||||||
|
assert len(moved) == 1
|
||||||
|
assert ".corrupt-" in moved[0]
|
||||||
|
assert not seg.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_invalid_hnsw_metadata_rejects_non_schema_payload(tmp_path):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
seg = palace / "abcd-1234-5678"
|
||||||
|
seg.mkdir()
|
||||||
|
with open(seg / "index_metadata.pickle", "wb") as f:
|
||||||
|
pickle.dump(["not", "a", "metadata", "object"], f)
|
||||||
|
|
||||||
|
moved = quarantine_invalid_hnsw_metadata(str(palace))
|
||||||
|
|
||||||
|
assert len(moved) == 1
|
||||||
|
assert ".corrupt-" in moved[0]
|
||||||
|
assert not seg.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def _dangerous_pickle_payload_executed():
|
||||||
|
raise AssertionError("unsafe pickle payload executed")
|
||||||
|
|
||||||
|
|
||||||
|
class _DangerousPickle:
|
||||||
|
def __reduce__(self):
|
||||||
|
return (_dangerous_pickle_payload_executed, ())
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_invalid_hnsw_metadata_rejects_unsafe_pickle(tmp_path):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
seg = palace / "abcd-1234-5678"
|
||||||
|
seg.mkdir()
|
||||||
|
with open(seg / "index_metadata.pickle", "wb") as f:
|
||||||
|
pickle.dump(_DangerousPickle(), f)
|
||||||
|
|
||||||
|
moved = quarantine_invalid_hnsw_metadata(str(palace))
|
||||||
|
|
||||||
|
assert len(moved) == 1
|
||||||
|
assert ".corrupt-" in moved[0]
|
||||||
|
assert not seg.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_invalid_hnsw_metadata_skips_transient_read_errors(tmp_path, monkeypatch):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
seg = palace / "abcd-1234-5678"
|
||||||
|
seg.mkdir()
|
||||||
|
meta = seg / "index_metadata.pickle"
|
||||||
|
meta.write_bytes(b"partial")
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"mempalace.backends.chroma._SafePersistentDataUnpickler.load",
|
||||||
|
lambda path: (_ for _ in ()).throw(EOFError("flush in progress")),
|
||||||
|
)
|
||||||
|
|
||||||
|
moved = quarantine_invalid_hnsw_metadata(str(palace))
|
||||||
|
|
||||||
|
assert moved == []
|
||||||
|
assert seg.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_invalid_hnsw_metadata_skips_truncated_pickle(tmp_path, monkeypatch):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
seg = palace / "abcd-1234-5678"
|
||||||
|
seg.mkdir()
|
||||||
|
meta = seg / "index_metadata.pickle"
|
||||||
|
meta.write_bytes(b"partial")
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"mempalace.backends.chroma._SafePersistentDataUnpickler.load",
|
||||||
|
lambda path: (_ for _ in ()).throw(pickle.UnpicklingError("pickle data was truncated")),
|
||||||
|
)
|
||||||
|
|
||||||
|
moved = quarantine_invalid_hnsw_metadata(str(palace))
|
||||||
|
|
||||||
|
assert moved == []
|
||||||
|
assert seg.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_chroma_backend_preflights_metadata_before_persistent_client(tmp_path, monkeypatch):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
def _record(name):
|
||||||
|
def inner(path, *args, **kwargs):
|
||||||
|
calls.append((name, path))
|
||||||
|
return [] if name != "blob" else None
|
||||||
|
|
||||||
|
return inner
|
||||||
|
|
||||||
|
monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
|
||||||
|
)
|
||||||
|
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
|
||||||
|
|
||||||
|
class DummyClient:
|
||||||
|
pass
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
|
||||||
|
)
|
||||||
|
|
||||||
|
backend = ChromaBackend()
|
||||||
|
backend._client(str(palace))
|
||||||
|
|
||||||
|
assert calls == [
|
||||||
|
("blob", str(palace)),
|
||||||
|
("invalid", str(palace)),
|
||||||
|
("stale", str(palace)),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_chroma_backend_stale_quarantine_is_cold_start_only_on_refresh(tmp_path, monkeypatch):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
(palace / "chroma.sqlite3").write_text("")
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
def _record(name):
|
||||||
|
def inner(path, *args, **kwargs):
|
||||||
|
calls.append((name, path))
|
||||||
|
return [] if name != "blob" else None
|
||||||
|
|
||||||
|
return inner
|
||||||
|
|
||||||
|
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
|
||||||
|
monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
|
||||||
|
)
|
||||||
|
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
|
||||||
|
|
||||||
|
class DummyClient:
|
||||||
|
pass
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
|
||||||
|
)
|
||||||
|
|
||||||
|
backend = ChromaBackend()
|
||||||
|
stats = iter([(1, 1.0), (1, 1.0), (1, 2.0), (1, 2.0)])
|
||||||
|
monkeypatch.setattr(backend, "_db_stat", lambda path: next(stats))
|
||||||
|
|
||||||
|
backend._client(str(palace))
|
||||||
|
backend._client(str(palace))
|
||||||
|
|
||||||
|
assert calls == [
|
||||||
|
("blob", str(palace)),
|
||||||
|
("invalid", str(palace)),
|
||||||
|
("stale", str(palace)),
|
||||||
|
("blob", str(palace)),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_chroma_backend_requarantines_after_inode_replacement(tmp_path, monkeypatch):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
(palace / "chroma.sqlite3").write_text("")
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
def _record(name):
|
||||||
|
def inner(path, *args, **kwargs):
|
||||||
|
calls.append((name, path))
|
||||||
|
return [] if name != "blob" else None
|
||||||
|
|
||||||
|
return inner
|
||||||
|
|
||||||
|
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
|
||||||
|
monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
|
||||||
|
)
|
||||||
|
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
|
||||||
|
|
||||||
|
class DummyClient:
|
||||||
|
pass
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
|
||||||
|
)
|
||||||
|
|
||||||
|
backend = ChromaBackend()
|
||||||
|
stats = iter([(1, 1.0), (1, 1.0), (2, 2.0), (2, 2.0)])
|
||||||
|
monkeypatch.setattr(backend, "_db_stat", lambda path: next(stats))
|
||||||
|
|
||||||
|
backend._client(str(palace))
|
||||||
|
backend._client(str(palace))
|
||||||
|
|
||||||
|
assert calls == [
|
||||||
|
("blob", str(palace)),
|
||||||
|
("invalid", str(palace)),
|
||||||
|
("stale", str(palace)),
|
||||||
|
("blob", str(palace)),
|
||||||
|
("invalid", str(palace)),
|
||||||
|
("stale", str(palace)),
|
||||||
|
]
|
||||||
|
|||||||
+46
-1
@@ -4,7 +4,7 @@ import argparse
|
|||||||
import shlex
|
import shlex
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, call, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@@ -815,13 +815,58 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
|
|||||||
"documents": ["doc1", "doc2"],
|
"documents": ["doc1", "doc2"],
|
||||||
"metadatas": [{"wing": "a"}, {"wing": "b"}],
|
"metadatas": [{"wing": "a"}, {"wing": "b"}],
|
||||||
}
|
}
|
||||||
|
mock_temp_col = MagicMock()
|
||||||
|
mock_temp_col.count.return_value = 2
|
||||||
mock_new_col = MagicMock()
|
mock_new_col = MagicMock()
|
||||||
|
mock_new_col.count.return_value = 2
|
||||||
mock_backend = _mock_backend_for(col=mock_col, new_col=mock_new_col)
|
mock_backend = _mock_backend_for(col=mock_col, new_col=mock_new_col)
|
||||||
|
mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
|
||||||
with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
|
with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
|
||||||
cmd_repair(args)
|
cmd_repair(args)
|
||||||
out = capsys.readouterr().out
|
out = capsys.readouterr().out
|
||||||
assert "Repair complete" in out
|
assert "Repair complete" in out
|
||||||
assert "2 drawers rebuilt" in out
|
assert "2 drawers rebuilt" in out
|
||||||
|
assert mock_backend.delete_collection.call_args_list == [
|
||||||
|
call(str(palace_dir), "mempalace_drawers__repair_tmp"),
|
||||||
|
call(str(palace_dir), "mempalace_drawers"),
|
||||||
|
call(str(palace_dir), "mempalace_drawers__repair_tmp"),
|
||||||
|
]
|
||||||
|
mock_temp_col.upsert.assert_called_once()
|
||||||
|
mock_new_col.upsert.assert_called_once()
|
||||||
|
mock_new_col.add.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
@patch("mempalace.cli.MempalaceConfig")
|
||||||
|
def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp_path, capsys):
|
||||||
|
palace_dir = tmp_path / "palace"
|
||||||
|
palace_dir.mkdir()
|
||||||
|
(palace_dir / "chroma.sqlite3").write_text("db")
|
||||||
|
mock_config_cls.return_value.palace_path = str(palace_dir)
|
||||||
|
args = argparse.Namespace(palace=None, yes=True)
|
||||||
|
mock_col = MagicMock()
|
||||||
|
mock_col.count.return_value = 2
|
||||||
|
mock_col.get.return_value = {
|
||||||
|
"ids": ["id1", "id2"],
|
||||||
|
"documents": ["doc1", "doc2"],
|
||||||
|
"metadatas": [{"wing": "a"}, {"wing": "b"}],
|
||||||
|
}
|
||||||
|
mock_temp_col = MagicMock()
|
||||||
|
mock_temp_col.count.return_value = 2
|
||||||
|
mock_backend = _mock_backend_for(col=mock_col)
|
||||||
|
mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("live build failed")]
|
||||||
|
with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
|
||||||
|
with pytest.raises(SystemExit) as excinfo:
|
||||||
|
cmd_repair(args)
|
||||||
|
out = capsys.readouterr().out
|
||||||
|
assert excinfo.value.code == 1
|
||||||
|
assert "Repair failed" in out
|
||||||
|
assert "restoring from backup" in out
|
||||||
|
mock_backend.close_palace.assert_called_once_with(str(palace_dir))
|
||||||
|
assert mock_backend.delete_collection.call_args_list == [
|
||||||
|
call(str(palace_dir), "mempalace_drawers__repair_tmp"),
|
||||||
|
call(str(palace_dir), "mempalace_drawers"),
|
||||||
|
call(str(palace_dir), "mempalace_drawers__repair_tmp"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@patch("mempalace.cli.MempalaceConfig")
|
@patch("mempalace.cli.MempalaceConfig")
|
||||||
|
|||||||
+156
-4
@@ -238,14 +238,39 @@ def test_capacity_status_tolerates_flush_lag(tmp_path):
|
|||||||
assert info["status"] == "ok"
|
assert info["status"] == "ok"
|
||||||
|
|
||||||
|
|
||||||
def test_capacity_status_flags_unflushed_with_large_sqlite(tmp_path):
|
def test_capacity_status_does_not_flag_unflushed_with_large_sqlite(tmp_path):
|
||||||
"""No pickle + many sqlite rows is its own divergence signal."""
|
"""No pickle + many sqlite rows is inconclusive, not divergence."""
|
||||||
seg = "seg-noflush"
|
seg = "seg-noflush"
|
||||||
_seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg)
|
_seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg)
|
||||||
info = hnsw_capacity_status(str(tmp_path), COLLECTION)
|
info = hnsw_capacity_status(str(tmp_path), COLLECTION)
|
||||||
assert info["diverged"] is True
|
assert info["diverged"] is False
|
||||||
|
assert info["status"] == "unknown"
|
||||||
|
assert info["divergence"] is None
|
||||||
assert info["hnsw_count"] is None
|
assert info["hnsw_count"] is None
|
||||||
assert "never flushed" in info["message"]
|
assert "capacity unavailable" in info["message"]
|
||||||
|
assert "leaving vector search enabled" in info["message"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_mcp_probe_does_not_disable_vectors_for_unflushed_metadata(tmp_path, monkeypatch):
|
||||||
|
"""The MCP preflight must not route all searches to BM25 on this signal."""
|
||||||
|
from mempalace import mcp_server
|
||||||
|
|
||||||
|
seg = "seg-mcp-noflush"
|
||||||
|
_seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg)
|
||||||
|
|
||||||
|
class _Cfg:
|
||||||
|
palace_path = str(tmp_path)
|
||||||
|
|
||||||
|
monkeypatch.setattr(mcp_server, "_config", _Cfg())
|
||||||
|
monkeypatch.setattr(mcp_server, "_vector_disabled", True)
|
||||||
|
monkeypatch.setattr(mcp_server, "_vector_disabled_reason", "old divergence")
|
||||||
|
|
||||||
|
mcp_server._refresh_vector_disabled_flag()
|
||||||
|
|
||||||
|
assert mcp_server._vector_disabled is False
|
||||||
|
assert mcp_server._vector_disabled_reason == ""
|
||||||
|
assert mcp_server._vector_capacity_status["status"] == "unknown"
|
||||||
|
assert "leaving vector search enabled" in mcp_server._vector_capacity_status["message"]
|
||||||
|
|
||||||
|
|
||||||
def test_capacity_status_quiet_for_empty_palace(tmp_path):
|
def test_capacity_status_quiet_for_empty_palace(tmp_path):
|
||||||
@@ -372,6 +397,17 @@ def _seed_drawers(palace: str, segment_id: str, drawers: list[tuple[str, dict, s
|
|||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _set_drawer_created_at(palace: str, timestamps: dict[int, str]) -> None:
|
||||||
|
db_path = os.path.join(palace, "chroma.sqlite3")
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
try:
|
||||||
|
for emb_id, created_at in timestamps.items():
|
||||||
|
conn.execute("UPDATE embeddings SET created_at = ? WHERE id = ?", (created_at, emb_id))
|
||||||
|
conn.commit()
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def palace_with_drawers(tmp_path):
|
def palace_with_drawers(tmp_path):
|
||||||
seg = "seg-bm25"
|
seg = "seg-bm25"
|
||||||
@@ -417,6 +453,122 @@ def test_bm25_fallback_filters_by_wing(palace_with_drawers):
|
|||||||
assert all(r["wing"] == "design" for r in out["results"])
|
assert all(r["wing"] == "design" for r in out["results"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_bm25_fallback_applies_wing_before_fts_candidate_limit(tmp_path):
|
||||||
|
seg = "seg-bm25-fts-limit"
|
||||||
|
_seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
|
||||||
|
_seed_drawers(
|
||||||
|
str(tmp_path),
|
||||||
|
seg,
|
||||||
|
[
|
||||||
|
(
|
||||||
|
"shared token outside target wing",
|
||||||
|
{"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
|
||||||
|
"d-1",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"shared token inside target wing",
|
||||||
|
{"wing": "project", "room": "diary", "source_file": "/x/project.md"},
|
||||||
|
"d-2",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
out = _bm25_only_via_sqlite("shared token", str(tmp_path), wing="project", max_candidates=1)
|
||||||
|
|
||||||
|
assert out["total_before_filter"] == 1
|
||||||
|
assert len(out["results"]) == 1
|
||||||
|
assert out["results"][0]["wing"] == "project"
|
||||||
|
|
||||||
|
|
||||||
|
def test_bm25_fallback_applies_room_before_fts_candidate_limit(tmp_path):
|
||||||
|
seg = "seg-bm25-room-limit"
|
||||||
|
_seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
|
||||||
|
_seed_drawers(
|
||||||
|
str(tmp_path),
|
||||||
|
seg,
|
||||||
|
[
|
||||||
|
(
|
||||||
|
"shared token wrong room",
|
||||||
|
{"wing": "project", "room": "scratch", "source_file": "/x/scratch.md"},
|
||||||
|
"d-1",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"shared token right room",
|
||||||
|
{"wing": "project", "room": "diary", "source_file": "/x/diary.md"},
|
||||||
|
"d-2",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
out = _bm25_only_via_sqlite(
|
||||||
|
"shared token",
|
||||||
|
str(tmp_path),
|
||||||
|
wing="project",
|
||||||
|
room="diary",
|
||||||
|
max_candidates=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert out["total_before_filter"] == 1
|
||||||
|
assert len(out["results"]) == 1
|
||||||
|
assert out["results"][0]["wing"] == "project"
|
||||||
|
assert out["results"][0]["room"] == "diary"
|
||||||
|
|
||||||
|
|
||||||
|
def test_bm25_fallback_applies_wing_before_recency_candidate_limit(tmp_path):
|
||||||
|
seg = "seg-bm25-recency-limit"
|
||||||
|
_seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
|
||||||
|
_seed_drawers(
|
||||||
|
str(tmp_path),
|
||||||
|
seg,
|
||||||
|
[
|
||||||
|
(
|
||||||
|
"target drawer for short query",
|
||||||
|
{"wing": "project", "room": "diary", "source_file": "/x/project.md"},
|
||||||
|
"d-1",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"newer drawer outside target wing",
|
||||||
|
{"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
|
||||||
|
"d-2",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
_set_drawer_created_at(
|
||||||
|
str(tmp_path),
|
||||||
|
{
|
||||||
|
1: "2026-01-01 00:00:00",
|
||||||
|
2: "2026-02-01 00:00:00",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
out = _bm25_only_via_sqlite("a", str(tmp_path), wing="project", max_candidates=1)
|
||||||
|
|
||||||
|
assert out["total_before_filter"] == 1
|
||||||
|
assert len(out["results"]) == 1
|
||||||
|
assert out["results"][0]["wing"] == "project"
|
||||||
|
|
||||||
|
|
||||||
|
def test_bm25_fallback_returns_empty_when_filtered_wing_has_no_candidates(tmp_path):
|
||||||
|
seg = "seg-bm25-empty-filter"
|
||||||
|
_seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
|
||||||
|
_seed_drawers(
|
||||||
|
str(tmp_path),
|
||||||
|
seg,
|
||||||
|
[
|
||||||
|
(
|
||||||
|
"shared token outside target wing",
|
||||||
|
{"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
|
||||||
|
"d-1",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
out = _bm25_only_via_sqlite("shared token", str(tmp_path), wing="project", max_candidates=1)
|
||||||
|
|
||||||
|
assert out["total_before_filter"] == 0
|
||||||
|
assert out["results"] == []
|
||||||
|
|
||||||
|
|
||||||
def test_bm25_fallback_no_palace(tmp_path):
|
def test_bm25_fallback_no_palace(tmp_path):
|
||||||
out = _bm25_only_via_sqlite("anything", str(tmp_path))
|
out = _bm25_only_via_sqlite("anything", str(tmp_path))
|
||||||
assert "error" in out
|
assert "error" in out
|
||||||
|
|||||||
+312
-7
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, call, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@@ -229,8 +229,11 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
|
|||||||
}
|
}
|
||||||
|
|
||||||
mock_new_col = MagicMock()
|
mock_new_col = MagicMock()
|
||||||
|
mock_new_col.count.return_value = 2
|
||||||
|
mock_temp_col = MagicMock()
|
||||||
|
mock_temp_col.count.return_value = 2
|
||||||
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
|
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
|
||||||
mock_backend.create_collection.return_value = mock_new_col
|
mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
|
||||||
|
|
||||||
repair.rebuild_index(palace_path=str(tmp_path))
|
repair.rebuild_index(palace_path=str(tmp_path))
|
||||||
|
|
||||||
@@ -239,14 +242,74 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
|
|||||||
assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args)
|
assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args)
|
||||||
|
|
||||||
# Verify: deleted and recreated (cosine is the backend default)
|
# Verify: deleted and recreated (cosine is the backend default)
|
||||||
mock_backend.delete_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
|
assert mock_backend.create_collection.call_args_list == [
|
||||||
mock_backend.create_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
|
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
|
||||||
|
call(str(tmp_path), "mempalace_drawers"),
|
||||||
|
]
|
||||||
|
assert mock_backend.delete_collection.call_args_list == [
|
||||||
|
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
|
||||||
|
call(str(tmp_path), "mempalace_drawers"),
|
||||||
|
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
|
||||||
|
]
|
||||||
|
|
||||||
# Verify: used upsert not add
|
# Verify: used upsert not add
|
||||||
|
mock_temp_col.upsert.assert_called_once()
|
||||||
mock_new_col.upsert.assert_called_once()
|
mock_new_col.upsert.assert_called_once()
|
||||||
mock_new_col.add.assert_not_called()
|
mock_new_col.add.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
@patch("mempalace.repair.shutil")
|
||||||
|
@patch("mempalace.repair.ChromaBackend")
|
||||||
|
def test_rebuild_index_ignores_missing_temp_collection_at_start(
|
||||||
|
mock_backend_cls, mock_shutil, tmp_path
|
||||||
|
):
|
||||||
|
sqlite_path = tmp_path / "chroma.sqlite3"
|
||||||
|
sqlite_path.write_text("fake")
|
||||||
|
|
||||||
|
def _fake_copy2(src, dst):
|
||||||
|
with open(dst, "w") as handle:
|
||||||
|
handle.write("backup")
|
||||||
|
|
||||||
|
mock_shutil.copy2.side_effect = _fake_copy2
|
||||||
|
|
||||||
|
mock_col = MagicMock()
|
||||||
|
mock_col.count.return_value = 2
|
||||||
|
mock_col.get.return_value = {
|
||||||
|
"ids": ["id1", "id2"],
|
||||||
|
"documents": ["doc1", "doc2"],
|
||||||
|
"metadatas": [{"wing": "a"}, {"wing": "b"}],
|
||||||
|
}
|
||||||
|
|
||||||
|
mock_new_col = MagicMock()
|
||||||
|
mock_new_col.count.return_value = 2
|
||||||
|
mock_temp_col = MagicMock()
|
||||||
|
mock_temp_col.count.return_value = 2
|
||||||
|
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
|
||||||
|
mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
|
||||||
|
mock_backend.delete_collection.side_effect = [
|
||||||
|
ValueError("Collection [mempalace_drawers__repair_tmp] does not exist"),
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
]
|
||||||
|
|
||||||
|
repair.rebuild_index(palace_path=str(tmp_path))
|
||||||
|
|
||||||
|
assert mock_shutil.copy2.call_count == 1
|
||||||
|
assert mock_backend.delete_collection.call_args_list == [
|
||||||
|
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
|
||||||
|
call(str(tmp_path), "mempalace_drawers"),
|
||||||
|
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_collection_if_exists_reraises_unexpected_value_error():
|
||||||
|
mock_backend = MagicMock()
|
||||||
|
mock_backend.delete_collection.side_effect = ValueError("invalid collection name")
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="invalid collection name"):
|
||||||
|
repair._delete_collection_if_exists(mock_backend, "/palace", "bad/name")
|
||||||
|
|
||||||
|
|
||||||
@patch("mempalace.repair.shutil")
|
@patch("mempalace.repair.shutil")
|
||||||
@patch("mempalace.repair.ChromaBackend")
|
@patch("mempalace.repair.ChromaBackend")
|
||||||
def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path):
|
def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path):
|
||||||
@@ -365,19 +428,261 @@ def test_rebuild_index_proceeds_with_override(mock_backend_cls, mock_shutil, tmp
|
|||||||
},
|
},
|
||||||
{"ids": [], "documents": [], "metadatas": []},
|
{"ids": [], "documents": [], "metadatas": []},
|
||||||
]
|
]
|
||||||
|
mock_temp_col = MagicMock()
|
||||||
|
mock_temp_col.count.return_value = 10_000
|
||||||
mock_new_col = MagicMock()
|
mock_new_col = MagicMock()
|
||||||
|
mock_new_col.count.return_value = 10_000
|
||||||
mock_backend.get_collection.return_value = mock_col
|
mock_backend.get_collection.return_value = mock_col
|
||||||
mock_backend.create_collection.return_value = mock_new_col
|
mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
|
||||||
mock_backend_cls.return_value = mock_backend
|
mock_backend_cls.return_value = mock_backend
|
||||||
|
|
||||||
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
|
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
|
||||||
repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True)
|
repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True)
|
||||||
|
|
||||||
mock_backend.delete_collection.assert_called_once()
|
assert mock_backend.delete_collection.call_count == 3
|
||||||
mock_backend.create_collection.assert_called_once()
|
assert mock_backend.create_collection.call_count == 2
|
||||||
|
mock_temp_col.upsert.assert_called()
|
||||||
mock_new_col.upsert.assert_called()
|
mock_new_col.upsert.assert_called()
|
||||||
|
|
||||||
|
|
||||||
|
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_stage_failure_leaves_live_collection_untouched(
    mock_backend_cls, mock_shutil, tmp_path
):
    """A rebuild that fails during staging must not report the live collection as replaced.

    The source collection reports two rows while the staged temp collection
    reports a count of one. The test expects ``RebuildCollectionError`` with
    ``live_replaced`` false, exactly one ``copy2`` call, and deletes issued
    only against the temp collection name.
    """
    (tmp_path / "chroma.sqlite3").write_text("fake")

    source_rows = {
        "ids": ["id1", "id2"],
        "documents": ["doc1", "doc2"],
        "metadatas": [{"wing": "a"}, {"wing": "b"}],
    }
    source_col = MagicMock(**{"count.return_value": 2, "get.return_value": source_rows})
    # Staged copy reports fewer rows than the source collection.
    staged_col = MagicMock(**{"count.return_value": 1})

    backend = _install_mock_backend(mock_backend_cls, source_col)
    backend.create_collection.return_value = staged_col

    with pytest.raises(repair.RebuildCollectionError) as raised:
        repair.rebuild_index(palace_path=str(tmp_path))

    temp_delete = call(str(tmp_path), "mempalace_drawers__repair_tmp")
    assert raised.value.live_replaced is False
    assert mock_shutil.copy2.call_count == 1
    assert backend.delete_collection.call_args_list == [temp_delete, temp_delete]
|
||||||
|
|
||||||
|
|
||||||
|
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_live_failure_restores_backup(mock_backend_cls, mock_shutil, tmp_path):
    """A failing upsert into the rebuilt live collection should trigger a second ``copy2``.

    Staging succeeds (temp count matches the source), then the second created
    collection raises on ``upsert``. The test expects ``RebuildCollectionError``
    with ``live_replaced`` true, two ``copy2`` calls, the temp/live delete
    sequence issued twice, and ``close_palace`` called only on the first
    backend instance.
    """
    (tmp_path / "chroma.sqlite3").write_text("fake")

    def _fake_copy2(src, dst):
        # Stand-in for shutil.copy2: just materialise the destination file.
        with open(dst, "w") as out_file:
            out_file.write("backup")

    mock_shutil.copy2.side_effect = _fake_copy2

    source_rows = {
        "ids": ["id1", "id2"],
        "documents": ["doc1", "doc2"],
        "metadatas": [{"wing": "a"}, {"wing": "b"}],
    }
    source_col = MagicMock(**{"count.return_value": 2, "get.return_value": source_rows})
    staged_col = MagicMock(**{"count.return_value": 2})
    live_col = MagicMock(**{"upsert.side_effect": RuntimeError("live upsert failed")})

    # First ChromaBackend() instantiation gets the active backend, the second a helper.
    active_backend = MagicMock()
    active_backend.get_collection.return_value = source_col
    active_backend.create_collection.side_effect = [staged_col, live_col]
    helper_backend = MagicMock()
    mock_backend_cls.side_effect = [active_backend, helper_backend]

    with pytest.raises(repair.RebuildCollectionError) as raised:
        repair.rebuild_index(palace_path=str(tmp_path))

    palace = str(tmp_path)
    temp_delete = call(palace, "mempalace_drawers__repair_tmp")
    live_delete = call(palace, "mempalace_drawers")

    assert raised.value.live_replaced is True
    assert mock_shutil.copy2.call_count == 2
    assert active_backend.delete_collection.call_args_list == [
        temp_delete,
        live_delete,
        temp_delete,
        live_delete,
    ]
    active_backend.close_palace.assert_called_once_with(palace)
    helper_backend.close_palace.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_live_delete_missing_still_restores_backup(
    mock_backend_cls, mock_shutil, tmp_path
):
    """A ``ChromaNotFoundError`` on the final live delete must not prevent the restore.

    Creating the second (live) collection raises, and the fourth
    ``delete_collection`` call raises ``ChromaNotFoundError``. The test still
    expects ``RebuildCollectionError`` with ``live_replaced`` true, two
    ``copy2`` calls, and all four deletes attempted in temp/live order.
    """
    (tmp_path / "chroma.sqlite3").write_text("fake")

    def _fake_copy2(src, dst):
        # Stand-in for shutil.copy2: just materialise the destination file.
        with open(dst, "w") as out_file:
            out_file.write("backup")

    mock_shutil.copy2.side_effect = _fake_copy2

    source_rows = {
        "ids": ["id1", "id2"],
        "documents": ["doc1", "doc2"],
        "metadatas": [{"wing": "a"}, {"wing": "b"}],
    }
    source_col = MagicMock(**{"count.return_value": 2, "get.return_value": source_rows})
    staged_col = MagicMock(**{"count.return_value": 2})

    backend = _install_mock_backend(mock_backend_cls, source_col)
    backend.create_collection.side_effect = [staged_col, RuntimeError("create failed")]
    # Three clean deletes, then the last one reports the collection as missing.
    backend.delete_collection.side_effect = [
        None,
        None,
        None,
        repair.ChromaNotFoundError("missing"),
    ]

    with pytest.raises(repair.RebuildCollectionError) as raised:
        repair.rebuild_index(palace_path=str(tmp_path))

    temp_delete = call(str(tmp_path), "mempalace_drawers__repair_tmp")
    live_delete = call(str(tmp_path), "mempalace_drawers")

    assert raised.value.live_replaced is True
    assert mock_shutil.copy2.call_count == 2
    assert backend.delete_collection.call_args_list == [
        temp_delete,
        live_delete,
        temp_delete,
        live_delete,
    ]
|
||||||
|
|
||||||
|
|
||||||
|
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_restore_failure_preserves_original_error(
    mock_backend_cls, mock_shutil, tmp_path, capsys
):
    """When restoring from the backup also fails, the original failure must still surface.

    ``copy2`` succeeds unless its source path ends with ``.backup``, in which
    case it raises ``PermissionError``. The live upsert raises
    ``RuntimeError``. The test expects the restore problem and a
    manual-restore hint on stdout, with the raised ``RebuildCollectionError``
    still carrying the upsert message.
    """
    (tmp_path / "chroma.sqlite3").write_text("fake")

    def _copy2_side_effect(src, dst):
        # Copying *from* the backup simulates a locked sqlite file.
        if str(src).endswith(".backup"):
            raise PermissionError("locked sqlite")
        with open(dst, "w") as out_file:
            out_file.write("backup")

    mock_shutil.copy2.side_effect = _copy2_side_effect

    source_rows = {
        "ids": ["id1", "id2"],
        "documents": ["doc1", "doc2"],
        "metadatas": [{"wing": "a"}, {"wing": "b"}],
    }
    source_col = MagicMock(**{"count.return_value": 2, "get.return_value": source_rows})
    staged_col = MagicMock(**{"count.return_value": 2})
    live_col = MagicMock(**{"upsert.side_effect": RuntimeError("live upsert failed")})

    backend = _install_mock_backend(mock_backend_cls, source_col)
    backend.create_collection.side_effect = [staged_col, live_col]

    with pytest.raises(repair.RebuildCollectionError) as raised:
        repair.rebuild_index(palace_path=str(tmp_path))

    captured_stdout = capsys.readouterr().out
    assert "locked sqlite" in captured_stdout
    assert "Manual restore required" in captured_stdout
    assert "live upsert failed" in str(raised.value)
|
||||||
|
|
||||||
|
|
||||||
|
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_collection_via_temp_keeps_original_error_when_cleanup_fails(
    mock_backend_cls,
):
    """A failing temp-collection cleanup must not mask the live-build error.

    Calls ``_rebuild_collection_via_temp`` directly. Creating the second
    collection raises ``RuntimeError("live build failed")`` and the third
    ``delete_collection`` call raises ``RuntimeError("cleanup failed")``. The
    test expects the raised ``RebuildCollectionError`` to carry the
    live-build message with ``live_replaced`` true.
    """
    source_col = MagicMock(**{"count.return_value": 2})
    staged_col = MagicMock(**{"count.return_value": 2})

    backend = _install_mock_backend(mock_backend_cls, source_col)
    backend.create_collection.side_effect = [staged_col, RuntimeError("live build failed")]
    # Only the final (temp cleanup) delete fails.
    backend.delete_collection.side_effect = [None, None, RuntimeError("cleanup failed")]

    row_ids = ["id1", "id2"]
    row_docs = ["doc1", "doc2"]
    row_metas = [{"wing": "a"}, {"wing": "b"}]

    with pytest.raises(repair.RebuildCollectionError) as raised:
        repair._rebuild_collection_via_temp(
            backend,
            "/palace",
            row_ids,
            row_docs,
            row_metas,
            batch_size=5000,
            progress=lambda *args, **kwargs: None,
        )

    temp_delete = call("/palace", "mempalace_drawers__repair_tmp")
    live_delete = call("/palace", "mempalace_drawers")

    assert "live build failed" in str(raised.value)
    assert raised.value.live_replaced is True
    assert backend.delete_collection.call_args_list == [temp_delete, live_delete, temp_delete]
|
||||||
|
|
||||||
|
|
||||||
|
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_ignores_temp_cleanup_failure_after_success(
    mock_backend_cls, mock_shutil, tmp_path
):
    """A temp-collection cleanup error after a successful rebuild must not propagate.

    Both the staged and the live collection report the expected count, and
    only the third ``delete_collection`` call raises. ``rebuild_index`` is
    expected to return normally with a single ``copy2`` call and the
    temp/live/temp delete sequence.
    """
    (tmp_path / "chroma.sqlite3").write_text("fake")

    def _fake_copy2(src, dst):
        # Stand-in for shutil.copy2: just materialise the destination file.
        with open(dst, "w") as out_file:
            out_file.write("backup")

    mock_shutil.copy2.side_effect = _fake_copy2

    source_rows = {
        "ids": ["id1", "id2"],
        "documents": ["doc1", "doc2"],
        "metadatas": [{"wing": "a"}, {"wing": "b"}],
    }
    source_col = MagicMock(**{"count.return_value": 2, "get.return_value": source_rows})
    staged_col = MagicMock(**{"count.return_value": 2})
    live_col = MagicMock(**{"count.return_value": 2})

    backend = _install_mock_backend(mock_backend_cls, source_col)
    backend.create_collection.side_effect = [staged_col, live_col]
    # Only the final (temp cleanup) delete fails.
    backend.delete_collection.side_effect = [None, None, RuntimeError("cleanup failed")]

    repair.rebuild_index(palace_path=str(tmp_path))

    temp_delete = call(str(tmp_path), "mempalace_drawers__repair_tmp")
    live_delete = call(str(tmp_path), "mempalace_drawers")

    assert mock_shutil.copy2.call_count == 1
    assert backend.delete_collection.call_args_list == [temp_delete, live_delete, temp_delete]
|
||||||
|
|
||||||
|
|
||||||
# ── repair_max_seq_id ─────────────────────────────────────────────────
|
# ── repair_max_seq_id ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user