diff --git a/mempalace/cli.py b/mempalace/cli.py index 374e894..e4ddba0 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -410,6 +410,7 @@ def cmd_repair(args): import shutil from .backends.chroma import ChromaBackend from .migrate import confirm_destructive_action, contains_palace_database + from .repair import TruncationDetected, check_extraction_safety palace_path = os.path.abspath( os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path @@ -466,6 +467,23 @@ def cmd_repair(args): offset += len(batch["ids"]) print(f" Extracted {len(all_ids)} drawers") + # ── #1208 guard ────────────────────────────────────────────────── + # Cross-check against the SQLite ground truth before doing anything + # destructive. Catches the user-reported case where chromadb's + # collection-layer get() silently caps at 10,000 rows even on much + # larger palaces (e.g. after manual HNSW quarantine). Override with + # --confirm-truncation-ok only after independently verifying the + # extraction count is real. + try: + check_extraction_safety( + palace_path, + len(all_ids), + confirm_truncation_ok=getattr(args, "confirm_truncation_ok", False), + ) + except TruncationDetected as e: + print(e.message) + return + # Backup and rebuild palace_path = os.path.normpath(palace_path) backup_path = palace_path + ".backup" @@ -868,10 +886,23 @@ def main(): instructions_sub.add_parser(instr_name, help=f"Output {instr_name} instructions") # repair - sub.add_parser( + p_repair = sub.add_parser( "repair", help="Rebuild palace vector index from stored data (fixes segfaults after corruption)", - ).add_argument("--yes", action="store_true", help="Skip confirmation for destructive changes") + ) + p_repair.add_argument( + "--yes", action="store_true", help="Skip confirmation for destructive changes" + ) + p_repair.add_argument( + "--confirm-truncation-ok", + action="store_true", + help=( + "Override the #1208 safety guard. Required when chromadb's collection-layer " + "extraction returns exactly 10,000 drawers and the SQLite ground-truth check " + "either matches or can't be read. Use only after independently confirming " + "the palace really contains that count." + ), + ) # mcp sub.add_parser( diff --git a/mempalace/repair.py b/mempalace/repair.py index d391d38..a75ce35 100644 --- a/mempalace/repair.py +++ b/mempalace/repair.py @@ -201,13 +201,143 @@ def prune_corrupt(palace_path=None, confirm=False): print(f" Collection size: {before:,} → {after:,}") -def rebuild_index(palace_path=None): +# ChromaDB's ``collection.get()`` enforces an internal default ``limit`` +# of 10 000 rows when the caller does not pass one. We pass an explicit +# ``limit=batch_size`` below, but the underlying segment also caps reads +# during stale/quarantined-HNSW recovery flows: extraction silently stops +# at exactly 10 000 even on palaces with many more rows. Refusing to +# overwrite when this exact value comes back is the simplest signal we +# can detect without depending on chromadb internals. +CHROMADB_DEFAULT_GET_LIMIT = 10_000 + + +class TruncationDetected(Exception): + """Raised by :func:`check_extraction_safety` when extraction looks short. + + Carries the human-readable abort message so callers (CLI ``cmd_repair``, + ``rebuild_index``) can print and exit consistently without re-deriving + the wording. + """ + + def __init__(self, message: str, sqlite_count: "int | None", extracted: int): + super().__init__(message) + self.message = message + self.sqlite_count = sqlite_count + self.extracted = extracted + + +def check_extraction_safety( + palace_path: str, extracted: int, confirm_truncation_ok: bool = False +) -> None: + """Cross-check that ``extracted`` matches the SQLite ground truth. + + Two signals trip the guard: + + 1. **Strong** — ``chroma.sqlite3`` reports more drawers than were + extracted. This is the user-reported #1208 case: 67 580 on disk, + 10 000 came back through the chromadb collection layer, repair + would have destroyed the difference. + 2. **Weak** — extracted count equals exactly ``CHROMADB_DEFAULT_GET_LIMIT`` + AND the SQLite check couldn't run (schema drift, locked file). + Hitting the chromadb default ``get()`` cap exactly is suspicious + enough to refuse without explicit acknowledgement. + + Raises :class:`TruncationDetected` with a printable message when the + guard fires. Does nothing on safe extractions or when + ``confirm_truncation_ok`` is set. + """ + if confirm_truncation_ok: + return + + sqlite_count = sqlite_drawer_count(palace_path) + cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT + + if sqlite_count is not None and sqlite_count > extracted: + loss = sqlite_count - extracted + pct = 100 * loss / sqlite_count + message = ( + f"\n ABORT: chroma.sqlite3 reports {sqlite_count:,} drawers but only {extracted:,}\n" + " came back through the chromadb collection layer. The segment metadata is\n" + " stale (often after manual HNSW quarantine) — proceeding would silently\n" + f" destroy {loss:,} drawers (~{pct:.0f}%).\n" + "\n" + " Recovery options:\n" + " 1. Restore from your most recent palace backup, then re-mine.\n" + " 2. Direct-extract from chroma.sqlite3 (rows are still on disk) and\n" + " rebuild the palace from source files.\n" + " 3. If you have independently confirmed the palace really contains only\n" + f" {extracted:,} drawers, re-run with --confirm-truncation-ok.\n" + ) + raise TruncationDetected(message, sqlite_count, extracted) + + if cap_signal and sqlite_count is None: + message = ( + f"\n ABORT: extracted exactly {CHROMADB_DEFAULT_GET_LIMIT:,} drawers, which matches\n" + " ChromaDB's internal default get() limit. The on-disk SQLite count couldn't\n" + " be cross-checked from this Python context, so we can't tell whether the\n" + f" palace genuinely holds {CHROMADB_DEFAULT_GET_LIMIT:,} rows or whether extraction was\n" + " silently capped. Refusing to overwrite the palace.\n" + "\n" + " If you have independently confirmed (e.g. via direct sqlite3 query) that\n" + f" the palace really contains exactly {CHROMADB_DEFAULT_GET_LIMIT:,} drawers, re-run with\n" + " --confirm-truncation-ok.\n" + ) + raise TruncationDetected(message, sqlite_count, extracted) + + +def sqlite_drawer_count(palace_path: str) -> "int | None": + """Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection. + + Used as an independent ground-truth check against the chromadb + collection-layer ``count()`` / ``get()``: when the on-disk SQLite + row count exceeds the extraction count, the segment metadata is + stale and repair would destroy the difference. + + Returns ``None`` when the schema isn't readable (chromadb version + drift, missing tables, locked file). Callers treat ``None`` as + "unknown" and fall back to the cap-detection check. + """ + sqlite_path = os.path.join(palace_path, "chroma.sqlite3") + if not os.path.exists(sqlite_path): + return None + try: + import sqlite3 + + conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True) + try: + row = conn.execute( + """ + SELECT COUNT(*) + FROM embeddings e + JOIN segments s ON e.segment_id = s.id + JOIN collections c ON s.collection = c.id + WHERE c.name = ? + """, + (COLLECTION_NAME,), + ).fetchone() + return int(row[0]) if row and row[0] is not None else None + finally: + conn.close() + except Exception: + # chromadb schema differs by version (segments / collections column + # names occasionally rename). Silent fallback is correct here — + # the cap-detection check still catches the user-reported case. + return None + + +def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False): """Rebuild the HNSW index from scratch. 1. Extract all drawers via ChromaDB get() - 2. Back up ONLY chroma.sqlite3 (not the bloated HNSW files) - 3. Delete and recreate the collection with hnsw:space=cosine - 4. Upsert all drawers back + 2. Cross-check against the SQLite ground truth (#1208 guard) + 3. Back up ONLY chroma.sqlite3 (not the bloated HNSW files) + 4. Delete and recreate the collection with hnsw:space=cosine + 5. Upsert all drawers back + + ``confirm_truncation_ok`` overrides the safety guard from step 2. + Set to ``True`` only when you have independently verified that the + palace genuinely contains exactly the extracted number of drawers + (typically only a concern for palaces sized at exactly 10 000 rows). """ palace_path = palace_path or _get_palace_path() @@ -252,6 +382,16 @@ def rebuild_index(palace_path=None): offset += len(batch["ids"]) print(f" Extracted {len(all_ids)} drawers") + # ── #1208 guard ────────────────────────────────────────────────── + # Refuse to ``delete_collection`` + rebuild when extraction looks + # short of the SQLite ground truth (or when extraction == chromadb + # default get() cap and the SQLite check couldn't run). + try: + check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok) + except TruncationDetected as e: + print(e.message) + return + # Back up ONLY the SQLite database, not the bloated HNSW files sqlite_path = os.path.join(palace_path, "chroma.sqlite3") backup_path = sqlite_path + ".backup" diff --git a/tests/test_repair.py b/tests/test_repair.py index 9ae1812..00bcb02 100644 --- a/tests/test_repair.py +++ b/tests/test_repair.py @@ -254,3 +254,123 @@ def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path): repair.rebuild_index(palace_path=str(tmp_path)) mock_backend.delete_collection.assert_not_called() + + +# ── #1208 truncation safety ─────────────────────────────────────────── + + +def test_check_extraction_safety_passes_when_counts_match(tmp_path): + """SQLite reports same count as extracted → no exception.""" + with patch("mempalace.repair.sqlite_drawer_count", return_value=500): + repair.check_extraction_safety(str(tmp_path), 500) + + +def test_check_extraction_safety_passes_when_sqlite_unreadable_and_under_cap(tmp_path): + """SQLite check fails (None) but extraction is well under the cap → safe.""" + with patch("mempalace.repair.sqlite_drawer_count", return_value=None): + repair.check_extraction_safety(str(tmp_path), 5_000) + + +def test_check_extraction_safety_aborts_when_sqlite_higher(tmp_path): + """SQLite reports more than extracted — the user-reported #1208 case.""" + with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580): + try: + repair.check_extraction_safety(str(tmp_path), 10_000) + except repair.TruncationDetected as e: + assert e.sqlite_count == 67_580 + assert e.extracted == 10_000 + assert "67,580" in e.message + assert "10,000" in e.message + assert "57,580" in e.message # the loss number + else: + raise AssertionError("expected TruncationDetected") + + +def test_check_extraction_safety_aborts_when_unreadable_and_at_cap(tmp_path): + """SQLite unreadable but extraction == default get() cap → suspicious.""" + with patch("mempalace.repair.sqlite_drawer_count", return_value=None): + try: + repair.check_extraction_safety(str(tmp_path), repair.CHROMADB_DEFAULT_GET_LIMIT) + except repair.TruncationDetected as e: + assert e.sqlite_count is None + assert e.extracted == repair.CHROMADB_DEFAULT_GET_LIMIT + assert "10,000" in e.message + else: + raise AssertionError("expected TruncationDetected") + + +def test_check_extraction_safety_override_skips_check(tmp_path): + """``confirm_truncation_ok=True`` short-circuits both signals.""" + with patch("mempalace.repair.sqlite_drawer_count", return_value=99_999): + # Would normally abort — override allows through + repair.check_extraction_safety(str(tmp_path), 10_000, confirm_truncation_ok=True) + + +def test_sqlite_drawer_count_returns_none_on_missing_file(tmp_path): + """Palace dir exists but no chroma.sqlite3 → None, not crash.""" + assert repair.sqlite_drawer_count(str(tmp_path)) is None + + +def test_sqlite_drawer_count_returns_none_on_unreadable_schema(tmp_path): + """File exists but isn't a chromadb sqlite → None, not crash.""" + sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3") + with open(sqlite_path, "wb") as f: + f.write(b"not a sqlite file at all") + assert repair.sqlite_drawer_count(str(tmp_path)) is None + + +@patch("mempalace.repair.shutil") +@patch("mempalace.repair.ChromaBackend") +def test_rebuild_index_aborts_on_truncation_signal(mock_backend_cls, mock_shutil, tmp_path): + """rebuild_index honors the safety guard: SQLite says 67k, get() returns + 10k → no delete_collection, no upsert, no backup.""" + mock_backend = MagicMock() + mock_col = MagicMock() + mock_col.count.return_value = 10_000 + # Single page comes back with 10_000 ids + mock_col.get.side_effect = [ + { + "ids": [f"id{i}" for i in range(10_000)], + "documents": ["x"] * 10_000, + "metadatas": [{}] * 10_000, + }, + {"ids": [], "documents": [], "metadatas": []}, + ] + mock_backend.get_collection.return_value = mock_col + mock_backend_cls.return_value = mock_backend + + with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580): + repair.rebuild_index(palace_path=str(tmp_path)) + + # Guard fired: nothing destructive happened + mock_backend.delete_collection.assert_not_called() + mock_backend.create_collection.assert_not_called() + mock_shutil.copy2.assert_not_called() + + +@patch("mempalace.repair.shutil") +@patch("mempalace.repair.ChromaBackend") +def test_rebuild_index_proceeds_with_override(mock_backend_cls, mock_shutil, tmp_path): + """Override flag lets repair proceed even when the guard would fire.""" + mock_backend = MagicMock() + mock_col = MagicMock() + mock_col.count.return_value = 10_000 + mock_col.get.side_effect = [ + { + "ids": [f"id{i}" for i in range(10_000)], + "documents": ["x"] * 10_000, + "metadatas": [{}] * 10_000, + }, + {"ids": [], "documents": [], "metadatas": []}, + ] + mock_new_col = MagicMock() + mock_backend.get_collection.return_value = mock_col + mock_backend.create_collection.return_value = mock_new_col + mock_backend_cls.return_value = mock_backend + + with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580): + repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True) + + mock_backend.delete_collection.assert_called_once() + mock_backend.create_collection.assert_called_once() + mock_new_col.upsert.assert_called()