diff --git a/mempalace/cli.py b/mempalace/cli.py index f2606a4..103c4ae 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -654,7 +654,11 @@ def cmd_repair(args): import shutil from .backends.chroma import ChromaBackend from .migrate import confirm_destructive_action, contains_palace_database - from .repair import TruncationDetected, check_extraction_safety + from .repair import ( + TruncationDetected, + check_extraction_safety, + maybe_repair_poisoned_max_seq_id_before_rebuild, + ) palace_path = os.path.abspath( os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path @@ -679,11 +683,20 @@ def cmd_repair(args): print(f"\n No palace found at {palace_path}") return if not contains_palace_database(palace_path): - print(f"\n No palace database found at {db_path}") + print(f"\n No palace database found at {db_path}") + return + + preflight = maybe_repair_poisoned_max_seq_id_before_rebuild( + palace_path, + backup=getattr(args, "backup", True), + dry_run=getattr(args, "dry_run", False), + assume_yes=getattr(args, "yes", False), + ) + if preflight is not None: return print(f"\n{'=' * 55}") - print(" MemPalace Repair") + print(" MemPalace Repair") print(f"{'=' * 55}\n") print(f" Palace: {palace_path}") diff --git a/mempalace/repair.py b/mempalace/repair.py index 1cd1556..09970de 100644 --- a/mempalace/repair.py +++ b/mempalace/repair.py @@ -330,6 +330,56 @@ def sqlite_drawer_count(palace_path: str) -> "int | None": return None +def maybe_repair_poisoned_max_seq_id_before_rebuild( + palace_path: str, + *, + backup: bool = True, + dry_run: bool = False, + assume_yes: bool = False, +) -> "dict | None": + """Run non-destructive max_seq_id repair before a rebuild if needed. + + A poisoned ``max_seq_id`` row can make Chroma believe it has already + consumed every row in ``embeddings_queue``. Writes then report success + because they land in the queue, but they never become visible in + ``embeddings``. + + If this precise corruption is present, do the narrow bookmark repair and + stop instead of continuing into the legacy rebuild path. The rebuild path + extracts only already-visible embeddings and can discard queued writes. + """ + + db_path = os.path.join(palace_path, "chroma.sqlite3") + if not os.path.isfile(db_path): + return None + + try: + poisoned = _detect_poisoned_max_seq_ids(db_path) + except Exception: + return None + + if not poisoned: + return None + + print("\n Detected poisoned max_seq_id rows before repair rebuild.") + print( + " This can make writes report success while embeddings_queue grows " + "and embeddings stay static." + ) + print(" Running the non-destructive max_seq_id repair instead of rebuilding " "the collection.") + print( + " Queued writes remain in chroma.sqlite3 for Chroma to drain after " + "the bookmark is unpoisoned." + ) + + return repair_max_seq_id( + palace_path, + backup=backup, + dry_run=dry_run, + assume_yes=assume_yes, + ) + + def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False): """Rebuild the HNSW index from scratch. @@ -353,7 +403,14 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False): print(f"\n{'=' * 55}") print(" MemPalace Repair — Index Rebuild") print(f"{'=' * 55}\n") - print(f" Palace: {palace_path}") + print(f" Palace: {palace_path}") + + preflight = maybe_repair_poisoned_max_seq_id_before_rebuild( + palace_path, + assume_yes=True, + ) + if preflight is not None: + return backend = ChromaBackend() try: diff --git a/tests/test_repair.py b/tests/test_repair.py index bc770dd..6f1802f 100644 --- a/tests/test_repair.py +++ b/tests/test_repair.py @@ -682,3 +682,69 @@ def test_max_seq_id_rollback_on_verification_failure(tmp_path, monkeypatch): # A backup file is still present — caller can roll back from it. leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn] assert leftover + + +def test_max_seq_id_preflight_preserves_embeddings_queue(tmp_path): + """#1295: default repair preflight must not drop queued writes.""" + + palace = str(tmp_path / "palace") + seg = _seed_poisoned_max_seq_id( + palace, + drawers_meta_max=102, + closets_meta_max=11, + ) + db_path = os.path.join(palace, "chroma.sqlite3") + + with sqlite3.connect(db_path) as conn: + conn.executemany( + "INSERT INTO embeddings_queue(seq_id, topic, id) VALUES (?, ?, ?)", + [ + (seq_id, "persistent://default/default/mempalace_drawers", f"queued-{seq_id}") + for seq_id in range(103, 123) + ], + ) + conn.commit() + + result = repair.maybe_repair_poisoned_max_seq_id_before_rebuild( + palace, + assume_yes=True, + ) + + assert result is not None + assert result["segment_repaired"] + + with sqlite3.connect(db_path) as conn: + max_seq_rows = dict(conn.execute("SELECT segment_id, seq_id FROM max_seq_id")) + queue_count = conn.execute("SELECT COUNT(*) FROM embeddings_queue").fetchone()[0] + + assert max_seq_rows[seg["drawers_vec"]] == seg["drawers_meta_max"] + assert max_seq_rows[seg["drawers_meta"]] == seg["drawers_meta_max"] + assert max_seq_rows[seg["closets_vec"]] == seg["closets_meta_max"] + assert max_seq_rows[seg["closets_meta"]] == seg["closets_meta_max"] + + # The old legacy rebuild path can discard queued writes. The preflight + # repair must leave them on disk for Chroma to drain after the bookmark is + # unpoisoned. + assert queue_count == 20 + + +def test_rebuild_index_repairs_poisoned_max_seq_id_before_collection_rebuild(tmp_path, capsys): + """A poisoned bookmark should short-circuit before the legacy rebuild path.""" + + palace = str(tmp_path / "palace") + _seed_poisoned_max_seq_id(palace) + + with patch("mempalace.repair.ChromaBackend") as mock_backend: + repair.rebuild_index(palace) + + out = capsys.readouterr().out + backend = mock_backend.return_value + + # repair_max_seq_id may instantiate ChromaBackend to close cached clients + # after editing sqlite directly. That is safe. The important thing is that + # rebuild_index must not continue into the legacy Chroma collection read / + # count / rebuild path after the max_seq_id preflight handles the issue. + backend.get_collection.assert_not_called() + + assert "Detected poisoned max_seq_id rows" in out + assert "non-destructive max_seq_id repair" in out