Merge pull request #1357 from fatkobra/fix/1295-repair-max-seq-id-preflight
fix(repair): preflight poisoned max_seq_id before rebuild (#1295)
This commit is contained in:
@@ -661,6 +661,7 @@ def cmd_repair(args):
|
||||
_extract_drawers,
|
||||
_rebuild_collection_via_temp,
|
||||
check_extraction_safety,
|
||||
maybe_repair_poisoned_max_seq_id_before_rebuild,
|
||||
)
|
||||
|
||||
config = MempalaceConfig()
|
||||
@@ -742,6 +743,15 @@ def cmd_repair(args):
|
||||
print(f"\n No palace database found at {db_path}")
|
||||
return
|
||||
|
||||
preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
|
||||
palace_path,
|
||||
backup=getattr(args, "backup", True),
|
||||
dry_run=getattr(args, "dry_run", False),
|
||||
assume_yes=getattr(args, "yes", False),
|
||||
)
|
||||
if preflight is not None:
|
||||
return
|
||||
|
||||
print(f"\n{'=' * 55}")
|
||||
print(" MemPalace Repair")
|
||||
print(f"{'=' * 55}\n")
|
||||
|
||||
@@ -486,6 +486,58 @@ def sqlite_drawer_count(palace_path: str, collection_name: Optional[str] = None)
|
||||
return None
|
||||
|
||||
|
||||
def maybe_repair_poisoned_max_seq_id_before_rebuild(
|
||||
palace_path: str,
|
||||
*,
|
||||
backup: bool = True,
|
||||
dry_run: bool = False,
|
||||
assume_yes: bool = False,
|
||||
) -> "dict | None":
|
||||
"""Run non-destructive max_seq_id repair before a rebuild if needed.
|
||||
|
||||
A poisoned ``max_seq_id`` row can make Chroma believe it has already
|
||||
consumed every row in ``embeddings_queue``. Writes then report success
|
||||
because they land in the queue, but they never become visible in
|
||||
``embeddings``.
|
||||
|
||||
If this precise corruption is present, do the narrow bookmark repair and
|
||||
stop instead of continuing into the legacy rebuild path. The rebuild path
|
||||
extracts only already-visible embeddings and can discard queued writes.
|
||||
"""
|
||||
|
||||
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||
if not os.path.isfile(db_path):
|
||||
return None
|
||||
|
||||
try:
|
||||
poisoned = _detect_poisoned_max_seq_ids(db_path)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
if not poisoned:
|
||||
return None
|
||||
|
||||
print("\n Detected poisoned max_seq_id rows before repair rebuild.")
|
||||
print(
|
||||
" This can make writes report success while embeddings_queue grows "
|
||||
"and embeddings stay static."
|
||||
)
|
||||
print(
|
||||
" Running the non-destructive max_seq_id repair instead of rebuilding " "the collection."
|
||||
)
|
||||
print(
|
||||
" Queued writes remain in chroma.sqlite3 for Chroma to drain after "
|
||||
"the bookmark is unpoisoned."
|
||||
)
|
||||
|
||||
return repair_max_seq_id(
|
||||
palace_path,
|
||||
backup=backup,
|
||||
dry_run=dry_run,
|
||||
assume_yes=assume_yes,
|
||||
)
|
||||
|
||||
|
||||
def rebuild_index(
|
||||
palace_path=None,
|
||||
confirm_truncation_ok: bool = False,
|
||||
@@ -516,6 +568,13 @@ def rebuild_index(
|
||||
print(f"{'=' * 55}\n")
|
||||
print(f" Palace: {palace_path}")
|
||||
|
||||
preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
|
||||
palace_path,
|
||||
assume_yes=True,
|
||||
)
|
||||
if preflight is not None:
|
||||
return
|
||||
|
||||
backend = ChromaBackend()
|
||||
try:
|
||||
col = backend.get_collection(palace_path, collection_name)
|
||||
|
||||
@@ -1081,6 +1081,72 @@ def test_max_seq_id_rollback_on_verification_failure(tmp_path, monkeypatch):
|
||||
assert leftover
|
||||
|
||||
|
||||
def test_max_seq_id_preflight_preserves_embeddings_queue(tmp_path):
|
||||
"""#1295: default repair preflight must not drop queued writes."""
|
||||
|
||||
palace = str(tmp_path / "palace")
|
||||
seg = _seed_poisoned_max_seq_id(
|
||||
palace,
|
||||
drawers_meta_max=102,
|
||||
closets_meta_max=11,
|
||||
)
|
||||
db_path = os.path.join(palace, "chroma.sqlite3")
|
||||
|
||||
with sqlite3.connect(db_path) as conn:
|
||||
conn.executemany(
|
||||
"INSERT INTO embeddings_queue(seq_id, topic, id) VALUES (?, ?, ?)",
|
||||
[
|
||||
(seq_id, "persistent://default/default/mempalace_drawers", f"queued-{seq_id}")
|
||||
for seq_id in range(103, 123)
|
||||
],
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
result = repair.maybe_repair_poisoned_max_seq_id_before_rebuild(
|
||||
palace,
|
||||
assume_yes=True,
|
||||
)
|
||||
|
||||
assert result is not None
|
||||
assert result["segment_repaired"]
|
||||
|
||||
with sqlite3.connect(db_path) as conn:
|
||||
max_seq_rows = dict(conn.execute("SELECT segment_id, seq_id FROM max_seq_id"))
|
||||
queue_count = conn.execute("SELECT COUNT(*) FROM embeddings_queue").fetchone()[0]
|
||||
|
||||
assert max_seq_rows[seg["drawers_vec"]] == seg["drawers_meta_max"]
|
||||
assert max_seq_rows[seg["drawers_meta"]] == seg["drawers_meta_max"]
|
||||
assert max_seq_rows[seg["closets_vec"]] == seg["closets_meta_max"]
|
||||
assert max_seq_rows[seg["closets_meta"]] == seg["closets_meta_max"]
|
||||
|
||||
# The old legacy rebuild path can discard queued writes. The preflight
|
||||
# repair must leave them on disk for Chroma to drain after the bookmark is
|
||||
# unpoisoned.
|
||||
assert queue_count == 20
|
||||
|
||||
|
||||
def test_rebuild_index_repairs_poisoned_max_seq_id_before_collection_rebuild(tmp_path, capsys):
|
||||
"""A poisoned bookmark should short-circuit before the legacy rebuild path."""
|
||||
|
||||
palace = str(tmp_path / "palace")
|
||||
_seed_poisoned_max_seq_id(palace)
|
||||
|
||||
with patch("mempalace.repair.ChromaBackend") as mock_backend:
|
||||
repair.rebuild_index(palace)
|
||||
|
||||
out = capsys.readouterr().out
|
||||
backend = mock_backend.return_value
|
||||
|
||||
# repair_max_seq_id may instantiate ChromaBackend to close cached clients
|
||||
# after editing sqlite directly. That is safe. The important thing is that
|
||||
# rebuild_index must not continue into the legacy Chroma collection read /
|
||||
# count / rebuild path after the max_seq_id preflight handles the issue.
|
||||
backend.get_collection.assert_not_called()
|
||||
|
||||
assert "Detected poisoned max_seq_id rows" in out
|
||||
assert "non-destructive max_seq_id repair" in out
|
||||
|
||||
|
||||
# ── extract_via_sqlite + rebuild_from_sqlite (#1308) ──────────────────
|
||||
#
|
||||
# These tests build real chromadb palaces in tmp_path rather than mocking
|
||||
|
||||
Reference in New Issue
Block a user