fix: narrow _fix_blob_seq_ids shim + add repair --mode max-seq-id

The BLOB-seq_id migration shim (PR #664) ran int.from_bytes(..., 'big')
over every BLOB in max_seq_id, including chromadb 1.5.x's own native
format (b'\x11\x11' + 6 ASCII digits). That conversion yields a ~1.23e18
integer that silently suppresses every subsequent embeddings_queue write
for the affected segment (queue filter is seq_id > start), causing
silent drawer-write drops after a 1.5.x upgrade.

Two-part fix:

1. Shim narrowing (mempalace/backends/chroma.py)
   - Drop max_seq_id from the shim loop. chromadb owns that column's
     format; we don't reinterpret it.
   - Defense-in-depth: skip rows in embeddings whose seq_id BLOB has the
     sysdb-10 b'\x11\x11' prefix rather than misconvert.

2. Recovery command (mempalace/repair.py, mempalace/cli.py)
   - mempalace repair --mode max-seq-id [--segment <uuid>]
     [--from-sidecar <path>] [--dry-run] [--yes] [--no-backup]
   - Detects poisoned rows via threshold (seq_id > 2**53).
   - Default heuristic: use MAX(embeddings.seq_id) over the collection
     owning the poisoned segment as the replacement value. For METADATA
     segments this matches the segment's max exactly; VECTOR segments end
     up a few seq_ids ahead, so the queue skips an already-indexed
     window — an acceptable loss vs. resetting to 0 and re-processing
     everything.
   - --from-sidecar copies clean values from a pre-corruption sqlite db.
   - Backs up chroma.sqlite3, closes chroma handles, applies the UPDATEs
     atomically, then runs a post-repair verification that raises
     MaxSeqIdVerificationError if any row is still above threshold.

Tests: 8 new in tests/test_repair.py (detection, heuristic, sidecar,
dry-run, segment filter, no-op, backup, rollback-on-verify-failure).
3 new in tests/test_backends.py (max_seq_id untouched by shim,
sysdb-10 prefix skipped in embeddings, legacy big-endian u64 BLOBs
still convert). Full suite: 1103 passed.
This commit is contained in:
eblander
2026-04-23 14:45:38 -04:00
committed by igorls
parent bc5d3fa911
commit f5c8b095dd
6 changed files with 684 additions and 20 deletions
+66 -7
View File
@@ -341,12 +341,9 @@ def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path):
db_path = tmp_path / "chroma.sqlite3"
conn = sqlite3.connect(str(db_path))
conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
conn.execute("CREATE TABLE max_seq_id (rowid INTEGER PRIMARY KEY, seq_id)")
# Insert BLOB seq_ids like ChromaDB 0.6.x would
# Insert BLOB seq_id like ChromaDB 0.6.x would
blob_42 = (42).to_bytes(8, byteorder="big")
blob_99 = (99).to_bytes(8, byteorder="big")
conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", (blob_42,))
conn.execute("INSERT INTO max_seq_id (seq_id) VALUES (?)", (blob_99,))
conn.commit()
conn.close()
@@ -355,8 +352,6 @@ def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path):
conn = sqlite3.connect(str(db_path))
row = conn.execute("SELECT seq_id, typeof(seq_id) FROM embeddings").fetchone()
assert row == (42, "integer")
row = conn.execute("SELECT seq_id, typeof(seq_id) FROM max_seq_id").fetchone()
assert row == (99, "integer")
conn.close()
@@ -382,6 +377,71 @@ def test_fix_blob_seq_ids_noop_without_database(tmp_path):
_fix_blob_seq_ids(str(tmp_path)) # should not raise
def test_fix_blob_seq_ids_does_not_touch_max_seq_id(tmp_path):
    """chromadb 1.5.x owns max_seq_id; the shim must not interpret its BLOBs.

    Regression guard for the 2026-04-20 incident: the old shim ran
    int.from_bytes(..., 'big') over chromadb 1.5.x's native
    b'\\x11\\x11' + ASCII-digit BLOB, producing a ~1.23e18 integer that
    silently suppressed every subsequent embeddings_queue write.
    """
    db_path = tmp_path / "chroma.sqlite3"
    sysdb10_blob = b"\x11\x11502607"

    # Seed both tables; only max_seq_id carries the sysdb-10 style BLOB.
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
        conn.execute("CREATE TABLE max_seq_id (rowid INTEGER PRIMARY KEY, seq_id)")
        conn.execute("INSERT INTO max_seq_id (seq_id) VALUES (?)", (sysdb10_blob,))
        conn.commit()
    finally:
        conn.close()

    _fix_blob_seq_ids(str(tmp_path))

    conn = sqlite3.connect(str(db_path))
    try:
        row = conn.execute("SELECT seq_id, typeof(seq_id) FROM max_seq_id").fetchone()
    finally:
        # Close before asserting so a failing assertion never leaves the
        # database file open while pytest tears down tmp_path.
        conn.close()

    # The value must come back byte-for-byte, still typed as a BLOB.
    assert row == (sysdb10_blob, "blob")
def test_fix_blob_seq_ids_skips_sysdb10_prefix_in_embeddings(tmp_path):
    """Defense-in-depth: sysdb-10 prefix in embeddings.seq_id is skipped."""
    db_path = tmp_path / "chroma.sqlite3"
    sysdb10_blob = b"\x11\x11502607"

    # Seed embeddings with a single row carrying the sysdb-10 style BLOB.
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
        conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", (sysdb10_blob,))
        conn.commit()
    finally:
        conn.close()

    _fix_blob_seq_ids(str(tmp_path))

    conn = sqlite3.connect(str(db_path))
    try:
        row = conn.execute("SELECT seq_id, typeof(seq_id) FROM embeddings").fetchone()
    finally:
        # Close before asserting so a failing assertion never leaves the
        # database file open while pytest tears down tmp_path.
        conn.close()

    # Still a BLOB — not converted to 1.23e18.
    assert row == (sysdb10_blob, "blob")
def test_fix_blob_seq_ids_still_converts_legacy_blobs_in_embeddings(tmp_path):
    """Regression guard: pure big-endian u64 BLOBs still convert for genuine 0.6.x."""
    db_path = tmp_path / "chroma.sqlite3"

    # Two legacy big-endian u64 BLOBs with a sysdb-10 style BLOB between
    # them, so the skip is exercised in the middle of a conversion run.
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
        conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", ((42).to_bytes(8, "big"),))
        conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", (b"\x11\x11502607",))
        conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", ((7).to_bytes(8, "big"),))
        conn.commit()
    finally:
        conn.close()

    _fix_blob_seq_ids(str(tmp_path))

    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            "SELECT seq_id, typeof(seq_id) FROM embeddings ORDER BY rowid"
        ).fetchall()
    finally:
        # Close before asserting so a failure on any row never leaves the
        # database file open while pytest tears down tmp_path.
        conn.close()

    assert rows[0] == (42, "integer")
    assert rows[1] == (b"\x11\x11502607", "blob")  # sysdb-10 row left alone
    assert rows[2] == (7, "integer")
def test_fix_blob_seq_ids_writes_marker_after_blob_path(tmp_path):
"""The .blob_seq_ids_migrated marker is written after a successful BLOB → INTEGER conversion."""
from mempalace.backends.chroma import _BLOB_FIX_MARKER
@@ -447,7 +507,6 @@ def test_fix_blob_seq_ids_skips_sqlite_when_marker_present(tmp_path):
mock_connect.assert_not_called()
# ── quarantine_stale_hnsw ─────────────────────────────────────────────────