fix(repair): decode BLOB embeddings.seq_id in max-seq-id heuristic (#1254)

`_compute_heuristic_seq_id` ran `int(row[0])` directly on the result
of `MAX(e.seq_id)`. On palaces where chromadb 1.5.x has been writing
seq_ids natively (8-byte big-endian uint64 BLOB), that raises
`ValueError: invalid literal for int() with base 10: b'...'` before
the dry-run can print, leaving users with no path through the
recovery feature added in #1135 — the only documented un-poison
route for palaces hit by the original PR #664 shim bug.

Decode BLOB return values via `int.from_bytes(val, "big")` and
keep the existing `int(val)` path for INTEGER rows. Regression
test seeds a BLOB row in `embeddings.seq_id` and asserts the
heuristic surfaces the correct integer.
This commit is contained in:
Igor Lins e Silva
2026-04-30 22:04:41 -03:00
parent fdfaf017ab
commit 3b5ebcc9fc
2 changed files with 35 additions and 1 deletions
+26
View File
@@ -588,6 +588,32 @@ def test_max_seq_id_segment_filter(tmp_path):
assert rows[other] > repair.MAX_SEQ_ID_SANITY_THRESHOLD
def test_max_seq_id_heuristic_decodes_blob_embeddings_seq_id(tmp_path):
    """Check that the max-seq-id heuristic decodes BLOB `embeddings.seq_id`.

    `embeddings.seq_id` rows can be BLOB-typed on palaces where chromadb
    1.5.x has been writing seq_ids natively (8-byte big-endian uint64).
    `_compute_heuristic_seq_id` must decode those rather than crashing on
    `int(bytes)` — the recovery feature is meaningless if it can't read
    the storage format it was designed to repair.
    """
    palace = str(tmp_path / "palace")
    seg = _seed_poisoned_max_seq_id(palace)
    db_path = os.path.join(palace, "chroma.sqlite3")

    # Build a seq_id slightly above the value the seed helper reports for the
    # drawers metadata segment, encoded as an 8-byte big-endian integer.
    blob_max = seg["drawers_meta_max"] + 7
    blob_value = blob_max.to_bytes(8, "big")

    # sqlite3's connection context manager only commits or rolls back; it
    # never closes the handle. Close explicitly so no stray connection stays
    # open on the database file while the repair runs against it.
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "INSERT INTO embeddings(segment_id, embedding_id, seq_id) VALUES (?, ?, ?)",
            (seg["drawers_meta"], "d-blob-max", blob_value),
        )
        conn.commit()
    finally:
        conn.close()

    result = repair.repair_max_seq_id(palace, dry_run=True)

    # Both segments must report the integer decoded from the BLOB row.
    assert result["after"][seg["drawers_vec"]] == blob_max
    assert result["after"][seg["drawers_meta"]] == blob_max
def test_max_seq_id_no_poison_is_noop(tmp_path):
palace = str(tmp_path / "palace")
os.makedirs(palace)