fix(repair): decode BLOB embeddings.seq_id in max-seq-id heuristic (#1254)
`_compute_heuristic_seq_id` ran `int(row[0])` directly on the result of `MAX(e.seq_id)`. On palaces where chromadb 1.5.x has been writing seq_ids natively (8-byte big-endian uint64 BLOB), that raises `ValueError: invalid literal for int() with base 10: b'...'` before the dry-run can print, leaving users with no path through the recovery feature added in #1135 — the only documented un-poison route for palaces hit by the original PR #664 shim bug. Decode BLOB return values via `int.from_bytes(val, "big")` and keep the existing `int(val)` path for INTEGER rows. Regression test seeds a BLOB row in `embeddings.seq_id` and asserts the heuristic surfaces the correct integer.
This commit is contained in:
@@ -588,6 +588,32 @@ def test_max_seq_id_segment_filter(tmp_path):
|
||||
assert rows[other] > repair.MAX_SEQ_ID_SANITY_THRESHOLD
|
||||
|
||||
|
||||
def test_max_seq_id_heuristic_decodes_blob_embeddings_seq_id(tmp_path):
    """Dry-run repair must decode a BLOB-typed `embeddings.seq_id`.

    `embeddings.seq_id` rows can be BLOB-typed on palaces where chromadb
    1.5.x has been writing seq_ids natively (8-byte big-endian uint64).
    `_compute_heuristic_seq_id` must decode those rather than crashing on
    `int(bytes)` — the recovery feature is meaningless if it can't read
    the storage format it was designed to repair.
    """
    palace = str(tmp_path / "palace")
    seg = _seed_poisoned_max_seq_id(palace)
    db_path = os.path.join(palace, "chroma.sqlite3")

    # Encode a value just above the seeded "drawers_meta_max" in the same
    # shape the docstring describes: 8 bytes, big-endian.
    drawers_meta_max = seg["drawers_meta_max"]
    blob_max = drawers_meta_max + 7
    blob_value = blob_max.to_bytes(8, "big")

    # sqlite3's connection context manager only commits/rolls back; it does
    # not close the handle. Close explicitly so no connection to the palace
    # database is still open while the repair routine runs against it.
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "INSERT INTO embeddings(segment_id, embedding_id, seq_id) VALUES (?, ?, ?)",
            (seg["drawers_meta"], "d-blob-max", blob_value),
        )
        conn.commit()
    finally:
        conn.close()

    result = repair.repair_max_seq_id(palace, dry_run=True)

    # Both segments are expected to report the decoded BLOB value.
    assert result["after"][seg["drawers_vec"]] == blob_max
    assert result["after"][seg["drawers_meta"]] == blob_max
|
||||
|
||||
|
||||
def test_max_seq_id_no_poison_is_noop(tmp_path):
|
||||
palace = str(tmp_path / "palace")
|
||||
os.makedirs(palace)
|
||||
|
||||
Reference in New Issue
Block a user