From 3b5ebcc9fce390cf8b7887c5238a7eac7de40f75 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 30 Apr 2026 22:04:41 -0300 Subject: [PATCH] fix(repair): decode BLOB embeddings.seq_id in max-seq-id heuristic (#1254) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_compute_heuristic_seq_id` ran `int(row[0])` directly on the result of `MAX(e.seq_id)`. On palaces where chromadb 1.5.x has been writing seq_ids natively (8-byte big-endian uint64 BLOB), that raises `ValueError: invalid literal for int() with base 10: b'...'` before the dry-run can print, leaving users with no path through the recovery feature added in #1135 — the only documented un-poison route for palaces hit by the original PR #664 shim bug. Decode BLOB return values via `int.from_bytes(val, "big")` and keep the existing `int(val)` path for INTEGER rows. Regression test seeds a BLOB row in `embeddings.seq_id` and asserts the heuristic surfaces the correct integer. --- mempalace/repair.py | 10 +++++++++- tests/test_repair.py | 26 ++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/mempalace/repair.py b/mempalace/repair.py index fe2ba15..1cd1556 100644 --- a/mempalace/repair.py +++ b/mempalace/repair.py @@ -559,6 +559,11 @@ def _compute_heuristic_seq_id(cur: sqlite3.Cursor, segment_id: str) -> int: already-indexed embeddings on next subscribe. That is an acceptable loss vs. resetting to 0 (which would re-process the entire queue and risk HNSW bloat from issue #1046). + + ``embeddings.seq_id`` rows can be BLOB-typed on palaces where + chromadb 1.5.x has been writing seq_ids natively (8-byte big-endian + uint64). When SQLite's ``MAX`` returns such a row, decode it back to + an integer rather than crashing on ``int(bytes)``. """ row = cur.execute( """ @@ -573,7 +578,10 @@ def _compute_heuristic_seq_id(cur: sqlite3.Cursor, segment_id: str) -> int: ).fetchone() if row is None or row[0] is None: return 0 - return int(row[0]) + val = row[0] + if isinstance(val, (bytes, bytearray)): + return int.from_bytes(val, "big") + return int(val) def _read_sidecar_seq_ids(sidecar_path: str) -> dict[str, int]: diff --git a/tests/test_repair.py b/tests/test_repair.py index 18dd9c4..bc770dd 100644 --- a/tests/test_repair.py +++ b/tests/test_repair.py @@ -588,6 +588,32 @@ def test_max_seq_id_segment_filter(tmp_path): assert rows[other] > repair.MAX_SEQ_ID_SANITY_THRESHOLD +def test_max_seq_id_heuristic_decodes_blob_embeddings_seq_id(tmp_path): + """`embeddings.seq_id` rows can be BLOB-typed on palaces where chromadb + 1.5.x has been writing seq_ids natively (8-byte big-endian uint64). + `_compute_heuristic_seq_id` must decode those rather than crashing on + `int(bytes)` — the recovery feature is meaningless if it can't read + the storage format it was designed to repair. + """ + palace = str(tmp_path / "palace") + seg = _seed_poisoned_max_seq_id(palace) + db_path = os.path.join(palace, "chroma.sqlite3") + + drawers_meta_max = seg["drawers_meta_max"] + blob_max = drawers_meta_max + 7 + blob_value = blob_max.to_bytes(8, "big") + with sqlite3.connect(db_path) as conn: + conn.execute( + "INSERT INTO embeddings(segment_id, embedding_id, seq_id) VALUES (?, ?, ?)", + (seg["drawers_meta"], "d-blob-max", blob_value), + ) + conn.commit() + + result = repair.repair_max_seq_id(palace, dry_run=True) + assert result["after"][seg["drawers_vec"]] == blob_max + assert result["after"][seg["drawers_meta"]] == blob_max + + def test_max_seq_id_no_poison_is_noop(tmp_path): palace = str(tmp_path / "palace") os.makedirs(palace)