diff --git a/mempalace/repair.py b/mempalace/repair.py index fe2ba15..1cd1556 100644 --- a/mempalace/repair.py +++ b/mempalace/repair.py @@ -559,6 +559,11 @@ def _compute_heuristic_seq_id(cur: sqlite3.Cursor, segment_id: str) -> int: already-indexed embeddings on next subscribe. That is an acceptable loss vs. resetting to 0 (which would re-process the entire queue and risk HNSW bloat from issue #1046). + + ``embeddings.seq_id`` rows can be BLOB-typed on palaces where + chromadb 1.5.x has been writing seq_ids natively (8-byte big-endian + uint64). When SQLite's ``MAX`` returns such a row, decode it back to + an integer rather than crashing on ``int(bytes)``. """ row = cur.execute( """ @@ -573,7 +578,10 @@ def _compute_heuristic_seq_id(cur: sqlite3.Cursor, segment_id: str) -> int: ).fetchone() if row is None or row[0] is None: return 0 - return int(row[0]) + val = row[0] + if isinstance(val, (bytes, bytearray)): + return int.from_bytes(val, "big") + return int(val) def _read_sidecar_seq_ids(sidecar_path: str) -> dict[str, int]: diff --git a/tests/test_repair.py b/tests/test_repair.py index 18dd9c4..bc770dd 100644 --- a/tests/test_repair.py +++ b/tests/test_repair.py @@ -588,6 +588,32 @@ def test_max_seq_id_segment_filter(tmp_path): assert rows[other] > repair.MAX_SEQ_ID_SANITY_THRESHOLD +def test_max_seq_id_heuristic_decodes_blob_embeddings_seq_id(tmp_path): + """`embeddings.seq_id` rows can be BLOB-typed on palaces where chromadb + 1.5.x has been writing seq_ids natively (8-byte big-endian uint64). + `_compute_heuristic_seq_id` must decode those rather than crashing on + `int(bytes)` — the recovery feature is meaningless if it can't read + the storage format it was designed to repair. + """ + palace = str(tmp_path / "palace") + seg = _seed_poisoned_max_seq_id(palace) + db_path = os.path.join(palace, "chroma.sqlite3") + + drawers_meta_max = seg["drawers_meta_max"] + blob_max = drawers_meta_max + 7 + blob_value = blob_max.to_bytes(8, "big") + with sqlite3.connect(db_path) as conn: + conn.execute( + "INSERT INTO embeddings(segment_id, embedding_id, seq_id) VALUES (?, ?, ?)", + (seg["drawers_meta"], "d-blob-max", blob_value), + ) + conn.commit() + + result = repair.repair_max_seq_id(palace, dry_run=True) + assert result["after"][seg["drawers_vec"]] == blob_max + assert result["after"][seg["drawers_meta"]] == blob_max + + def test_max_seq_id_no_poison_is_noop(tmp_path): palace = str(tmp_path / "palace") os.makedirs(palace)