merge: develop into fix/1295-repair-max-seq-id-preflight

Three conflicts, all from develop landing #1285/#1310/#1312 after this
branch was authored:

- mempalace/cli.py: keep both import sets — this branch's
  maybe_repair_poisoned_max_seq_id_before_rebuild plus develop's
  RebuildCollectionError / _close_chroma_handles / _extract_drawers /
  _rebuild_collection_via_temp added in #1285.

- mempalace/repair.py: keep this branch's
  maybe_repair_poisoned_max_seq_id_before_rebuild definition; use
  develop's rebuild_index signature with the collection_name parameter
  added in #1312. Normalized print indent to 2 spaces matching the
  rest of the file.

- tests/test_repair.py: keep both this branch's max_seq_id preflight
  tests and develop's rebuild_from_sqlite + configured-collection-name
  tests; they exercise distinct code paths and don't overlap.

Local: 1617 tests pass, ruff lint+format clean against 0.4.x CI pin.
This commit is contained in:
Igor Lins e Silva
2026-05-07 10:32:29 -03:00
49 changed files with 5341 additions and 366 deletions
+788 -7
View File
@@ -2,7 +2,7 @@
import os
import sqlite3
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock, call, patch
import pytest
@@ -28,6 +28,16 @@ def test_get_palace_path_fallback():
assert ".mempalace" in result
def test_get_collection_name_from_config():
from mempalace.config import get_configured_collection_name
get_configured_collection_name.cache_clear()
with patch("mempalace.config.MempalaceConfig") as mock_config_cls:
mock_config_cls.return_value.collection_name = "custom_drawers"
assert repair._drawers_collection_name() == "custom_drawers"
get_configured_collection_name.cache_clear()
# ── _paginate_ids ─────────────────────────────────────────────────────
@@ -229,8 +239,11 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
}
mock_new_col = MagicMock()
mock_new_col.count.return_value = 2
mock_temp_col = MagicMock()
mock_temp_col.count.return_value = 2
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
mock_backend.create_collection.return_value = mock_new_col
mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
repair.rebuild_index(palace_path=str(tmp_path))
@@ -239,14 +252,74 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args)
# Verify: deleted and recreated (cosine is the backend default)
mock_backend.delete_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
mock_backend.create_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
assert mock_backend.create_collection.call_args_list == [
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
call(str(tmp_path), "mempalace_drawers"),
]
assert mock_backend.delete_collection.call_args_list == [
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
call(str(tmp_path), "mempalace_drawers"),
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
]
# Verify: used upsert not add
mock_temp_col.upsert.assert_called_once()
mock_new_col.upsert.assert_called_once()
mock_new_col.add.assert_not_called()
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_ignores_missing_temp_collection_at_start(
mock_backend_cls, mock_shutil, tmp_path
):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
def _fake_copy2(src, dst):
with open(dst, "w") as handle:
handle.write("backup")
mock_shutil.copy2.side_effect = _fake_copy2
mock_col = MagicMock()
mock_col.count.return_value = 2
mock_col.get.return_value = {
"ids": ["id1", "id2"],
"documents": ["doc1", "doc2"],
"metadatas": [{"wing": "a"}, {"wing": "b"}],
}
mock_new_col = MagicMock()
mock_new_col.count.return_value = 2
mock_temp_col = MagicMock()
mock_temp_col.count.return_value = 2
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
mock_backend.delete_collection.side_effect = [
ValueError("Collection [mempalace_drawers__repair_tmp] does not exist"),
None,
None,
]
repair.rebuild_index(palace_path=str(tmp_path))
assert mock_shutil.copy2.call_count == 1
assert mock_backend.delete_collection.call_args_list == [
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
call(str(tmp_path), "mempalace_drawers"),
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
]
def test_delete_collection_if_exists_reraises_unexpected_value_error():
mock_backend = MagicMock()
mock_backend.delete_collection.side_effect = ValueError("invalid collection name")
with pytest.raises(ValueError, match="invalid collection name"):
repair._delete_collection_if_exists(mock_backend, "/palace", "bad/name")
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path):
@@ -267,6 +340,21 @@ def test_check_extraction_safety_passes_when_counts_match(tmp_path):
repair.check_extraction_safety(str(tmp_path), 500)
def test_check_extraction_safety_uses_configured_collection(tmp_path):
with patch("mempalace.repair.sqlite_drawer_count", return_value=500) as count:
repair.check_extraction_safety(str(tmp_path), 500, collection_name="custom_drawers")
count.assert_called_once_with(str(tmp_path), "custom_drawers")
def test_check_extraction_safety_default_uses_configured_collection(tmp_path):
with (
patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
patch("mempalace.repair.sqlite_drawer_count", return_value=500) as count,
):
repair.check_extraction_safety(str(tmp_path), 500)
count.assert_called_once_with(str(tmp_path), "custom_drawers")
def test_check_extraction_safety_passes_when_sqlite_unreadable_and_under_cap(tmp_path):
"""SQLite check fails (None) but extraction is well under the cap → safe."""
with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
@@ -321,6 +409,73 @@ def test_sqlite_drawer_count_returns_none_on_unreadable_schema(tmp_path):
assert repair.sqlite_drawer_count(str(tmp_path)) is None
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_default_uses_configured_collection(mock_backend_cls, mock_shutil, tmp_path):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
mock_col = MagicMock()
mock_col.count.return_value = 2
mock_col.get.return_value = {
"ids": ["id1", "id2"],
"documents": ["doc1", "doc2"],
"metadatas": [{"wing": "a"}, {"wing": "b"}],
}
mock_temp_col = MagicMock()
mock_temp_col.count.return_value = 2
mock_new_col = MagicMock()
mock_new_col.count.return_value = 2
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
with (
patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
patch("mempalace.repair.sqlite_drawer_count", return_value=2) as count,
):
repair.rebuild_index(palace_path=str(tmp_path))
mock_backend.get_collection.assert_called_once_with(str(tmp_path), "custom_drawers")
count.assert_called_once_with(str(tmp_path), "custom_drawers")
assert mock_backend.create_collection.call_args_list == [
call(str(tmp_path), "custom_drawers__repair_tmp"),
call(str(tmp_path), "custom_drawers"),
]
assert mock_backend.delete_collection.call_args_list == [
call(str(tmp_path), "custom_drawers__repair_tmp"),
call(str(tmp_path), "custom_drawers"),
call(str(tmp_path), "custom_drawers__repair_tmp"),
]
def test_status_default_uses_configured_drawer_collection(tmp_path):
with (
patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
patch("mempalace.repair.hnsw_capacity_status") as capacity_status,
):
capacity_status.side_effect = [
{
"sqlite_count": 1,
"hnsw_count": 1,
"divergence": 0,
"diverged": False,
"status": "ok",
"message": "",
},
{
"sqlite_count": 0,
"hnsw_count": 0,
"divergence": 0,
"diverged": False,
"status": "ok",
"message": "",
},
]
repair.status(palace_path=str(tmp_path))
assert capacity_status.call_args_list[0].args == (str(tmp_path), "custom_drawers")
assert capacity_status.call_args_list[1].args == (str(tmp_path), "mempalace_closets")
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_aborts_on_truncation_signal(mock_backend_cls, mock_shutil, tmp_path):
@@ -365,19 +520,261 @@ def test_rebuild_index_proceeds_with_override(mock_backend_cls, mock_shutil, tmp
},
{"ids": [], "documents": [], "metadatas": []},
]
mock_temp_col = MagicMock()
mock_temp_col.count.return_value = 10_000
mock_new_col = MagicMock()
mock_new_col.count.return_value = 10_000
mock_backend.get_collection.return_value = mock_col
mock_backend.create_collection.return_value = mock_new_col
mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
mock_backend_cls.return_value = mock_backend
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True)
mock_backend.delete_collection.assert_called_once()
mock_backend.create_collection.assert_called_once()
assert mock_backend.delete_collection.call_count == 3
assert mock_backend.create_collection.call_count == 2
mock_temp_col.upsert.assert_called()
mock_new_col.upsert.assert_called()
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_stage_failure_leaves_live_collection_untouched(
mock_backend_cls, mock_shutil, tmp_path
):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
mock_col = MagicMock()
mock_col.count.return_value = 2
mock_col.get.return_value = {
"ids": ["id1", "id2"],
"documents": ["doc1", "doc2"],
"metadatas": [{"wing": "a"}, {"wing": "b"}],
}
mock_temp_col = MagicMock()
mock_temp_col.count.return_value = 1
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
mock_backend.create_collection.return_value = mock_temp_col
with pytest.raises(repair.RebuildCollectionError) as excinfo:
repair.rebuild_index(palace_path=str(tmp_path))
assert excinfo.value.live_replaced is False
assert mock_shutil.copy2.call_count == 1
assert mock_backend.delete_collection.call_args_list == [
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
]
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_live_failure_restores_backup(mock_backend_cls, mock_shutil, tmp_path):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
def _fake_copy2(src, dst):
with open(dst, "w") as handle:
handle.write("backup")
mock_shutil.copy2.side_effect = _fake_copy2
mock_col = MagicMock()
mock_col.count.return_value = 2
mock_col.get.return_value = {
"ids": ["id1", "id2"],
"documents": ["doc1", "doc2"],
"metadatas": [{"wing": "a"}, {"wing": "b"}],
}
mock_temp_col = MagicMock()
mock_temp_col.count.return_value = 2
mock_new_col = MagicMock()
mock_new_col.upsert.side_effect = RuntimeError("live upsert failed")
active_backend = MagicMock()
active_backend.get_collection.return_value = mock_col
active_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
helper_backend = MagicMock()
mock_backend_cls.side_effect = [active_backend, helper_backend]
with pytest.raises(repair.RebuildCollectionError) as excinfo:
repair.rebuild_index(palace_path=str(tmp_path))
assert excinfo.value.live_replaced is True
assert mock_shutil.copy2.call_count == 2
assert active_backend.delete_collection.call_args_list == [
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
call(str(tmp_path), "mempalace_drawers"),
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
call(str(tmp_path), "mempalace_drawers"),
]
active_backend.close_palace.assert_called_once_with(str(tmp_path))
helper_backend.close_palace.assert_not_called()
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_live_delete_missing_still_restores_backup(
mock_backend_cls, mock_shutil, tmp_path
):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
def _fake_copy2(src, dst):
with open(dst, "w") as handle:
handle.write("backup")
mock_shutil.copy2.side_effect = _fake_copy2
mock_col = MagicMock()
mock_col.count.return_value = 2
mock_col.get.return_value = {
"ids": ["id1", "id2"],
"documents": ["doc1", "doc2"],
"metadatas": [{"wing": "a"}, {"wing": "b"}],
}
mock_temp_col = MagicMock()
mock_temp_col.count.return_value = 2
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("create failed")]
mock_backend.delete_collection.side_effect = [
None,
None,
None,
repair.ChromaNotFoundError("missing"),
]
with pytest.raises(repair.RebuildCollectionError) as excinfo:
repair.rebuild_index(palace_path=str(tmp_path))
assert excinfo.value.live_replaced is True
assert mock_shutil.copy2.call_count == 2
assert mock_backend.delete_collection.call_args_list == [
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
call(str(tmp_path), "mempalace_drawers"),
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
call(str(tmp_path), "mempalace_drawers"),
]
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_restore_failure_preserves_original_error(
mock_backend_cls, mock_shutil, tmp_path, capsys
):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
def _copy2_side_effect(src, dst):
if str(src).endswith(".backup"):
raise PermissionError("locked sqlite")
with open(dst, "w") as handle:
handle.write("backup")
mock_shutil.copy2.side_effect = _copy2_side_effect
mock_col = MagicMock()
mock_col.count.return_value = 2
mock_col.get.return_value = {
"ids": ["id1", "id2"],
"documents": ["doc1", "doc2"],
"metadatas": [{"wing": "a"}, {"wing": "b"}],
}
mock_temp_col = MagicMock()
mock_temp_col.count.return_value = 2
mock_new_col = MagicMock()
mock_new_col.upsert.side_effect = RuntimeError("live upsert failed")
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
with pytest.raises(repair.RebuildCollectionError) as excinfo:
repair.rebuild_index(palace_path=str(tmp_path))
out = capsys.readouterr().out
assert "locked sqlite" in out
assert "Manual restore required" in out
assert "live upsert failed" in str(excinfo.value)
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_collection_via_temp_keeps_original_error_when_cleanup_fails(
mock_backend_cls,
):
mock_col = MagicMock()
mock_col.count.return_value = 2
mock_temp_col = MagicMock()
mock_temp_col.count.return_value = 2
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("live build failed")]
mock_backend.delete_collection.side_effect = [
None,
None,
RuntimeError("cleanup failed"),
]
with pytest.raises(repair.RebuildCollectionError) as excinfo:
repair._rebuild_collection_via_temp(
mock_backend,
"/palace",
["id1", "id2"],
["doc1", "doc2"],
[{"wing": "a"}, {"wing": "b"}],
batch_size=5000,
progress=lambda *args, **kwargs: None,
)
assert "live build failed" in str(excinfo.value)
assert excinfo.value.live_replaced is True
assert mock_backend.delete_collection.call_args_list == [
call("/palace", "mempalace_drawers__repair_tmp"),
call("/palace", "mempalace_drawers"),
call("/palace", "mempalace_drawers__repair_tmp"),
]
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_ignores_temp_cleanup_failure_after_success(
mock_backend_cls, mock_shutil, tmp_path
):
sqlite_path = tmp_path / "chroma.sqlite3"
sqlite_path.write_text("fake")
def _fake_copy2(src, dst):
with open(dst, "w") as handle:
handle.write("backup")
mock_shutil.copy2.side_effect = _fake_copy2
mock_col = MagicMock()
mock_col.count.return_value = 2
mock_col.get.return_value = {
"ids": ["id1", "id2"],
"documents": ["doc1", "doc2"],
"metadatas": [{"wing": "a"}, {"wing": "b"}],
}
mock_temp_col = MagicMock()
mock_temp_col.count.return_value = 2
mock_new_col = MagicMock()
mock_new_col.count.return_value = 2
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
mock_backend.delete_collection.side_effect = [
None,
None,
RuntimeError("cleanup failed"),
]
repair.rebuild_index(palace_path=str(tmp_path))
assert mock_shutil.copy2.call_count == 1
assert mock_backend.delete_collection.call_args_list == [
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
call(str(tmp_path), "mempalace_drawers"),
call(str(tmp_path), "mempalace_drawers__repair_tmp"),
]
# ── repair_max_seq_id ─────────────────────────────────────────────────
@@ -748,3 +1145,387 @@ def test_rebuild_index_repairs_poisoned_max_seq_id_before_collection_rebuild(tmp
assert "Detected poisoned max_seq_id rows" in out
assert "non-destructive max_seq_id repair" in out
# ── extract_via_sqlite + rebuild_from_sqlite (#1308) ──────────────────
#
# These tests build real chromadb palaces in tmp_path rather than mocking
# the SQLite layer. The bug class they guard against is "extraction sees
# different rows than chromadb stored" — the only honest check is to let
# chromadb actually write rows and then read them back via the SQLite
# bypass. Mocking the SQLite cursor would defeat the test.
def _seed_palace(palace_path, collection_name, rows):
"""Build a real chromadb palace at ``palace_path`` and add ``rows``.
``rows`` is a list of ``(id, document, metadata)`` tuples.
"""
from mempalace.backends.chroma import ChromaBackend
backend = ChromaBackend()
try:
col = backend.create_collection(str(palace_path), collection_name)
col.upsert(
ids=[r[0] for r in rows],
documents=[r[1] for r in rows],
metadatas=[r[2] for r in rows],
)
finally:
# Release chromadb's rust-side SQLite/HNSW file locks before the
# caller proceeds. Without this, an in-place rebuild on Windows
# fails with WinError 32 on data_level0.bin during the archive
# rename (cf. PR #1310 test-windows job).
backend.close()
def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
"""Round-trip: a chromadb palace with N upserted rows returns those
same N rows when read via the SQLite bypass.
Catches: anyone who breaks the segments/embeddings/embedding_metadata
JOIN, swaps the metadata vs vector segment, or changes how the
document is stored under the ``chroma:document`` key.
Also asserts every embedding row underlying the extraction lives in
a ``segments.scope = 'METADATA'`` segment. Document + metadata rows
are stored under METADATA in Chroma's segment layout while HNSW
files live under ``VECTOR``; locking that assumption in here means a
future refactor that accidentally points the JOIN at ``VECTOR``
fails this test instead of silently regressing the recovery path.
"""
rows = [
(f"drawer_{i:03d}", f"document body {i}", {"wing": "test_wing", "room": f"r{i % 3}"})
for i in range(25)
]
_seed_palace(tmp_path, "mempalace_drawers", rows)
extracted = list(repair.extract_via_sqlite(str(tmp_path), "mempalace_drawers"))
assert len(extracted) == 25
by_id = {emb_id: (doc, meta) for emb_id, doc, meta in extracted}
assert set(by_id) == {r[0] for r in rows}
for emb_id, doc, meta in rows:
got_doc, got_meta = by_id[emb_id]
assert got_doc == doc, f"document mangled for {emb_id}"
assert got_meta == meta, f"metadata mangled for {emb_id}: {got_meta!r}"
# Lock the segment-scope assumption directly against Chroma's on-disk
# layout so a future change that points the extraction JOIN at the
# VECTOR segment cannot pass this test. Query each extracted row's
# backing segment scope via the same SQLite tables ``extract_via_sqlite``
# reads from.
sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3")
conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
try:
scopes = {
scope
for (scope,) in conn.execute(
"""
SELECT DISTINCT s.scope
FROM embeddings e
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = ? AND e.embedding_id IN ({})
""".format(",".join("?" * len(extracted))),
("mempalace_drawers", *(emb_id for emb_id, _, _ in extracted)),
)
}
finally:
conn.close()
assert scopes == {"METADATA"}, (
f"extraction is reading from segments scoped {scopes!r}; only "
"'METADATA' should back the document/metadata rows. If Chroma's "
"segment layout changed, update extract_via_sqlite's WHERE clause."
)
def test_extract_via_sqlite_preserves_typed_metadata(tmp_path):
"""Chromadb stores int / float / bool / string in distinct typed
columns. Extraction must round-trip the original type, not coerce
everything to string.
Catches: a regression where the SELECT order changes and ints come
back as None, or where the column-resolution rule prefers the wrong
column.
"""
rows = [
(
"drawer_typed",
"doc",
{
"wing": "w",
"chunk_index": 7, # int
"score": 0.42, # float
"is_active": True, # bool
},
),
]
_seed_palace(tmp_path, "mempalace_drawers", rows)
extracted = list(repair.extract_via_sqlite(str(tmp_path), "mempalace_drawers"))
assert len(extracted) == 1
_, _, meta = extracted[0]
assert meta["chunk_index"] == 7 and isinstance(meta["chunk_index"], int)
assert meta["score"] == 0.42 and isinstance(meta["score"], float)
assert meta["is_active"] is True
assert meta["wing"] == "w"
def test_extract_via_sqlite_unknown_collection_yields_nothing(tmp_path):
"""Asking for a collection that isn't in the palace must return an
empty iterator, not silently fall back to another collection's
metadata segment. Seeds two real collections and queries for a third
name so a regression that drops the WHERE c.name=? filter would leak
rows from the seeded collections rather than passing.
"""
_seed_palace(tmp_path, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
_seed_palace(tmp_path, "mempalace_closets", [("c1", "abbrev", {"wing": "w"})])
assert list(repair.extract_via_sqlite(str(tmp_path), "not_a_real_collection")) == []
def test_extract_via_sqlite_missing_palace_yields_nothing(tmp_path):
"""No chroma.sqlite3 → empty iterator, no exception. Callers depend
on this when probing speculatively."""
empty = tmp_path / "no_palace_here"
empty.mkdir()
assert list(repair.extract_via_sqlite(str(empty), "mempalace_drawers")) == []
def test_rebuild_from_sqlite_roundtrips_via_real_chromadb(tmp_path):
"""End-to-end: seed source palace, rebuild into a fresh dest, then
open dest with a fresh ChromaBackend and verify ``count()`` and
metadata filters return the original rows. Also asserts a closet
document round-trips so a future regression that re-embeds with the
wrong EF or swaps drawer/closet content would fail here.
This is the single most important regression guard. If
``rebuild_from_sqlite`` silently drops rows or mangles metadata, no
other test in this file would catch it because they all stop at the
extraction layer.
"""
from mempalace.backends.chroma import ChromaBackend
source = tmp_path / "source"
dest = tmp_path / "dest"
rows = [
(f"drawer_{i:03d}", f"body {i}", {"wing": "alpha" if i % 2 else "beta", "room": "r0"})
for i in range(40)
]
_seed_palace(source, "mempalace_drawers", rows)
_seed_palace(
source,
"mempalace_closets",
[("closet_x", "abbrev pointer →drawer_001", {"wing": "alpha"})],
)
counts = repair.rebuild_from_sqlite(str(source), str(dest))
assert counts == {"mempalace_drawers": 40, "mempalace_closets": 1}
backend = ChromaBackend()
drawers = backend.get_collection(str(dest), "mempalace_drawers")
assert drawers.count() == 40
alpha = drawers.get(where={"wing": "alpha"})
assert len(alpha["ids"]) == 20
# Spot-check that document text round-trips for one specific drawer
# — protects against a regression where extraction or upsert order
# silently swaps document bodies between IDs.
one = drawers.get(ids=["drawer_007"], include=["documents", "metadatas"])
assert one["documents"] == ["body 7"]
assert one["metadatas"][0]["wing"] == "alpha"
# Closets: the AAAK index layer. Re-embedded with the same EF so a
# known closet ID and its document body must come back intact.
closets = backend.get_collection(str(dest), "mempalace_closets")
assert closets.count() == 1
closet_row = closets.get(ids=["closet_x"], include=["documents", "metadatas"])
assert closet_row["documents"] == ["abbrev pointer →drawer_001"]
assert closet_row["metadatas"][0] == {"wing": "alpha"}
def test_rebuild_from_sqlite_refuses_existing_dest(tmp_path):
"""Refuse to write into a directory that already exists when source
and dest differ. Without this, an unattended re-run would silently
interleave a partial rebuild with whatever's already at dest.
"""
source = tmp_path / "source"
dest = tmp_path / "dest"
_seed_palace(source, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
dest.mkdir()
# Drop a marker file so we can prove the dir wasn't touched.
(dest / "marker.txt").write_text("preexisting")
counts = repair.rebuild_from_sqlite(str(source), str(dest))
assert counts == {}
assert (dest / "marker.txt").read_text() == "preexisting"
assert not (dest / "chroma.sqlite3").exists()
def test_rebuild_from_sqlite_in_place_archives_when_opted_in(tmp_path):
"""In-place rebuild (source == dest) with ``archive_existing_dest=True``
must move the original aside to ``<dest>.pre-rebuild-<ts>`` and read
from the archive — the original drawer rows must survive in the new
palace, AND the archive itself must still contain the original rows.
Catches: a refactor that moves the original out but then reads from
the now-empty original location, producing an empty rebuild; also
catches a swap that empties the archive after reading.
"""
palace = tmp_path / "palace"
rows = [(f"d{i}", f"body {i}", {"wing": "w", "room": "r"}) for i in range(15)]
_seed_palace(palace, "mempalace_drawers", rows)
counts = repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
assert counts["mempalace_drawers"] == 15
archives = [p for p in tmp_path.iterdir() if p.name.startswith("palace.pre-rebuild-")]
assert len(archives) == 1
assert (archives[0] / "chroma.sqlite3").exists()
# Archive must still hold the same row count via the SQLite bypass —
# proves the archive wasn't silently truncated as a side effect.
archived_rows = list(repair.extract_via_sqlite(str(archives[0]), "mempalace_drawers"))
assert len(archived_rows) == 15
from mempalace.backends.chroma import ChromaBackend
rebuilt = ChromaBackend().get_collection(str(palace), "mempalace_drawers")
assert rebuilt.count() == 15
def test_rebuild_from_sqlite_in_place_refuses_without_archive_flag(tmp_path):
"""Source == dest without archive flag must abort untouched. The
most catastrophic possible regression of this code path is silently
deleting the only copy of the user's data."""
palace = tmp_path / "palace"
_seed_palace(palace, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
sqlite_before = (palace / "chroma.sqlite3").stat().st_size
counts = repair.rebuild_from_sqlite(str(palace), str(palace))
assert counts == {}
# Same file, untouched.
assert (palace / "chroma.sqlite3").stat().st_size == sqlite_before
archives = [p for p in tmp_path.iterdir() if "pre-rebuild" in p.name]
assert archives == []
def test_rebuild_from_sqlite_source_missing_chroma_db(tmp_path):
"""Source dir exists but has no chroma.sqlite3 → returns empty,
leaves dest untouched."""
source = tmp_path / "source"
source.mkdir()
(source / "stray_file").write_text("not a palace")
dest = tmp_path / "dest"
counts = repair.rebuild_from_sqlite(str(source), str(dest))
assert counts == {}
assert not dest.exists()
def test_rebuild_from_sqlite_in_place_validates_source_before_archiving(tmp_path):
"""In-place + archive_existing_dest=True with a dir that lacks
chroma.sqlite3 must NOT rename the dir before bailing. An earlier
revision archived first and validated second, leaving the user with
a renamed empty dir to manually undo. Catches that ordering bug.
"""
palace = tmp_path / "palace"
palace.mkdir()
(palace / "marker.txt").write_text("not a real palace")
counts = repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
assert counts == {}
# No archive created — original dir still in place with its marker.
assert palace.exists()
assert (palace / "marker.txt").read_text() == "not a real palace"
archives = [p for p in tmp_path.iterdir() if "pre-rebuild" in p.name]
assert archives == []
def test_rebuild_from_sqlite_raises_on_upsert_failure(tmp_path, monkeypatch):
"""Mid-batch upsert failure must raise ``RebuildPartialError`` and
surface the failed collection + archive path so the user can recover.
Without this, an unattended script gets exit-code-zero on a partial
rebuild and the user discovers the data loss only when search starts
returning fewer hits.
"""
palace = tmp_path / "palace"
rows = [(f"d{i}", f"body {i}", {"wing": "w", "room": "r"}) for i in range(5)]
_seed_palace(palace, "mempalace_drawers", rows)
# Make the very first upsert raise so we don't depend on batch
# boundary behavior. Patching ChromaCollection.upsert (the wrapper
# mempalace's backend returns) keeps the failure path realistic.
# ``monkeypatch`` is pytest's built-in fixture that auto-restores
# the original attribute when the test exits, so we don't need to
# undo this manually.
from mempalace.backends.chroma import ChromaCollection
def boom(self, **kwargs):
raise RuntimeError("simulated chromadb upsert failure")
monkeypatch.setattr(ChromaCollection, "upsert", boom)
with pytest.raises(repair.RebuildPartialError) as excinfo:
repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
err = excinfo.value
assert err.failed_collection == "mempalace_drawers"
assert err.partial_counts.get("mempalace_drawers") == 0
assert err.archive_path is not None
assert os.path.isfile(os.path.join(err.archive_path, "chroma.sqlite3"))
assert err.dest_palace == os.path.abspath(str(palace))
def test_rebuild_from_sqlite_honors_configured_drawer_collection_name(tmp_path, monkeypatch):
"""A user with a non-default drawers collection name (set via
``MempalaceConfig().collection_name``) must have THAT collection
rebuilt — not the hardcoded ``mempalace_drawers``.
Catches: a regression where the recovery path silently rebuilds the
default-name collection on a custom-named palace, leaving the user's
actual data unrebuilt while reporting "rebuild complete." This is
the failure mode reviewer mjc flagged on PR #1310 as needing to line
up with the configured-collection-name work in #1312. Closets stay
fixed (``mempalace_closets``) by design — the AAAK index references
drawer IDs by string and is not per-deployment configurable.
Strategy: monkeypatch the lazy resolver so the test is hermetic and
does not depend on the global config file or env state.
"""
from mempalace.backends.chroma import ChromaBackend
custom_drawers = "custom_drawers_xyz"
monkeypatch.setattr(repair, "_drawers_collection_name", lambda: custom_drawers)
source = tmp_path / "source"
dest = tmp_path / "dest"
drawer_rows = [(f"d{i}", f"body {i}", {"wing": "alpha"}) for i in range(3)]
closet_rows = [("closet_a", "abbrev →d0", {"wing": "alpha"})]
_seed_palace(source, custom_drawers, drawer_rows)
_seed_palace(source, "mempalace_closets", closet_rows)
counts = repair.rebuild_from_sqlite(str(source), str(dest))
# Rebuilt under the custom name, not under the default "mempalace_drawers".
assert counts == {custom_drawers: 3, "mempalace_closets": 1}
backend = ChromaBackend()
rebuilt_drawers = backend.get_collection(str(dest), custom_drawers)
assert rebuilt_drawers.count() == 3
# Default-name collection must NOT exist in dest — proves we did not
# silently fall back to the hardcoded name during rebuild.
try:
rebuilt_default = backend.get_collection(str(dest), "mempalace_drawers")
# If get_collection returns without raising, count() should be 0
# (chromadb may auto-create on get with some EFs); a non-zero
# count would mean we wrote rows to the wrong collection.
assert rebuilt_default.count() == 0, (
"rebuild leaked rows into the default-name collection on a "
"custom-name palace — recovery wrote to the wrong collection."
)
except Exception:
pass # Expected: collection wasn't created.