fix(repair): address PR #1310 review feedback
Five small hardening fixes for the from-sqlite rebuild path, all from mjc's review on #1310: - repair.py: drawers collection name now resolves from MempalaceConfig().collection_name via _drawers_collection_name() (closets stays fixed by design — AAAK index references drawer IDs by string). Lines up with the broader configured-collection work in #1312 so that PR can rebase cleanly on top. - repair.py: create_collection() moved inside the try block in _rebuild_one_collection so a Chroma "Collection already exists" failure surfaces as RebuildPartialError with archive_path, not an unstructured exception that strands the user without recovery instructions. - repair.py: rebuild_from_sqlite wraps backend lifetime in try/finally with backend.close() so PersistentClient handles to dest_palace are released on every exit path. The from-sqlite path post-dates #1285's lifecycle hardening of the legacy rebuild, so this needed its own cleanup. - cli.py: cmd_repair (from-sqlite mode) now exits non-zero when rebuild_from_sqlite returns {} (validation refusal sentinel), so unattended scripts/CI distinguish "invalid inputs" from a successful rebuild that legitimately found zero rows. - tests/test_repair.py: test_extract_via_sqlite_returns_all_rows_with_metadata now asserts every backing segment is scope='METADATA', locking in the segment-layout assumption against future regressions that point the JOIN at the VECTOR segment. New test coverage: - test_rebuild_from_sqlite_honors_configured_drawer_collection_name - test_cmd_repair_from_sqlite_validation_refusal_exits_nonzero - test_cmd_repair_from_sqlite_success_does_not_exit Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
committed by
Igor Lins e Silva
parent
cb6bfd5231
commit
d92c741084
+10
-1
@@ -699,7 +699,7 @@ def cmd_repair(args):
|
||||
return
|
||||
|
||||
try:
|
||||
rebuild_from_sqlite(
|
||||
counts = rebuild_from_sqlite(
|
||||
source_palace=source_path,
|
||||
dest_palace=palace_path,
|
||||
archive_existing_dest=archive_existing,
|
||||
@@ -713,6 +713,15 @@ def cmd_repair(args):
|
||||
f"Failed in collection: {exc.failed_collection}"
|
||||
)
|
||||
sys.exit(1)
|
||||
# An empty counts dict is rebuild_from_sqlite's documented signal
|
||||
# for a validation refusal (missing source, existing dest,
|
||||
# in-place without --archive-existing). The library already
|
||||
# printed an actionable message; exit non-zero so unattended
|
||||
# scripts/CI distinguish "invalid inputs" from a successful
|
||||
# rebuild that legitimately found zero rows (which still returns
|
||||
# a populated dict with 0-valued counts).
|
||||
if not counts:
|
||||
sys.exit(1)
|
||||
return
|
||||
|
||||
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||
|
||||
+70
-11
@@ -43,11 +43,46 @@ from .backends.chroma import ChromaBackend, hnsw_capacity_status
|
||||
|
||||
COLLECTION_NAME = "mempalace_drawers"
|
||||
|
||||
# Collections rebuilt by ``rebuild_from_sqlite``. Order matters for the
|
||||
# upsert pass — drawers carry the bulk of the data, closets are the AAAK
|
||||
# index layer and reference drawer IDs by string in their documents (no
|
||||
# foreign-key validation, so ordering is informational, not load-bearing).
|
||||
RECOVERABLE_COLLECTIONS = ("mempalace_drawers", "mempalace_closets")
|
||||
# The closets collection (AAAK index layer) is intentionally fixed —
|
||||
# closets reference drawer IDs by string and live alongside drawers in the
|
||||
# same palace; renaming the closets collection per-deployment would break
|
||||
# cross-palace AAAK lookups. Drawer collection name comes from config
|
||||
# (see ``_recoverable_collections``).
|
||||
CLOSETS_COLLECTION_NAME = "mempalace_closets"
|
||||
|
||||
|
||||
def _drawers_collection_name() -> str:
|
||||
"""Resolve the drawers collection name from user config, falling back
|
||||
to the module default ``COLLECTION_NAME`` if config is unreadable.
|
||||
|
||||
Recovery flows must honor ``MempalaceConfig().collection_name`` so a
|
||||
user with a non-default drawer collection (e.g. multi-palace setups)
|
||||
rebuilds the right rows. Closets remain fixed — see
|
||||
``CLOSETS_COLLECTION_NAME``.
|
||||
"""
|
||||
try:
|
||||
from .config import MempalaceConfig
|
||||
|
||||
return MempalaceConfig().collection_name or COLLECTION_NAME
|
||||
except Exception:
|
||||
return COLLECTION_NAME
|
||||
|
||||
|
||||
def _recoverable_collections() -> tuple[str, ...]:
|
||||
"""Collections rebuilt by ``rebuild_from_sqlite``, in upsert order.
|
||||
|
||||
Drawers first (bulk data), then closets (AAAK index layer that
|
||||
references drawer IDs by string in their documents — no
|
||||
foreign-key validation, so ordering is informational, not
|
||||
load-bearing).
|
||||
"""
|
||||
return (_drawers_collection_name(), CLOSETS_COLLECTION_NAME)
|
||||
|
||||
|
||||
# Back-compat alias for callers that imported the constant. New code
|
||||
# should call ``_recoverable_collections()`` so config changes are picked
|
||||
# up at call time.
|
||||
RECOVERABLE_COLLECTIONS = (COLLECTION_NAME, CLOSETS_COLLECTION_NAME)
|
||||
|
||||
|
||||
def _get_palace_path():
|
||||
@@ -487,12 +522,11 @@ def _rebuild_one_collection(
|
||||
caller can stop the loop and print recovery instructions instead of
|
||||
silently shipping a partial palace.
|
||||
"""
|
||||
col = backend.create_collection(dest_palace, collection_name)
|
||||
|
||||
ids: list[str] = []
|
||||
docs: list[str] = []
|
||||
metas: list[dict] = []
|
||||
upserted = 0
|
||||
col = None
|
||||
|
||||
def _flush() -> int:
|
||||
nonlocal upserted
|
||||
@@ -507,6 +541,14 @@ def _rebuild_one_collection(
|
||||
return upserted
|
||||
|
||||
try:
|
||||
# ``create_collection`` lives inside the try so a Chroma-side
|
||||
# "Collection already exists" failure (which can happen when the
|
||||
# process-wide System cache still holds a pre-archive schema) is
|
||||
# reported as a structured ``RebuildPartialError`` carrying
|
||||
# ``archive_path`` — instead of an unstructured exception that
|
||||
# strands the user without recovery instructions.
|
||||
col = backend.create_collection(dest_palace, collection_name)
|
||||
|
||||
for emb_id, doc, meta in extract_via_sqlite(source_palace, collection_name):
|
||||
ids.append(emb_id)
|
||||
docs.append(doc or "")
|
||||
@@ -664,8 +706,14 @@ def rebuild_from_sqlite(
|
||||
|
||||
Returns a ``{collection_name: row_count}`` dict so callers (CLI,
|
||||
tests) can verify the per-collection rebuild count without parsing
|
||||
stdout. Returns ``{}`` on validation failures (missing source,
|
||||
refusing to overwrite). Raises :class:`RebuildPartialError` if a
|
||||
stdout. A successful rebuild always returns a dict with one key per
|
||||
recoverable collection (values may be ``0`` when a collection is
|
||||
legitimately empty in the source). The empty dict ``{}`` is reserved
|
||||
for validation refusals (missing source DB, refusing to overwrite an
|
||||
existing dest, in-place mode without ``archive_existing_dest``); CLI
|
||||
callers should treat ``{}`` as an error and exit non-zero so CI and
|
||||
scripts can distinguish "invalid inputs" from "successful recovery
|
||||
that found zero rows." Raises :class:`RebuildPartialError` if a
|
||||
chromadb upsert fails partway through; the dest palace is left in
|
||||
place so the user can inspect what landed, and the in-place archive
|
||||
(when applicable) is reported in the error so the user can re-run
|
||||
@@ -765,10 +813,19 @@ def rebuild_from_sqlite(
|
||||
|
||||
os.makedirs(dest_palace, exist_ok=True)
|
||||
|
||||
# Backend lifetime is wrapped in try/finally so the dest palace's
|
||||
# PersistentClient handle (opened lazily inside ``create_collection``
|
||||
# / ``get_collection``) is released on every exit path: success,
|
||||
# ``RebuildPartialError``, or any unexpected exception. Without this,
|
||||
# a long-running process that calls ``rebuild_from_sqlite`` would
|
||||
# leak SQLite/HNSW file handles into Chroma's ``SharedSystemClient``
|
||||
# cache, surfacing later as "Collection already exists" on the next
|
||||
# in-place rebuild or as a Windows file-lock failure on cleanup
|
||||
# (cf. #1285's lifecycle hardening for the legacy rebuild path).
|
||||
backend = ChromaBackend()
|
||||
counts: dict[str, int] = {}
|
||||
|
||||
for cname in RECOVERABLE_COLLECTIONS:
|
||||
try:
|
||||
for cname in _recoverable_collections():
|
||||
print(f"\n [{cname}]")
|
||||
upserted = _rebuild_one_collection(
|
||||
backend=backend,
|
||||
@@ -790,6 +847,8 @@ def rebuild_from_sqlite(
|
||||
print(f" Original palace archived at: {archive_path}")
|
||||
print(f"{'=' * 55}\n")
|
||||
return counts
|
||||
finally:
|
||||
backend.close()
|
||||
|
||||
|
||||
def status(palace_path=None) -> dict:
|
||||
|
||||
@@ -1097,3 +1097,64 @@ def test_reconfigure_stdio_is_noop_off_windows():
|
||||
_reconfigure_stdio_utf8_on_windows()
|
||||
|
||||
assert stdin.reconfigure_calls == []
|
||||
|
||||
|
||||
# ── cmd_repair: from-sqlite mode exit codes ──────────────────────────
|
||||
|
||||
|
||||
@patch("mempalace.cli.MempalaceConfig")
|
||||
def test_cmd_repair_from_sqlite_validation_refusal_exits_nonzero(mock_config_cls, tmp_path, capsys):
|
||||
"""When ``rebuild_from_sqlite`` returns ``{}`` for a validation
|
||||
refusal (missing source DB, in-place without --archive-existing,
|
||||
refusing to overwrite an existing dest), the CLI must surface a
|
||||
non-zero exit so unattended scripts and CI distinguish "invalid
|
||||
inputs" from "successful recovery that found zero rows."
|
||||
|
||||
Catches: a regression where the CLI treats the validation-refusal
|
||||
sentinel as success, leaving CI green on a no-op repair that should
|
||||
have alerted an operator.
|
||||
"""
|
||||
palace_dir = tmp_path / "palace"
|
||||
palace_dir.mkdir()
|
||||
mock_config_cls.return_value.palace_path = str(palace_dir)
|
||||
|
||||
args = argparse.Namespace(
|
||||
palace=str(palace_dir),
|
||||
mode="from-sqlite",
|
||||
source=None,
|
||||
archive_existing=False,
|
||||
yes=True,
|
||||
)
|
||||
with patch("mempalace.repair.rebuild_from_sqlite", return_value={}):
|
||||
with pytest.raises(SystemExit) as excinfo:
|
||||
cmd_repair(args)
|
||||
assert excinfo.value.code == 1
|
||||
|
||||
|
||||
@patch("mempalace.cli.MempalaceConfig")
|
||||
def test_cmd_repair_from_sqlite_success_does_not_exit(mock_config_cls, tmp_path):
|
||||
"""A successful from-sqlite rebuild — even one that finds zero rows
|
||||
in a legitimately empty source palace — must NOT call ``sys.exit``.
|
||||
A populated counts dict (with ``0`` values) is the success signal;
|
||||
only the empty dict ``{}`` is reserved for validation refusal.
|
||||
|
||||
Catches: a regression where ``if not counts`` is replaced by
|
||||
``if not sum(counts.values())`` or similar, conflating "empty source"
|
||||
with "validation refused" and breaking idempotent recovery scripts.
|
||||
"""
|
||||
palace_dir = tmp_path / "palace"
|
||||
palace_dir.mkdir()
|
||||
mock_config_cls.return_value.palace_path = str(palace_dir)
|
||||
|
||||
args = argparse.Namespace(
|
||||
palace=str(palace_dir),
|
||||
mode="from-sqlite",
|
||||
source=None,
|
||||
archive_existing=False,
|
||||
yes=True,
|
||||
)
|
||||
# Zero rows but per-collection keys present → success, no exit.
|
||||
fake_counts = {"mempalace_drawers": 0, "mempalace_closets": 0}
|
||||
with patch("mempalace.repair.rebuild_from_sqlite", return_value=fake_counts):
|
||||
# Should return cleanly; no SystemExit raised.
|
||||
cmd_repair(args)
|
||||
|
||||
@@ -719,6 +719,13 @@ def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
|
||||
Catches: anyone who breaks the segments/embeddings/embedding_metadata
|
||||
JOIN, swaps the metadata vs vector segment, or changes how the
|
||||
document is stored under the ``chroma:document`` key.
|
||||
|
||||
Also asserts every embedding row underlying the extraction lives in
|
||||
a ``segments.scope = 'METADATA'`` segment. Document + metadata rows
|
||||
are stored under METADATA in Chroma's segment layout while HNSW
|
||||
files live under ``VECTOR``; locking that assumption in here means a
|
||||
future refactor that accidentally points the JOIN at ``VECTOR``
|
||||
fails this test instead of silently regressing the recovery path.
|
||||
"""
|
||||
rows = [
|
||||
(f"drawer_{i:03d}", f"document body {i}", {"wing": "test_wing", "room": f"r{i % 3}"})
|
||||
@@ -736,6 +743,35 @@ def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
|
||||
assert got_doc == doc, f"document mangled for {emb_id}"
|
||||
assert got_meta == meta, f"metadata mangled for {emb_id}: {got_meta!r}"
|
||||
|
||||
# Lock the segment-scope assumption directly against Chroma's on-disk
|
||||
# layout so a future change that points the extraction JOIN at the
|
||||
# VECTOR segment cannot pass this test. Query each extracted row's
|
||||
# backing segment scope via the same SQLite tables ``extract_via_sqlite``
|
||||
# reads from.
|
||||
sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3")
|
||||
conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
|
||||
try:
|
||||
scopes = {
|
||||
scope
|
||||
for (scope,) in conn.execute(
|
||||
"""
|
||||
SELECT DISTINCT s.scope
|
||||
FROM embeddings e
|
||||
JOIN segments s ON e.segment_id = s.id
|
||||
JOIN collections c ON s.collection = c.id
|
||||
WHERE c.name = ? AND e.embedding_id IN ({})
|
||||
""".format(",".join("?" * len(extracted))),
|
||||
("mempalace_drawers", *(emb_id for emb_id, _, _ in extracted)),
|
||||
)
|
||||
}
|
||||
finally:
|
||||
conn.close()
|
||||
assert scopes == {"METADATA"}, (
|
||||
f"extraction is reading from segments scoped {scopes!r}; only "
|
||||
"'METADATA' should back the document/metadata rows. If Chroma's "
|
||||
"segment layout changed, update extract_via_sqlite's WHERE clause."
|
||||
)
|
||||
|
||||
|
||||
def test_extract_via_sqlite_preserves_typed_metadata(tmp_path):
|
||||
"""Chromadb stores int / float / bool / string in distinct typed
|
||||
@@ -973,3 +1009,56 @@ def test_rebuild_from_sqlite_raises_on_upsert_failure(tmp_path, monkeypatch):
|
||||
assert err.archive_path is not None
|
||||
assert os.path.isfile(os.path.join(err.archive_path, "chroma.sqlite3"))
|
||||
assert err.dest_palace == os.path.abspath(str(palace))
|
||||
|
||||
|
||||
def test_rebuild_from_sqlite_honors_configured_drawer_collection_name(tmp_path, monkeypatch):
|
||||
"""A user with a non-default drawers collection name (set via
|
||||
``MempalaceConfig().collection_name``) must have THAT collection
|
||||
rebuilt — not the hardcoded ``mempalace_drawers``.
|
||||
|
||||
Catches: a regression where the recovery path silently rebuilds the
|
||||
default-name collection on a custom-named palace, leaving the user's
|
||||
actual data unrebuilt while reporting "rebuild complete." This is
|
||||
the failure mode reviewer mjc flagged on PR #1310 as needing to line
|
||||
up with the configured-collection-name work in #1312. Closets stay
|
||||
fixed (``mempalace_closets``) by design — the AAAK index references
|
||||
drawer IDs by string and is not per-deployment configurable.
|
||||
|
||||
Strategy: monkeypatch the lazy resolver so the test is hermetic and
|
||||
does not depend on the global config file or env state.
|
||||
"""
|
||||
from mempalace.backends.chroma import ChromaBackend
|
||||
|
||||
custom_drawers = "custom_drawers_xyz"
|
||||
monkeypatch.setattr(repair, "_drawers_collection_name", lambda: custom_drawers)
|
||||
|
||||
source = tmp_path / "source"
|
||||
dest = tmp_path / "dest"
|
||||
|
||||
drawer_rows = [(f"d{i}", f"body {i}", {"wing": "alpha"}) for i in range(3)]
|
||||
closet_rows = [("closet_a", "abbrev →d0", {"wing": "alpha"})]
|
||||
_seed_palace(source, custom_drawers, drawer_rows)
|
||||
_seed_palace(source, "mempalace_closets", closet_rows)
|
||||
|
||||
counts = repair.rebuild_from_sqlite(str(source), str(dest))
|
||||
|
||||
# Rebuilt under the custom name, not under the default "mempalace_drawers".
|
||||
assert counts == {custom_drawers: 3, "mempalace_closets": 1}
|
||||
|
||||
backend = ChromaBackend()
|
||||
rebuilt_drawers = backend.get_collection(str(dest), custom_drawers)
|
||||
assert rebuilt_drawers.count() == 3
|
||||
|
||||
# Default-name collection must NOT exist in dest — proves we did not
|
||||
# silently fall back to the hardcoded name during rebuild.
|
||||
try:
|
||||
rebuilt_default = backend.get_collection(str(dest), "mempalace_drawers")
|
||||
# If get_collection returns without raising, count() should be 0
|
||||
# (chromadb may auto-create on get with some EFs); a non-zero
|
||||
# count would mean we wrote rows to the wrong collection.
|
||||
assert rebuilt_default.count() == 0, (
|
||||
"rebuild leaked rows into the default-name collection on a "
|
||||
"custom-name palace — recovery wrote to the wrong collection."
|
||||
)
|
||||
except Exception:
|
||||
pass # Expected: collection wasn't created.
|
||||
|
||||
Reference in New Issue
Block a user