From d92c741084c7a3d276ecd31c483931ff468bdd35 Mon Sep 17 00:00:00 2001 From: Brian potter Date: Sat, 2 May 2026 12:12:08 -0500 Subject: [PATCH] fix(repair): address PR #1310 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five small hardening fixes for the from-sqlite rebuild path, all from mjc's review on #1310: - repair.py: drawers collection name now resolves from MempalaceConfig().collection_name via _drawers_collection_name() (closets stays fixed by design — AAAK index references drawer IDs by string). Lines up with the broader configured-collection work in #1312 so that PR can rebase cleanly on top. - repair.py: create_collection() moved inside the try block in _rebuild_one_collection so a Chroma "Collection already exists" failure surfaces as RebuildPartialError with archive_path, not an unstructured exception that strands the user without recovery instructions. - repair.py: rebuild_from_sqlite wraps backend lifetime in try/finally with backend.close() so PersistentClient handles to dest_palace are released on every exit path. The from-sqlite path post-dates #1285's lifecycle hardening of the legacy rebuild, so this needed its own cleanup. - cli.py: cmd_repair (from-sqlite mode) now exits non-zero when rebuild_from_sqlite returns {} (validation refusal sentinel), so unattended scripts/CI distinguish "invalid inputs" from a successful rebuild that legitimately found zero rows. - tests/test_repair.py: test_extract_via_sqlite_returns_all_rows_with_metadata now asserts every backing segment is scope='METADATA', locking in the segment-layout assumption against future regressions that point the JOIN at the VECTOR segment. New test coverage: - test_rebuild_from_sqlite_honors_configured_drawer_collection_name - test_cmd_repair_from_sqlite_validation_refusal_exits_nonzero - test_cmd_repair_from_sqlite_success_does_not_exit Co-Authored-By: Claude Opus 4.7 (1M context) --- mempalace/cli.py | 11 +++- mempalace/repair.py | 121 ++++++++++++++++++++++++++++++++----------- tests/test_cli.py | 61 ++++++++++++++++++++++ tests/test_repair.py | 89 +++++++++++++++++++++++++++++++ 4 files changed, 250 insertions(+), 32 deletions(-) diff --git a/mempalace/cli.py b/mempalace/cli.py index 468b765..95915eb 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -699,7 +699,7 @@ def cmd_repair(args): return try: - rebuild_from_sqlite( + counts = rebuild_from_sqlite( source_palace=source_path, dest_palace=palace_path, archive_existing_dest=archive_existing, @@ -713,6 +713,15 @@ def cmd_repair(args): f"Failed in collection: {exc.failed_collection}" ) sys.exit(1) + # An empty counts dict is rebuild_from_sqlite's documented signal + # for a validation refusal (missing source, existing dest, + # in-place without --archive-existing). The library already + # printed an actionable message; exit non-zero so unattended + # scripts/CI distinguish "invalid inputs" from a successful + # rebuild that legitimately found zero rows (which still returns + # a populated dict with 0-valued counts). + if not counts: + sys.exit(1) return db_path = os.path.join(palace_path, "chroma.sqlite3") diff --git a/mempalace/repair.py b/mempalace/repair.py index 7e98f0f..90fb0cf 100644 --- a/mempalace/repair.py +++ b/mempalace/repair.py @@ -43,11 +43,46 @@ from .backends.chroma import ChromaBackend, hnsw_capacity_status COLLECTION_NAME = "mempalace_drawers" -# Collections rebuilt by ``rebuild_from_sqlite``. Order matters for the -# upsert pass — drawers carry the bulk of the data, closets are the AAAK -# index layer and reference drawer IDs by string in their documents (no -# foreign-key validation, so ordering is informational, not load-bearing). -RECOVERABLE_COLLECTIONS = ("mempalace_drawers", "mempalace_closets") +# The closets collection (AAAK index layer) is intentionally fixed — +# closets reference drawer IDs by string and live alongside drawers in the +# same palace; renaming the closets collection per-deployment would break +# cross-palace AAAK lookups. Drawer collection name comes from config +# (see ``_recoverable_collections``). +CLOSETS_COLLECTION_NAME = "mempalace_closets" + + +def _drawers_collection_name() -> str: + """Resolve the drawers collection name from user config, falling back + to the module default ``COLLECTION_NAME`` if config is unreadable. + + Recovery flows must honor ``MempalaceConfig().collection_name`` so a + user with a non-default drawer collection (e.g. multi-palace setups) + rebuilds the right rows. Closets remain fixed — see + ``CLOSETS_COLLECTION_NAME``. + """ + try: + from .config import MempalaceConfig + + return MempalaceConfig().collection_name or COLLECTION_NAME + except Exception: + return COLLECTION_NAME + + +def _recoverable_collections() -> tuple[str, ...]: + """Collections rebuilt by ``rebuild_from_sqlite``, in upsert order. + + Drawers first (bulk data), then closets (AAAK index layer that + references drawer IDs by string in their documents — no + foreign-key validation, so ordering is informational, not + load-bearing). + """ + return (_drawers_collection_name(), CLOSETS_COLLECTION_NAME) + + +# Back-compat alias for callers that imported the constant. New code +# should call ``_recoverable_collections()`` so config changes are picked +# up at call time. +RECOVERABLE_COLLECTIONS = (COLLECTION_NAME, CLOSETS_COLLECTION_NAME) def _get_palace_path(): @@ -487,12 +522,11 @@ def _rebuild_one_collection( caller can stop the loop and print recovery instructions instead of silently shipping a partial palace. """ - col = backend.create_collection(dest_palace, collection_name) - ids: list[str] = [] docs: list[str] = [] metas: list[dict] = [] upserted = 0 + col = None def _flush() -> int: nonlocal upserted @@ -507,6 +541,14 @@ def _rebuild_one_collection( return upserted try: + # ``create_collection`` lives inside the try so a Chroma-side + # "Collection already exists" failure (which can happen when the + # process-wide System cache still holds a pre-archive schema) is + # reported as a structured ``RebuildPartialError`` carrying + # ``archive_path`` — instead of an unstructured exception that + # strands the user without recovery instructions. + col = backend.create_collection(dest_palace, collection_name) + for emb_id, doc, meta in extract_via_sqlite(source_palace, collection_name): ids.append(emb_id) docs.append(doc or "") @@ -664,8 +706,14 @@ def rebuild_from_sqlite( Returns a ``{collection_name: row_count}`` dict so callers (CLI, tests) can verify the per-collection rebuild count without parsing - stdout. Returns ``{}`` on validation failures (missing source, - refusing to overwrite). Raises :class:`RebuildPartialError` if a + stdout. A successful rebuild always returns a dict with one key per + recoverable collection (values may be ``0`` when a collection is + legitimately empty in the source). The empty dict ``{}`` is reserved + for validation refusals (missing source DB, refusing to overwrite an + existing dest, in-place mode without ``archive_existing_dest``); CLI + callers should treat ``{}`` as an error and exit non-zero so CI and + scripts can distinguish "invalid inputs" from "successful recovery + that found zero rows." Raises :class:`RebuildPartialError` if a chromadb upsert fails partway through; the dest palace is left in place so the user can inspect what landed, and the in-place archive (when applicable) is reported in the error so the user can re-run @@ -765,31 +813,42 @@ def rebuild_from_sqlite( os.makedirs(dest_palace, exist_ok=True) + # Backend lifetime is wrapped in try/finally so the dest palace's + # PersistentClient handle (opened lazily inside ``create_collection`` + # / ``get_collection``) is released on every exit path: success, + # ``RebuildPartialError``, or any unexpected exception. Without this, + # a long-running process that calls ``rebuild_from_sqlite`` would + # leak SQLite/HNSW file handles into Chroma's ``SharedSystemClient`` + # cache, surfacing later as "Collection already exists" on the next + # in-place rebuild or as a Windows file-lock failure on cleanup + # (cf. #1285's lifecycle hardening for the legacy rebuild path). backend = ChromaBackend() counts: dict[str, int] = {} + try: + for cname in _recoverable_collections(): + print(f"\n [{cname}]") + upserted = _rebuild_one_collection( + backend=backend, + source_palace=source_palace, + dest_palace=dest_palace, + collection_name=cname, + batch_size=batch_size, + archive_path=archive_path, + counts_so_far=counts, + ) + counts[cname] = upserted + if upserted == 0: + print(f" no rows found for {cname} in source palace") + else: + print(f" done: {upserted} rows in {cname}") - for cname in RECOVERABLE_COLLECTIONS: - print(f"\n [{cname}]") - upserted = _rebuild_one_collection( - backend=backend, - source_palace=source_palace, - dest_palace=dest_palace, - collection_name=cname, - batch_size=batch_size, - archive_path=archive_path, - counts_so_far=counts, - ) - counts[cname] = upserted - if upserted == 0: - print(f" no rows found for {cname} in source palace") - else: - print(f" done: {upserted} rows in {cname}") - - print(f"\n Rebuild complete. {sum(counts.values())} total rows.") - if archive_path is not None: - print(f" Original palace archived at: {archive_path}") - print(f"{'=' * 55}\n") - return counts + print(f"\n Rebuild complete. {sum(counts.values())} total rows.") + if archive_path is not None: + print(f" Original palace archived at: {archive_path}") + print(f"{'=' * 55}\n") + return counts + finally: + backend.close() def status(palace_path=None) -> dict: diff --git a/tests/test_cli.py b/tests/test_cli.py index 6b4b7b3..71ca63d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1097,3 +1097,64 @@ def test_reconfigure_stdio_is_noop_off_windows(): _reconfigure_stdio_utf8_on_windows() assert stdin.reconfigure_calls == [] + + +# ── cmd_repair: from-sqlite mode exit codes ────────────────────────── + + +@patch("mempalace.cli.MempalaceConfig") +def test_cmd_repair_from_sqlite_validation_refusal_exits_nonzero(mock_config_cls, tmp_path, capsys): + """When ``rebuild_from_sqlite`` returns ``{}`` for a validation + refusal (missing source DB, in-place without --archive-existing, + refusing to overwrite an existing dest), the CLI must surface a + non-zero exit so unattended scripts and CI distinguish "invalid + inputs" from "successful recovery that found zero rows." + + Catches: a regression where the CLI treats the validation-refusal + sentinel as success, leaving CI green on a no-op repair that should + have alerted an operator. + """ + palace_dir = tmp_path / "palace" + palace_dir.mkdir() + mock_config_cls.return_value.palace_path = str(palace_dir) + + args = argparse.Namespace( + palace=str(palace_dir), + mode="from-sqlite", + source=None, + archive_existing=False, + yes=True, + ) + with patch("mempalace.repair.rebuild_from_sqlite", return_value={}): + with pytest.raises(SystemExit) as excinfo: + cmd_repair(args) + assert excinfo.value.code == 1 + + +@patch("mempalace.cli.MempalaceConfig") +def test_cmd_repair_from_sqlite_success_does_not_exit(mock_config_cls, tmp_path): + """A successful from-sqlite rebuild — even one that finds zero rows + in a legitimately empty source palace — must NOT call ``sys.exit``. + A populated counts dict (with ``0`` values) is the success signal; + only the empty dict ``{}`` is reserved for validation refusal. + + Catches: a regression where ``if not counts`` is replaced by + ``if not sum(counts.values())`` or similar, conflating "empty source" + with "validation refused" and breaking idempotent recovery scripts. + """ + palace_dir = tmp_path / "palace" + palace_dir.mkdir() + mock_config_cls.return_value.palace_path = str(palace_dir) + + args = argparse.Namespace( + palace=str(palace_dir), + mode="from-sqlite", + source=None, + archive_existing=False, + yes=True, + ) + # Zero rows but per-collection keys present → success, no exit. + fake_counts = {"mempalace_drawers": 0, "mempalace_closets": 0} + with patch("mempalace.repair.rebuild_from_sqlite", return_value=fake_counts): + # Should return cleanly; no SystemExit raised. + cmd_repair(args) diff --git a/tests/test_repair.py b/tests/test_repair.py index 35e6a44..8ca72fb 100644 --- a/tests/test_repair.py +++ b/tests/test_repair.py @@ -719,6 +719,13 @@ def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path): Catches: anyone who breaks the segments/embeddings/embedding_metadata JOIN, swaps the metadata vs vector segment, or changes how the document is stored under the ``chroma:document`` key. + + Also asserts every embedding row underlying the extraction lives in + a ``segments.scope = 'METADATA'`` segment. Document + metadata rows + are stored under METADATA in Chroma's segment layout while HNSW + files live under ``VECTOR``; locking that assumption in here means a + future refactor that accidentally points the JOIN at ``VECTOR`` + fails this test instead of silently regressing the recovery path. """ rows = [ (f"drawer_{i:03d}", f"document body {i}", {"wing": "test_wing", "room": f"r{i % 3}"}) @@ -736,6 +743,35 @@ def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path): assert got_doc == doc, f"document mangled for {emb_id}" assert got_meta == meta, f"metadata mangled for {emb_id}: {got_meta!r}" + # Lock the segment-scope assumption directly against Chroma's on-disk + # layout so a future change that points the extraction JOIN at the + # VECTOR segment cannot pass this test. Query each extracted row's + # backing segment scope via the same SQLite tables ``extract_via_sqlite`` + # reads from. + sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3") + conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True) + try: + scopes = { + scope + for (scope,) in conn.execute( + """ + SELECT DISTINCT s.scope + FROM embeddings e + JOIN segments s ON e.segment_id = s.id + JOIN collections c ON s.collection = c.id + WHERE c.name = ? AND e.embedding_id IN ({}) + """.format(",".join("?" * len(extracted))), + ("mempalace_drawers", *(emb_id for emb_id, _, _ in extracted)), + ) + } + finally: + conn.close() + assert scopes == {"METADATA"}, ( + f"extraction is reading from segments scoped {scopes!r}; only " + "'METADATA' should back the document/metadata rows. If Chroma's " + "segment layout changed, update extract_via_sqlite's WHERE clause." + ) + def test_extract_via_sqlite_preserves_typed_metadata(tmp_path): """Chromadb stores int / float / bool / string in distinct typed @@ -973,3 +1009,56 @@ def test_rebuild_from_sqlite_raises_on_upsert_failure(tmp_path, monkeypatch): assert err.archive_path is not None assert os.path.isfile(os.path.join(err.archive_path, "chroma.sqlite3")) assert err.dest_palace == os.path.abspath(str(palace)) + + +def test_rebuild_from_sqlite_honors_configured_drawer_collection_name(tmp_path, monkeypatch): + """A user with a non-default drawers collection name (set via + ``MempalaceConfig().collection_name``) must have THAT collection + rebuilt — not the hardcoded ``mempalace_drawers``. + + Catches: a regression where the recovery path silently rebuilds the + default-name collection on a custom-named palace, leaving the user's + actual data unrebuilt while reporting "rebuild complete." This is + the failure mode reviewer mjc flagged on PR #1310 as needing to line + up with the configured-collection-name work in #1312. Closets stay + fixed (``mempalace_closets``) by design — the AAAK index references + drawer IDs by string and is not per-deployment configurable. + + Strategy: monkeypatch the lazy resolver so the test is hermetic and + does not depend on the global config file or env state. + """ + from mempalace.backends.chroma import ChromaBackend + + custom_drawers = "custom_drawers_xyz" + monkeypatch.setattr(repair, "_drawers_collection_name", lambda: custom_drawers) + + source = tmp_path / "source" + dest = tmp_path / "dest" + + drawer_rows = [(f"d{i}", f"body {i}", {"wing": "alpha"}) for i in range(3)] + closet_rows = [("closet_a", "abbrev →d0", {"wing": "alpha"})] + _seed_palace(source, custom_drawers, drawer_rows) + _seed_palace(source, "mempalace_closets", closet_rows) + + counts = repair.rebuild_from_sqlite(str(source), str(dest)) + + # Rebuilt under the custom name, not under the default "mempalace_drawers". + assert counts == {custom_drawers: 3, "mempalace_closets": 1} + + backend = ChromaBackend() + rebuilt_drawers = backend.get_collection(str(dest), custom_drawers) + assert rebuilt_drawers.count() == 3 + + # Default-name collection must NOT exist in dest — proves we did not + # silently fall back to the hardcoded name during rebuild. + try: + rebuilt_default = backend.get_collection(str(dest), "mempalace_drawers") + # If get_collection returns without raising, count() should be 0 + # (chromadb may auto-create on get with some EFs); a non-zero + # count would mean we wrote rows to the wrong collection. + assert rebuilt_default.count() == 0, ( + "rebuild leaked rows into the default-name collection on a " + "custom-name palace — recovery wrote to the wrong collection." + ) + except Exception: + pass # Expected: collection wasn't created.