fix(repair): address PR #1310 review feedback

Five small hardening fixes for the from-sqlite rebuild path, all from
mjc's review on #1310:

- repair.py: the drawers collection name now resolves from
  MempalaceConfig().collection_name via _drawers_collection_name() (the
  closets collection name stays fixed by design — the AAAK index
  references drawer IDs by string). This lines up with the broader
  configured-collection work in #1312, so that PR can rebase cleanly on
  top of this one.
- repair.py: create_collection() moved inside the try block in
  _rebuild_one_collection so a Chroma "Collection already exists" failure
  surfaces as RebuildPartialError with archive_path, not an unstructured
  exception that strands the user without recovery instructions.
- repair.py: rebuild_from_sqlite wraps backend lifetime in try/finally
  with backend.close() so PersistentClient handles to dest_palace are
  released on every exit path. The from-sqlite path post-dates #1285's
  lifecycle hardening of the legacy rebuild, so this needed its own
  cleanup.
- cli.py: cmd_repair (from-sqlite mode) now exits non-zero when
  rebuild_from_sqlite returns {} (validation refusal sentinel), so
  unattended scripts/CI distinguish "invalid inputs" from a successful
  rebuild that legitimately found zero rows.
- tests/test_repair.py: test_extract_via_sqlite_returns_all_rows_with_metadata
  now asserts every backing segment is scope='METADATA', locking in the
  segment-layout assumption against future regressions that point the
  JOIN at the VECTOR segment.

New test coverage:
- test_rebuild_from_sqlite_honors_configured_drawer_collection_name
- test_cmd_repair_from_sqlite_validation_refusal_exits_nonzero
- test_cmd_repair_from_sqlite_success_does_not_exit

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Brian potter
2026-05-02 12:12:08 -05:00
committed by Igor Lins e Silva
parent cb6bfd5231
commit d92c741084
4 changed files with 250 additions and 32 deletions
+10 -1
View File
@@ -699,7 +699,7 @@ def cmd_repair(args):
return
try:
rebuild_from_sqlite(
counts = rebuild_from_sqlite(
source_palace=source_path,
dest_palace=palace_path,
archive_existing_dest=archive_existing,
@@ -713,6 +713,15 @@ def cmd_repair(args):
f"Failed in collection: {exc.failed_collection}"
)
sys.exit(1)
# An empty counts dict is rebuild_from_sqlite's documented signal
# for a validation refusal (missing source, existing dest,
# in-place without --archive-existing). The library already
# printed an actionable message; exit non-zero so unattended
# scripts/CI distinguish "invalid inputs" from a successful
# rebuild that legitimately found zero rows (which still returns
# a populated dict with 0-valued counts).
if not counts:
sys.exit(1)
return
db_path = os.path.join(palace_path, "chroma.sqlite3")
+70 -11
View File
@@ -43,11 +43,46 @@ from .backends.chroma import ChromaBackend, hnsw_capacity_status
COLLECTION_NAME = "mempalace_drawers"
# Collections rebuilt by ``rebuild_from_sqlite``. Order matters for the
# upsert pass — drawers carry the bulk of the data, closets are the AAAK
# index layer and reference drawer IDs by string in their documents (no
# foreign-key validation, so ordering is informational, not load-bearing).
RECOVERABLE_COLLECTIONS = ("mempalace_drawers", "mempalace_closets")
# The closets collection (AAAK index layer) is intentionally fixed —
# closets reference drawer IDs by string and live alongside drawers in the
# same palace; renaming the closets collection per-deployment would break
# cross-palace AAAK lookups. Drawer collection name comes from config
# (see ``_recoverable_collections``).
CLOSETS_COLLECTION_NAME = "mempalace_closets"
def _drawers_collection_name() -> str:
    """Return the drawers collection name that recovery should target.

    The name is read from ``MempalaceConfig().collection_name`` so that a
    palace configured with a non-default drawers collection has the right
    rows rebuilt. The module default ``COLLECTION_NAME`` is used instead
    when the configured value is empty/falsy, or when importing,
    constructing, or reading the config raises any ``Exception``.

    The closets collection is not resolved here; it stays fixed at
    ``CLOSETS_COLLECTION_NAME``.
    """
    resolved = COLLECTION_NAME
    try:
        # Imported lazily so a broken or missing config module degrades to
        # the default name instead of failing the recovery path.
        from .config import MempalaceConfig

        configured = MempalaceConfig().collection_name
        if configured:
            resolved = configured
    except Exception:
        # Deliberate best-effort: recovery must still run with an
        # unreadable config, so fall back to the module default.
        resolved = COLLECTION_NAME
    return resolved
def _recoverable_collections() -> tuple[str, ...]:
    """Return the collections ``rebuild_from_sqlite`` rebuilds, in order.

    The drawers collection (bulk data, name resolved at call time via
    ``_drawers_collection_name``) comes first, followed by the fixed
    closets collection (the AAAK index layer). Closets reference drawer
    IDs by string inside their documents and nothing validates those
    references, so the ordering is informational rather than
    load-bearing.
    """
    drawers = _drawers_collection_name()
    return drawers, CLOSETS_COLLECTION_NAME
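# Back-compat constant for callers that imported ``RECOVERABLE_COLLECTIONS``.
# It is built once at import time from the default ``COLLECTION_NAME`` and
# therefore does NOT reflect a configured drawers collection name. New code
# should call ``_recoverable_collections()`` so config changes are picked
# up at call time.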
RECOVERABLE_COLLECTIONS = (COLLECTION_NAME, CLOSETS_COLLECTION_NAME)
def _get_palace_path():
@@ -487,12 +522,11 @@ def _rebuild_one_collection(
caller can stop the loop and print recovery instructions instead of
silently shipping a partial palace.
"""
col = backend.create_collection(dest_palace, collection_name)
ids: list[str] = []
docs: list[str] = []
metas: list[dict] = []
upserted = 0
col = None
def _flush() -> int:
nonlocal upserted
@@ -507,6 +541,14 @@ def _rebuild_one_collection(
return upserted
try:
# ``create_collection`` lives inside the try so a Chroma-side
# "Collection already exists" failure (which can happen when the
# process-wide System cache still holds a pre-archive schema) is
# reported as a structured ``RebuildPartialError`` carrying
# ``archive_path`` — instead of an unstructured exception that
# strands the user without recovery instructions.
col = backend.create_collection(dest_palace, collection_name)
for emb_id, doc, meta in extract_via_sqlite(source_palace, collection_name):
ids.append(emb_id)
docs.append(doc or "")
@@ -664,8 +706,14 @@ def rebuild_from_sqlite(
Returns a ``{collection_name: row_count}`` dict so callers (CLI,
tests) can verify the per-collection rebuild count without parsing
stdout. Returns ``{}`` on validation failures (missing source,
refusing to overwrite). Raises :class:`RebuildPartialError` if a
stdout. A successful rebuild always returns a dict with one key per
recoverable collection (values may be ``0`` when a collection is
legitimately empty in the source). The empty dict ``{}`` is reserved
for validation refusals (missing source DB, refusing to overwrite an
existing dest, in-place mode without ``archive_existing_dest``); CLI
callers should treat ``{}`` as an error and exit non-zero so CI and
scripts can distinguish "invalid inputs" from "successful recovery
that found zero rows." Raises :class:`RebuildPartialError` if a
chromadb upsert fails partway through; the dest palace is left in
place so the user can inspect what landed, and the in-place archive
(when applicable) is reported in the error so the user can re-run
@@ -765,10 +813,19 @@ def rebuild_from_sqlite(
os.makedirs(dest_palace, exist_ok=True)
# Backend lifetime is wrapped in try/finally so the dest palace's
# PersistentClient handle (opened lazily inside ``create_collection``
# / ``get_collection``) is released on every exit path: success,
# ``RebuildPartialError``, or any unexpected exception. Without this,
# a long-running process that calls ``rebuild_from_sqlite`` would
# leak SQLite/HNSW file handles into Chroma's ``SharedSystemClient``
# cache, surfacing later as "Collection already exists" on the next
# in-place rebuild or as a Windows file-lock failure on cleanup
# (cf. #1285's lifecycle hardening for the legacy rebuild path).
backend = ChromaBackend()
counts: dict[str, int] = {}
for cname in RECOVERABLE_COLLECTIONS:
try:
for cname in _recoverable_collections():
print(f"\n [{cname}]")
upserted = _rebuild_one_collection(
backend=backend,
@@ -790,6 +847,8 @@ def rebuild_from_sqlite(
print(f" Original palace archived at: {archive_path}")
print(f"{'=' * 55}\n")
return counts
finally:
backend.close()
def status(palace_path=None) -> dict:
+61
View File
@@ -1097,3 +1097,64 @@ def test_reconfigure_stdio_is_noop_off_windows():
_reconfigure_stdio_utf8_on_windows()
assert stdin.reconfigure_calls == []
# ── cmd_repair: from-sqlite mode exit codes ──────────────────────────
@patch("mempalace.cli.MempalaceConfig")
def test_cmd_repair_from_sqlite_validation_refusal_exits_nonzero(mock_config_cls, tmp_path, capsys):
    """``cmd_repair`` must exit with status 1 when the rebuild is refused.

    ``rebuild_from_sqlite`` is patched to return ``{}``, the sentinel it
    reserves for a validation refusal (missing source DB, in-place without
    --archive-existing, refusing to overwrite an existing dest). The CLI
    has to turn that into a non-zero exit so unattended scripts and CI can
    tell "invalid inputs" apart from "successful recovery that found zero
    rows."

    Catches: a regression where the CLI treats the validation-refusal
    sentinel as success, leaving CI green on a no-op repair that should
    have alerted an operator.
    """
    palace_root = tmp_path / "palace"
    palace_root.mkdir()
    palace_str = str(palace_root)
    mock_config_cls.return_value.palace_path = palace_str

    cli_args = argparse.Namespace(
        palace=palace_str,
        mode="from-sqlite",
        source=None,
        archive_existing=False,
        yes=True,
    )

    # Simulate the library refusing the inputs: empty counts dict.
    refusal_patch = patch("mempalace.repair.rebuild_from_sqlite", return_value={})
    with refusal_patch, pytest.raises(SystemExit) as exit_info:
        cmd_repair(cli_args)

    assert exit_info.value.code == 1
@patch("mempalace.cli.MempalaceConfig")
def test_cmd_repair_from_sqlite_success_does_not_exit(mock_config_cls, tmp_path):
    """``cmd_repair`` must return normally after a successful rebuild.

    A rebuild that finds zero rows in a legitimately empty source palace
    still counts as success: the counts dict is populated (one key per
    collection, each ``0``). Only the empty dict ``{}`` signals a
    validation refusal, so ``sys.exit`` must NOT be called here.

    Catches: a regression where ``if not counts`` is replaced by
    ``if not sum(counts.values())`` or similar, conflating "empty source"
    with "validation refused" and breaking idempotent recovery scripts.
    """
    palace_root = tmp_path / "palace"
    palace_root.mkdir()
    palace_str = str(palace_root)
    mock_config_cls.return_value.palace_path = palace_str

    cli_args = argparse.Namespace(
        palace=palace_str,
        mode="from-sqlite",
        source=None,
        archive_existing=False,
        yes=True,
    )

    # Per-collection keys present but every count is zero → success.
    zero_row_counts = {"mempalace_drawers": 0, "mempalace_closets": 0}
    success_patch = patch("mempalace.repair.rebuild_from_sqlite", return_value=zero_row_counts)
    with success_patch:
        # A SystemExit escaping from here fails the test.
        cmd_repair(cli_args)
+89
View File
@@ -719,6 +719,13 @@ def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
Catches: anyone who breaks the segments/embeddings/embedding_metadata
JOIN, swaps the metadata vs vector segment, or changes how the
document is stored under the ``chroma:document`` key.
Also asserts every embedding row underlying the extraction lives in
a ``segments.scope = 'METADATA'`` segment. Document + metadata rows
are stored under METADATA in Chroma's segment layout while HNSW
files live under ``VECTOR``; locking that assumption in here means a
future refactor that accidentally points the JOIN at ``VECTOR``
fails this test instead of silently regressing the recovery path.
"""
rows = [
(f"drawer_{i:03d}", f"document body {i}", {"wing": "test_wing", "room": f"r{i % 3}"})
@@ -736,6 +743,35 @@ def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
assert got_doc == doc, f"document mangled for {emb_id}"
assert got_meta == meta, f"metadata mangled for {emb_id}: {got_meta!r}"
# Lock the segment-scope assumption directly against Chroma's on-disk
# layout so a future change that points the extraction JOIN at the
# VECTOR segment cannot pass this test. Query each extracted row's
# backing segment scope via the same SQLite tables ``extract_via_sqlite``
# reads from.
sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3")
conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
try:
scopes = {
scope
for (scope,) in conn.execute(
"""
SELECT DISTINCT s.scope
FROM embeddings e
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = ? AND e.embedding_id IN ({})
""".format(",".join("?" * len(extracted))),
("mempalace_drawers", *(emb_id for emb_id, _, _ in extracted)),
)
}
finally:
conn.close()
assert scopes == {"METADATA"}, (
f"extraction is reading from segments scoped {scopes!r}; only "
"'METADATA' should back the document/metadata rows. If Chroma's "
"segment layout changed, update extract_via_sqlite's WHERE clause."
)
def test_extract_via_sqlite_preserves_typed_metadata(tmp_path):
"""Chromadb stores int / float / bool / string in distinct typed
@@ -973,3 +1009,56 @@ def test_rebuild_from_sqlite_raises_on_upsert_failure(tmp_path, monkeypatch):
assert err.archive_path is not None
assert os.path.isfile(os.path.join(err.archive_path, "chroma.sqlite3"))
assert err.dest_palace == os.path.abspath(str(palace))
def test_rebuild_from_sqlite_honors_configured_drawer_collection_name(tmp_path, monkeypatch):
    """A user with a non-default drawers collection name (set via
    ``MempalaceConfig().collection_name``) must have THAT collection
    rebuilt — not the hardcoded ``mempalace_drawers``.

    Catches: a regression where the recovery path silently rebuilds the
    default-name collection on a custom-named palace, leaving the user's
    actual data unrebuilt while reporting "rebuild complete." This is
    the failure mode reviewer mjc flagged on PR #1310 as needing to line
    up with the configured-collection-name work in #1312. Closets stay
    fixed (``mempalace_closets``) by design — the AAAK index references
    drawer IDs by string and is not per-deployment configurable.

    Strategy: monkeypatch the lazy resolver so the test is hermetic and
    does not depend on the global config file or env state.
    """
    from mempalace.backends.chroma import ChromaBackend

    custom_drawers = "custom_drawers_xyz"
    monkeypatch.setattr(repair, "_drawers_collection_name", lambda: custom_drawers)

    source = tmp_path / "source"
    dest = tmp_path / "dest"

    drawer_rows = [(f"d{i}", f"body {i}", {"wing": "alpha"}) for i in range(3)]
    closet_rows = [("closet_a", "abbrev →d0", {"wing": "alpha"})]
    _seed_palace(source, custom_drawers, drawer_rows)
    _seed_palace(source, "mempalace_closets", closet_rows)

    counts = repair.rebuild_from_sqlite(str(source), str(dest))

    # Rebuilt under the custom name, not under the default "mempalace_drawers".
    assert counts == {custom_drawers: 3, "mempalace_closets": 1}

    backend = ChromaBackend()
    try:
        rebuilt_drawers = backend.get_collection(str(dest), custom_drawers)
        assert rebuilt_drawers.count() == 3

        # Default-name collection must NOT exist in dest — proves we did not
        # silently fall back to the hardcoded name during rebuild.
        #
        # Only the lookup is guarded. The assertion lives in ``else`` so an
        # AssertionError (a subclass of Exception) cannot be swallowed by
        # the ``except`` clause, which would make this check vacuous.
        try:
            rebuilt_default = backend.get_collection(str(dest), "mempalace_drawers")
        except Exception:
            pass  # Expected: collection wasn't created.
        else:
            # If get_collection returns without raising, count() should be 0
            # (chromadb may auto-create on get with some EFs); a non-zero
            # count would mean we wrote rows to the wrong collection.
            assert rebuilt_default.count() == 0, (
                "rebuild leaked rows into the default-name collection on a "
                "custom-name palace — recovery wrote to the wrong collection."
            )
    finally:
        # Release this test's own client handles on dest, mirroring the
        # try/finally + backend.close() lifecycle used by rebuild_from_sqlite.
        backend.close()