Merge pull request #1210 from MemPalace/fix/repair-extraction-cap-detection

fix(repair): refuse to overwrite when extraction looks truncated (#1208)
This commit is contained in:
Igor Lins e Silva
2026-04-26 04:49:01 -03:00
committed by GitHub
3 changed files with 297 additions and 6 deletions
+33 -2
View File
@@ -410,6 +410,7 @@ def cmd_repair(args):
import shutil import shutil
from .backends.chroma import ChromaBackend from .backends.chroma import ChromaBackend
from .migrate import confirm_destructive_action, contains_palace_database from .migrate import confirm_destructive_action, contains_palace_database
from .repair import TruncationDetected, check_extraction_safety
palace_path = os.path.abspath( palace_path = os.path.abspath(
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
@@ -466,6 +467,23 @@ def cmd_repair(args):
offset += len(batch["ids"]) offset += len(batch["ids"])
print(f" Extracted {len(all_ids)} drawers") print(f" Extracted {len(all_ids)} drawers")
# ── #1208 guard ──────────────────────────────────────────────────
# Cross-check against the SQLite ground truth before doing anything
# destructive. Catches the user-reported case where chromadb's
# collection-layer get() silently caps at 10,000 rows even on much
# larger palaces (e.g. after manual HNSW quarantine). Override with
# --confirm-truncation-ok only after independently verifying the
# extraction count is real.
try:
check_extraction_safety(
palace_path,
len(all_ids),
confirm_truncation_ok=getattr(args, "confirm_truncation_ok", False),
)
except TruncationDetected as e:
print(e.message)
return
# Backup and rebuild # Backup and rebuild
palace_path = os.path.normpath(palace_path) palace_path = os.path.normpath(palace_path)
backup_path = palace_path + ".backup" backup_path = palace_path + ".backup"
@@ -868,10 +886,23 @@ def main():
instructions_sub.add_parser(instr_name, help=f"Output {instr_name} instructions") instructions_sub.add_parser(instr_name, help=f"Output {instr_name} instructions")
# repair # repair
sub.add_parser( p_repair = sub.add_parser(
"repair", "repair",
help="Rebuild palace vector index from stored data (fixes segfaults after corruption)", help="Rebuild palace vector index from stored data (fixes segfaults after corruption)",
).add_argument("--yes", action="store_true", help="Skip confirmation for destructive changes") )
p_repair.add_argument(
"--yes", action="store_true", help="Skip confirmation for destructive changes"
)
p_repair.add_argument(
"--confirm-truncation-ok",
action="store_true",
help=(
"Override the #1208 safety guard. Required when chromadb's collection-layer "
"extraction returns exactly 10,000 drawers and the SQLite ground-truth check "
"either matches or can't be read. Use only after independently confirming "
"the palace really contains that count."
),
)
# mcp # mcp
sub.add_parser( sub.add_parser(
+144 -4
View File
@@ -201,13 +201,143 @@ def prune_corrupt(palace_path=None, confirm=False):
print(f" Collection size: {before:,}{after:,}") print(f" Collection size: {before:,}{after:,}")
def rebuild_index(palace_path=None): # ChromaDB's ``collection.get()`` enforces an internal default ``limit``
# of 10 000 rows when the caller does not pass one. We pass an explicit
# ``limit=batch_size`` below, but the underlying segment also caps reads
# during stale/quarantined-HNSW recovery flows: extraction silently stops
# at exactly 10 000 even on palaces with many more rows. Refusing to
# overwrite when this exact value comes back is the simplest signal we
# can detect without depending on chromadb internals.
CHROMADB_DEFAULT_GET_LIMIT = 10_000
class TruncationDetected(Exception):
"""Raised by :func:`check_extraction_safety` when extraction looks short.
Carries the human-readable abort message so callers (CLI ``cmd_repair``,
``rebuild_index``) can print and exit consistently without re-deriving
the wording.
"""
def __init__(self, message: str, sqlite_count: "int | None", extracted: int):
super().__init__(message)
self.message = message
self.sqlite_count = sqlite_count
self.extracted = extracted
def check_extraction_safety(
palace_path: str, extracted: int, confirm_truncation_ok: bool = False
) -> None:
"""Cross-check that ``extracted`` matches the SQLite ground truth.
Two signals trip the guard:
1. **Strong** — ``chroma.sqlite3`` reports more drawers than were
extracted. This is the user-reported #1208 case: 67 580 on disk,
10 000 came back through the chromadb collection layer, repair
would have destroyed the difference.
2. **Weak** — extracted count equals exactly ``CHROMADB_DEFAULT_GET_LIMIT``
AND the SQLite check couldn't run (schema drift, locked file).
Hitting the chromadb default ``get()`` cap exactly is suspicious
enough to refuse without explicit acknowledgement.
Raises :class:`TruncationDetected` with a printable message when the
guard fires. Does nothing on safe extractions or when
``confirm_truncation_ok`` is set.
"""
if confirm_truncation_ok:
return
sqlite_count = sqlite_drawer_count(palace_path)
cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT
if sqlite_count is not None and sqlite_count > extracted:
loss = sqlite_count - extracted
pct = 100 * loss / sqlite_count
message = (
f"\n ABORT: chroma.sqlite3 reports {sqlite_count:,} drawers but only {extracted:,}\n"
" came back through the chromadb collection layer. The segment metadata is\n"
" stale (often after manual HNSW quarantine) — proceeding would silently\n"
f" destroy {loss:,} drawers (~{pct:.0f}%).\n"
"\n"
" Recovery options:\n"
" 1. Restore from your most recent palace backup, then re-mine.\n"
" 2. Direct-extract from chroma.sqlite3 (rows are still on disk) and\n"
" rebuild the palace from source files.\n"
" 3. If you have independently confirmed the palace really contains only\n"
f" {extracted:,} drawers, re-run with --confirm-truncation-ok.\n"
)
raise TruncationDetected(message, sqlite_count, extracted)
if cap_signal and sqlite_count is None:
message = (
f"\n ABORT: extracted exactly {CHROMADB_DEFAULT_GET_LIMIT:,} drawers, which matches\n"
" ChromaDB's internal default get() limit. The on-disk SQLite count couldn't\n"
" be cross-checked from this Python context, so we can't tell whether the\n"
f" palace genuinely holds {CHROMADB_DEFAULT_GET_LIMIT:,} rows or whether extraction was\n"
" silently capped. Refusing to overwrite the palace.\n"
"\n"
" If you have independently confirmed (e.g. via direct sqlite3 query) that\n"
f" the palace really contains exactly {CHROMADB_DEFAULT_GET_LIMIT:,} drawers, re-run with\n"
" --confirm-truncation-ok.\n"
)
raise TruncationDetected(message, sqlite_count, extracted)
def sqlite_drawer_count(palace_path: str) -> "int | None":
"""Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection.
Used as an independent ground-truth check against the chromadb
collection-layer ``count()`` / ``get()``: when the on-disk SQLite
row count exceeds the extraction count, the segment metadata is
stale and repair would destroy the difference.
Returns ``None`` when the schema isn't readable (chromadb version
drift, missing tables, locked file). Callers treat ``None`` as
"unknown" and fall back to the cap-detection check.
"""
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.exists(sqlite_path):
return None
try:
import sqlite3
conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
try:
row = conn.execute(
"""
SELECT COUNT(*)
FROM embeddings e
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = ?
""",
(COLLECTION_NAME,),
).fetchone()
return int(row[0]) if row and row[0] is not None else None
finally:
conn.close()
except Exception:
# chromadb schema differs by version (segments / collections column
# names occasionally rename). Silent fallback is correct here —
# the cap-detection check still catches the user-reported case.
return None
def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
"""Rebuild the HNSW index from scratch. """Rebuild the HNSW index from scratch.
1. Extract all drawers via ChromaDB get() 1. Extract all drawers via ChromaDB get()
2. Back up ONLY chroma.sqlite3 (not the bloated HNSW files) 2. Cross-check against the SQLite ground truth (#1208 guard)
3. Delete and recreate the collection with hnsw:space=cosine 3. Back up ONLY chroma.sqlite3 (not the bloated HNSW files)
4. Upsert all drawers back 4. Delete and recreate the collection with hnsw:space=cosine
5. Upsert all drawers back
``confirm_truncation_ok`` overrides the safety guard from step 2.
Set to ``True`` only when you have independently verified that the
palace genuinely contains exactly the extracted number of drawers
(typically only a concern for palaces sized at exactly 10 000 rows).
""" """
palace_path = palace_path or _get_palace_path() palace_path = palace_path or _get_palace_path()
@@ -252,6 +382,16 @@ def rebuild_index(palace_path=None):
offset += len(batch["ids"]) offset += len(batch["ids"])
print(f" Extracted {len(all_ids)} drawers") print(f" Extracted {len(all_ids)} drawers")
# ── #1208 guard ──────────────────────────────────────────────────
# Refuse to ``delete_collection`` + rebuild when extraction looks
# short of the SQLite ground truth (or when extraction == chromadb
# default get() cap and the SQLite check couldn't run).
try:
check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok)
except TruncationDetected as e:
print(e.message)
return
# Back up ONLY the SQLite database, not the bloated HNSW files # Back up ONLY the SQLite database, not the bloated HNSW files
sqlite_path = os.path.join(palace_path, "chroma.sqlite3") sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
backup_path = sqlite_path + ".backup" backup_path = sqlite_path + ".backup"
+120
View File
@@ -254,3 +254,123 @@ def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path):
repair.rebuild_index(palace_path=str(tmp_path)) repair.rebuild_index(palace_path=str(tmp_path))
mock_backend.delete_collection.assert_not_called() mock_backend.delete_collection.assert_not_called()
# ── #1208 truncation safety ───────────────────────────────────────────
def test_check_extraction_safety_passes_when_counts_match(tmp_path):
"""SQLite reports same count as extracted → no exception."""
with patch("mempalace.repair.sqlite_drawer_count", return_value=500):
repair.check_extraction_safety(str(tmp_path), 500)
def test_check_extraction_safety_passes_when_sqlite_unreadable_and_under_cap(tmp_path):
"""SQLite check fails (None) but extraction is well under the cap → safe."""
with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
repair.check_extraction_safety(str(tmp_path), 5_000)
def test_check_extraction_safety_aborts_when_sqlite_higher(tmp_path):
"""SQLite reports more than extracted — the user-reported #1208 case."""
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
try:
repair.check_extraction_safety(str(tmp_path), 10_000)
except repair.TruncationDetected as e:
assert e.sqlite_count == 67_580
assert e.extracted == 10_000
assert "67,580" in e.message
assert "10,000" in e.message
assert "57,580" in e.message # the loss number
else:
raise AssertionError("expected TruncationDetected")
def test_check_extraction_safety_aborts_when_unreadable_and_at_cap(tmp_path):
"""SQLite unreadable but extraction == default get() cap → suspicious."""
with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
try:
repair.check_extraction_safety(str(tmp_path), repair.CHROMADB_DEFAULT_GET_LIMIT)
except repair.TruncationDetected as e:
assert e.sqlite_count is None
assert e.extracted == repair.CHROMADB_DEFAULT_GET_LIMIT
assert "10,000" in e.message
else:
raise AssertionError("expected TruncationDetected")
def test_check_extraction_safety_override_skips_check(tmp_path):
"""``confirm_truncation_ok=True`` short-circuits both signals."""
with patch("mempalace.repair.sqlite_drawer_count", return_value=99_999):
# Would normally abort — override allows through
repair.check_extraction_safety(str(tmp_path), 10_000, confirm_truncation_ok=True)
def test_sqlite_drawer_count_returns_none_on_missing_file(tmp_path):
"""Palace dir exists but no chroma.sqlite3 → None, not crash."""
assert repair.sqlite_drawer_count(str(tmp_path)) is None
def test_sqlite_drawer_count_returns_none_on_unreadable_schema(tmp_path):
"""File exists but isn't a chromadb sqlite → None, not crash."""
sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3")
with open(sqlite_path, "wb") as f:
f.write(b"not a sqlite file at all")
assert repair.sqlite_drawer_count(str(tmp_path)) is None
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_aborts_on_truncation_signal(mock_backend_cls, mock_shutil, tmp_path):
"""rebuild_index honors the safety guard: SQLite says 67k, get() returns
10k → no delete_collection, no upsert, no backup."""
mock_backend = MagicMock()
mock_col = MagicMock()
mock_col.count.return_value = 10_000
# Single page comes back with 10_000 ids
mock_col.get.side_effect = [
{
"ids": [f"id{i}" for i in range(10_000)],
"documents": ["x"] * 10_000,
"metadatas": [{}] * 10_000,
},
{"ids": [], "documents": [], "metadatas": []},
]
mock_backend.get_collection.return_value = mock_col
mock_backend_cls.return_value = mock_backend
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
repair.rebuild_index(palace_path=str(tmp_path))
# Guard fired: nothing destructive happened
mock_backend.delete_collection.assert_not_called()
mock_backend.create_collection.assert_not_called()
mock_shutil.copy2.assert_not_called()
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
def test_rebuild_index_proceeds_with_override(mock_backend_cls, mock_shutil, tmp_path):
"""Override flag lets repair proceed even when the guard would fire."""
mock_backend = MagicMock()
mock_col = MagicMock()
mock_col.count.return_value = 10_000
mock_col.get.side_effect = [
{
"ids": [f"id{i}" for i in range(10_000)],
"documents": ["x"] * 10_000,
"metadatas": [{}] * 10_000,
},
{"ids": [], "documents": [], "metadatas": []},
]
mock_new_col = MagicMock()
mock_backend.get_collection.return_value = mock_col
mock_backend.create_collection.return_value = mock_new_col
mock_backend_cls.return_value = mock_backend
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True)
mock_backend.delete_collection.assert_called_once()
mock_backend.create_collection.assert_called_once()
mock_new_col.upsert.assert_called()