Merge pull request #1210 from MemPalace/fix/repair-extraction-cap-detection
fix(repair): refuse to overwrite when extraction looks truncated (#1208)
This commit is contained in:
+33
-2
@@ -410,6 +410,7 @@ def cmd_repair(args):
|
|||||||
import shutil
|
import shutil
|
||||||
from .backends.chroma import ChromaBackend
|
from .backends.chroma import ChromaBackend
|
||||||
from .migrate import confirm_destructive_action, contains_palace_database
|
from .migrate import confirm_destructive_action, contains_palace_database
|
||||||
|
from .repair import TruncationDetected, check_extraction_safety
|
||||||
|
|
||||||
palace_path = os.path.abspath(
|
palace_path = os.path.abspath(
|
||||||
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
|
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
|
||||||
@@ -466,6 +467,23 @@ def cmd_repair(args):
|
|||||||
offset += len(batch["ids"])
|
offset += len(batch["ids"])
|
||||||
print(f" Extracted {len(all_ids)} drawers")
|
print(f" Extracted {len(all_ids)} drawers")
|
||||||
|
|
||||||
|
# ── #1208 guard ──────────────────────────────────────────────────
|
||||||
|
# Cross-check against the SQLite ground truth before doing anything
|
||||||
|
# destructive. Catches the user-reported case where chromadb's
|
||||||
|
# collection-layer get() silently caps at 10,000 rows even on much
|
||||||
|
# larger palaces (e.g. after manual HNSW quarantine). Override with
|
||||||
|
# --confirm-truncation-ok only after independently verifying the
|
||||||
|
# extraction count is real.
|
||||||
|
try:
|
||||||
|
check_extraction_safety(
|
||||||
|
palace_path,
|
||||||
|
len(all_ids),
|
||||||
|
confirm_truncation_ok=getattr(args, "confirm_truncation_ok", False),
|
||||||
|
)
|
||||||
|
except TruncationDetected as e:
|
||||||
|
print(e.message)
|
||||||
|
return
|
||||||
|
|
||||||
# Backup and rebuild
|
# Backup and rebuild
|
||||||
palace_path = os.path.normpath(palace_path)
|
palace_path = os.path.normpath(palace_path)
|
||||||
backup_path = palace_path + ".backup"
|
backup_path = palace_path + ".backup"
|
||||||
@@ -868,10 +886,23 @@ def main():
|
|||||||
instructions_sub.add_parser(instr_name, help=f"Output {instr_name} instructions")
|
instructions_sub.add_parser(instr_name, help=f"Output {instr_name} instructions")
|
||||||
|
|
||||||
# repair
|
# repair
|
||||||
sub.add_parser(
|
p_repair = sub.add_parser(
|
||||||
"repair",
|
"repair",
|
||||||
help="Rebuild palace vector index from stored data (fixes segfaults after corruption)",
|
help="Rebuild palace vector index from stored data (fixes segfaults after corruption)",
|
||||||
).add_argument("--yes", action="store_true", help="Skip confirmation for destructive changes")
|
)
|
||||||
|
p_repair.add_argument(
|
||||||
|
"--yes", action="store_true", help="Skip confirmation for destructive changes"
|
||||||
|
)
|
||||||
|
p_repair.add_argument(
|
||||||
|
"--confirm-truncation-ok",
|
||||||
|
action="store_true",
|
||||||
|
help=(
|
||||||
|
"Override the #1208 safety guard. Required when chromadb's collection-layer "
|
||||||
|
"extraction returns exactly 10,000 drawers and the SQLite ground-truth check "
|
||||||
|
"either matches or can't be read. Use only after independently confirming "
|
||||||
|
"the palace really contains that count."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
# mcp
|
# mcp
|
||||||
sub.add_parser(
|
sub.add_parser(
|
||||||
|
|||||||
+144
-4
@@ -201,13 +201,143 @@ def prune_corrupt(palace_path=None, confirm=False):
|
|||||||
print(f" Collection size: {before:,} → {after:,}")
|
print(f" Collection size: {before:,} → {after:,}")
|
||||||
|
|
||||||
|
|
||||||
def rebuild_index(palace_path=None):
|
# ChromaDB's ``collection.get()`` enforces an internal default ``limit``
|
||||||
|
# of 10 000 rows when the caller does not pass one. We pass an explicit
|
||||||
|
# ``limit=batch_size`` below, but the underlying segment also caps reads
|
||||||
|
# during stale/quarantined-HNSW recovery flows: extraction silently stops
|
||||||
|
# at exactly 10 000 even on palaces with many more rows. Refusing to
|
||||||
|
# overwrite when this exact value comes back is the simplest signal we
|
||||||
|
# can detect without depending on chromadb internals.
|
||||||
|
CHROMADB_DEFAULT_GET_LIMIT = 10_000
|
||||||
|
|
||||||
|
|
||||||
|
class TruncationDetected(Exception):
|
||||||
|
"""Raised by :func:`check_extraction_safety` when extraction looks short.
|
||||||
|
|
||||||
|
Carries the human-readable abort message so callers (CLI ``cmd_repair``,
|
||||||
|
``rebuild_index``) can print and exit consistently without re-deriving
|
||||||
|
the wording.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, message: str, sqlite_count: "int | None", extracted: int):
|
||||||
|
super().__init__(message)
|
||||||
|
self.message = message
|
||||||
|
self.sqlite_count = sqlite_count
|
||||||
|
self.extracted = extracted
|
||||||
|
|
||||||
|
|
||||||
|
def check_extraction_safety(
|
||||||
|
palace_path: str, extracted: int, confirm_truncation_ok: bool = False
|
||||||
|
) -> None:
|
||||||
|
"""Cross-check that ``extracted`` matches the SQLite ground truth.
|
||||||
|
|
||||||
|
Two signals trip the guard:
|
||||||
|
|
||||||
|
1. **Strong** — ``chroma.sqlite3`` reports more drawers than were
|
||||||
|
extracted. This is the user-reported #1208 case: 67 580 on disk,
|
||||||
|
10 000 came back through the chromadb collection layer, repair
|
||||||
|
would have destroyed the difference.
|
||||||
|
2. **Weak** — extracted count equals exactly ``CHROMADB_DEFAULT_GET_LIMIT``
|
||||||
|
AND the SQLite check couldn't run (schema drift, locked file).
|
||||||
|
Hitting the chromadb default ``get()`` cap exactly is suspicious
|
||||||
|
enough to refuse without explicit acknowledgement.
|
||||||
|
|
||||||
|
Raises :class:`TruncationDetected` with a printable message when the
|
||||||
|
guard fires. Does nothing on safe extractions or when
|
||||||
|
``confirm_truncation_ok`` is set.
|
||||||
|
"""
|
||||||
|
if confirm_truncation_ok:
|
||||||
|
return
|
||||||
|
|
||||||
|
sqlite_count = sqlite_drawer_count(palace_path)
|
||||||
|
cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT
|
||||||
|
|
||||||
|
if sqlite_count is not None and sqlite_count > extracted:
|
||||||
|
loss = sqlite_count - extracted
|
||||||
|
pct = 100 * loss / sqlite_count
|
||||||
|
message = (
|
||||||
|
f"\n ABORT: chroma.sqlite3 reports {sqlite_count:,} drawers but only {extracted:,}\n"
|
||||||
|
" came back through the chromadb collection layer. The segment metadata is\n"
|
||||||
|
" stale (often after manual HNSW quarantine) — proceeding would silently\n"
|
||||||
|
f" destroy {loss:,} drawers (~{pct:.0f}%).\n"
|
||||||
|
"\n"
|
||||||
|
" Recovery options:\n"
|
||||||
|
" 1. Restore from your most recent palace backup, then re-mine.\n"
|
||||||
|
" 2. Direct-extract from chroma.sqlite3 (rows are still on disk) and\n"
|
||||||
|
" rebuild the palace from source files.\n"
|
||||||
|
" 3. If you have independently confirmed the palace really contains only\n"
|
||||||
|
f" {extracted:,} drawers, re-run with --confirm-truncation-ok.\n"
|
||||||
|
)
|
||||||
|
raise TruncationDetected(message, sqlite_count, extracted)
|
||||||
|
|
||||||
|
if cap_signal and sqlite_count is None:
|
||||||
|
message = (
|
||||||
|
f"\n ABORT: extracted exactly {CHROMADB_DEFAULT_GET_LIMIT:,} drawers, which matches\n"
|
||||||
|
" ChromaDB's internal default get() limit. The on-disk SQLite count couldn't\n"
|
||||||
|
" be cross-checked from this Python context, so we can't tell whether the\n"
|
||||||
|
f" palace genuinely holds {CHROMADB_DEFAULT_GET_LIMIT:,} rows or whether extraction was\n"
|
||||||
|
" silently capped. Refusing to overwrite the palace.\n"
|
||||||
|
"\n"
|
||||||
|
" If you have independently confirmed (e.g. via direct sqlite3 query) that\n"
|
||||||
|
f" the palace really contains exactly {CHROMADB_DEFAULT_GET_LIMIT:,} drawers, re-run with\n"
|
||||||
|
" --confirm-truncation-ok.\n"
|
||||||
|
)
|
||||||
|
raise TruncationDetected(message, sqlite_count, extracted)
|
||||||
|
|
||||||
|
|
||||||
|
def sqlite_drawer_count(palace_path: str) -> "int | None":
|
||||||
|
"""Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection.
|
||||||
|
|
||||||
|
Used as an independent ground-truth check against the chromadb
|
||||||
|
collection-layer ``count()`` / ``get()``: when the on-disk SQLite
|
||||||
|
row count exceeds the extraction count, the segment metadata is
|
||||||
|
stale and repair would destroy the difference.
|
||||||
|
|
||||||
|
Returns ``None`` when the schema isn't readable (chromadb version
|
||||||
|
drift, missing tables, locked file). Callers treat ``None`` as
|
||||||
|
"unknown" and fall back to the cap-detection check.
|
||||||
|
"""
|
||||||
|
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||||
|
if not os.path.exists(sqlite_path):
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
|
||||||
|
try:
|
||||||
|
row = conn.execute(
|
||||||
|
"""
|
||||||
|
SELECT COUNT(*)
|
||||||
|
FROM embeddings e
|
||||||
|
JOIN segments s ON e.segment_id = s.id
|
||||||
|
JOIN collections c ON s.collection = c.id
|
||||||
|
WHERE c.name = ?
|
||||||
|
""",
|
||||||
|
(COLLECTION_NAME,),
|
||||||
|
).fetchone()
|
||||||
|
return int(row[0]) if row and row[0] is not None else None
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
except Exception:
|
||||||
|
# chromadb schema differs by version (segments / collections column
|
||||||
|
# names occasionally rename). Silent fallback is correct here —
|
||||||
|
# the cap-detection check still catches the user-reported case.
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
||||||
"""Rebuild the HNSW index from scratch.
|
"""Rebuild the HNSW index from scratch.
|
||||||
|
|
||||||
1. Extract all drawers via ChromaDB get()
|
1. Extract all drawers via ChromaDB get()
|
||||||
2. Back up ONLY chroma.sqlite3 (not the bloated HNSW files)
|
2. Cross-check against the SQLite ground truth (#1208 guard)
|
||||||
3. Delete and recreate the collection with hnsw:space=cosine
|
3. Back up ONLY chroma.sqlite3 (not the bloated HNSW files)
|
||||||
4. Upsert all drawers back
|
4. Delete and recreate the collection with hnsw:space=cosine
|
||||||
|
5. Upsert all drawers back
|
||||||
|
|
||||||
|
``confirm_truncation_ok`` overrides the safety guard from step 2.
|
||||||
|
Set to ``True`` only when you have independently verified that the
|
||||||
|
palace genuinely contains exactly the extracted number of drawers
|
||||||
|
(typically only a concern for palaces sized at exactly 10 000 rows).
|
||||||
"""
|
"""
|
||||||
palace_path = palace_path or _get_palace_path()
|
palace_path = palace_path or _get_palace_path()
|
||||||
|
|
||||||
@@ -252,6 +382,16 @@ def rebuild_index(palace_path=None):
|
|||||||
offset += len(batch["ids"])
|
offset += len(batch["ids"])
|
||||||
print(f" Extracted {len(all_ids)} drawers")
|
print(f" Extracted {len(all_ids)} drawers")
|
||||||
|
|
||||||
|
# ── #1208 guard ──────────────────────────────────────────────────
|
||||||
|
# Refuse to ``delete_collection`` + rebuild when extraction looks
|
||||||
|
# short of the SQLite ground truth (or when extraction == chromadb
|
||||||
|
# default get() cap and the SQLite check couldn't run).
|
||||||
|
try:
|
||||||
|
check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok)
|
||||||
|
except TruncationDetected as e:
|
||||||
|
print(e.message)
|
||||||
|
return
|
||||||
|
|
||||||
# Back up ONLY the SQLite database, not the bloated HNSW files
|
# Back up ONLY the SQLite database, not the bloated HNSW files
|
||||||
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||||
backup_path = sqlite_path + ".backup"
|
backup_path = sqlite_path + ".backup"
|
||||||
|
|||||||
@@ -254,3 +254,123 @@ def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path):
|
|||||||
|
|
||||||
repair.rebuild_index(palace_path=str(tmp_path))
|
repair.rebuild_index(palace_path=str(tmp_path))
|
||||||
mock_backend.delete_collection.assert_not_called()
|
mock_backend.delete_collection.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
# ── #1208 truncation safety ───────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_extraction_safety_passes_when_counts_match(tmp_path):
|
||||||
|
"""SQLite reports same count as extracted → no exception."""
|
||||||
|
with patch("mempalace.repair.sqlite_drawer_count", return_value=500):
|
||||||
|
repair.check_extraction_safety(str(tmp_path), 500)
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_extraction_safety_passes_when_sqlite_unreadable_and_under_cap(tmp_path):
|
||||||
|
"""SQLite check fails (None) but extraction is well under the cap → safe."""
|
||||||
|
with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
|
||||||
|
repair.check_extraction_safety(str(tmp_path), 5_000)
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_extraction_safety_aborts_when_sqlite_higher(tmp_path):
|
||||||
|
"""SQLite reports more than extracted — the user-reported #1208 case."""
|
||||||
|
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
|
||||||
|
try:
|
||||||
|
repair.check_extraction_safety(str(tmp_path), 10_000)
|
||||||
|
except repair.TruncationDetected as e:
|
||||||
|
assert e.sqlite_count == 67_580
|
||||||
|
assert e.extracted == 10_000
|
||||||
|
assert "67,580" in e.message
|
||||||
|
assert "10,000" in e.message
|
||||||
|
assert "57,580" in e.message # the loss number
|
||||||
|
else:
|
||||||
|
raise AssertionError("expected TruncationDetected")
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_extraction_safety_aborts_when_unreadable_and_at_cap(tmp_path):
|
||||||
|
"""SQLite unreadable but extraction == default get() cap → suspicious."""
|
||||||
|
with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
|
||||||
|
try:
|
||||||
|
repair.check_extraction_safety(str(tmp_path), repair.CHROMADB_DEFAULT_GET_LIMIT)
|
||||||
|
except repair.TruncationDetected as e:
|
||||||
|
assert e.sqlite_count is None
|
||||||
|
assert e.extracted == repair.CHROMADB_DEFAULT_GET_LIMIT
|
||||||
|
assert "10,000" in e.message
|
||||||
|
else:
|
||||||
|
raise AssertionError("expected TruncationDetected")
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_extraction_safety_override_skips_check(tmp_path):
|
||||||
|
"""``confirm_truncation_ok=True`` short-circuits both signals."""
|
||||||
|
with patch("mempalace.repair.sqlite_drawer_count", return_value=99_999):
|
||||||
|
# Would normally abort — override allows through
|
||||||
|
repair.check_extraction_safety(str(tmp_path), 10_000, confirm_truncation_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
def test_sqlite_drawer_count_returns_none_on_missing_file(tmp_path):
|
||||||
|
"""Palace dir exists but no chroma.sqlite3 → None, not crash."""
|
||||||
|
assert repair.sqlite_drawer_count(str(tmp_path)) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_sqlite_drawer_count_returns_none_on_unreadable_schema(tmp_path):
|
||||||
|
"""File exists but isn't a chromadb sqlite → None, not crash."""
|
||||||
|
sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3")
|
||||||
|
with open(sqlite_path, "wb") as f:
|
||||||
|
f.write(b"not a sqlite file at all")
|
||||||
|
assert repair.sqlite_drawer_count(str(tmp_path)) is None
|
||||||
|
|
||||||
|
|
||||||
|
@patch("mempalace.repair.shutil")
|
||||||
|
@patch("mempalace.repair.ChromaBackend")
|
||||||
|
def test_rebuild_index_aborts_on_truncation_signal(mock_backend_cls, mock_shutil, tmp_path):
|
||||||
|
"""rebuild_index honors the safety guard: SQLite says 67k, get() returns
|
||||||
|
10k → no delete_collection, no upsert, no backup."""
|
||||||
|
mock_backend = MagicMock()
|
||||||
|
mock_col = MagicMock()
|
||||||
|
mock_col.count.return_value = 10_000
|
||||||
|
# Single page comes back with 10_000 ids
|
||||||
|
mock_col.get.side_effect = [
|
||||||
|
{
|
||||||
|
"ids": [f"id{i}" for i in range(10_000)],
|
||||||
|
"documents": ["x"] * 10_000,
|
||||||
|
"metadatas": [{}] * 10_000,
|
||||||
|
},
|
||||||
|
{"ids": [], "documents": [], "metadatas": []},
|
||||||
|
]
|
||||||
|
mock_backend.get_collection.return_value = mock_col
|
||||||
|
mock_backend_cls.return_value = mock_backend
|
||||||
|
|
||||||
|
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
|
||||||
|
repair.rebuild_index(palace_path=str(tmp_path))
|
||||||
|
|
||||||
|
# Guard fired: nothing destructive happened
|
||||||
|
mock_backend.delete_collection.assert_not_called()
|
||||||
|
mock_backend.create_collection.assert_not_called()
|
||||||
|
mock_shutil.copy2.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
@patch("mempalace.repair.shutil")
|
||||||
|
@patch("mempalace.repair.ChromaBackend")
|
||||||
|
def test_rebuild_index_proceeds_with_override(mock_backend_cls, mock_shutil, tmp_path):
|
||||||
|
"""Override flag lets repair proceed even when the guard would fire."""
|
||||||
|
mock_backend = MagicMock()
|
||||||
|
mock_col = MagicMock()
|
||||||
|
mock_col.count.return_value = 10_000
|
||||||
|
mock_col.get.side_effect = [
|
||||||
|
{
|
||||||
|
"ids": [f"id{i}" for i in range(10_000)],
|
||||||
|
"documents": ["x"] * 10_000,
|
||||||
|
"metadatas": [{}] * 10_000,
|
||||||
|
},
|
||||||
|
{"ids": [], "documents": [], "metadatas": []},
|
||||||
|
]
|
||||||
|
mock_new_col = MagicMock()
|
||||||
|
mock_backend.get_collection.return_value = mock_col
|
||||||
|
mock_backend.create_collection.return_value = mock_new_col
|
||||||
|
mock_backend_cls.return_value = mock_backend
|
||||||
|
|
||||||
|
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
|
||||||
|
repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True)
|
||||||
|
|
||||||
|
mock_backend.delete_collection.assert_called_once()
|
||||||
|
mock_backend.create_collection.assert_called_once()
|
||||||
|
mock_new_col.upsert.assert_called()
|
||||||
|
|||||||
Reference in New Issue
Block a user