feat: mempalace migrate — recover palaces from different ChromaDB versions

Reads documents and metadata directly from ChromaDB's SQLite (bypassing
the API that fails on version-mismatched databases), then reimports into
a fresh palace using the currently installed ChromaDB.

Fixes the 3.0.0 → 3.1.0 upgrade path where chromadb was downgraded from
1.5.x to 0.6.x, breaking the on-disk storage format.

- Detects chromadb version from SQLite schema (0.6.x vs 1.x)
- Extracts all drawers with full metadata via raw SQL
- Builds fresh palace in temp dir, swaps atomically
- Backs up original palace before any changes
- Supports --dry-run to preview without modifying

Fixes #457
This commit is contained in:
bensig
2026-04-10 00:08:28 -07:00
parent afa30a9cca
commit 60bea83e76
2 changed files with 234 additions and 0 deletions
+20
View File
@@ -150,6 +150,14 @@ def cmd_split(args):
sys.argv = old_argv
def cmd_migrate(args):
"""Migrate palace from a different ChromaDB version."""
from .migrate import migrate
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
migrate(palace_path=palace_path, dry_run=args.dry_run)
def cmd_status(args):
from .miner import status
@@ -531,6 +539,17 @@ def main():
)
# status
# migrate
p_migrate = sub.add_parser(
"migrate",
help="Migrate palace from a different ChromaDB version (fixes 3.0.0 → 3.1.0 upgrade)",
)
p_migrate.add_argument(
"--dry-run",
action="store_true",
help="Show what would be migrated without changing anything",
)
sub.add_parser("status", help="Show what's been filed")
args = parser.parse_args()
@@ -565,6 +584,7 @@ def main():
"compress": cmd_compress,
"wake-up": cmd_wakeup,
"repair": cmd_repair,
"migrate": cmd_migrate,
"status": cmd_status,
}
dispatch[args.command](args)
+214
View File
@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
mempalace migrate — Recover a palace created with a different ChromaDB version.
Reads documents and metadata directly from the palace's SQLite database
(bypassing ChromaDB's API, which fails on version-mismatched palaces),
then re-imports everything into a fresh palace using the currently installed
ChromaDB version.
This fixes the 3.0.0 → 3.1.0 upgrade path where chromadb was downgraded
from 1.5.x to 0.6.x, breaking the on-disk storage format.
Usage:
mempalace migrate # migrate default palace
mempalace migrate --palace /path/to/palace # migrate specific palace
mempalace migrate --dry-run # show what would be migrated
"""
import os
import shutil
import sqlite3
from collections import defaultdict
from datetime import datetime
def extract_drawers_from_sqlite(db_path: str) -> list:
"""Read all drawers directly from ChromaDB's SQLite, bypassing the API.
Works regardless of which ChromaDB version created the database.
Returns list of dicts with 'id', 'document', and 'metadata' keys.
"""
conn = sqlite3.connect(db_path)
conn.row_factory = sqlite3.Row
# Get all embedding IDs and their documents
rows = conn.execute("""
SELECT e.embedding_id,
MAX(CASE WHEN em.key = 'chroma:document' THEN em.string_value END) as document
FROM embeddings e
JOIN embedding_metadata em ON em.id = e.id
GROUP BY e.embedding_id
""").fetchall()
drawers = []
for row in rows:
embedding_id = row["embedding_id"]
document = row["document"]
if not document:
continue
# Get metadata for this embedding
meta_rows = conn.execute(
"""
SELECT em.key, em.string_value, em.int_value, em.float_value, em.bool_value
FROM embedding_metadata em
JOIN embeddings e ON e.id = em.id
WHERE e.embedding_id = ?
AND em.key NOT LIKE 'chroma:%'
""",
(embedding_id,),
).fetchall()
metadata = {}
for mr in meta_rows:
key = mr["key"]
if mr["string_value"] is not None:
metadata[key] = mr["string_value"]
elif mr["int_value"] is not None:
metadata[key] = mr["int_value"]
elif mr["float_value"] is not None:
metadata[key] = mr["float_value"]
elif mr["bool_value"] is not None:
metadata[key] = bool(mr["bool_value"])
drawers.append(
{
"id": embedding_id,
"document": document,
"metadata": metadata,
}
)
conn.close()
return drawers
def detect_chromadb_version(db_path: str) -> str:
"""Detect which ChromaDB version created the database by checking schema."""
conn = sqlite3.connect(db_path)
try:
# 1.x has schema_str column in collections table
cols = [r[1] for r in conn.execute("PRAGMA table_info(collections)").fetchall()]
if "schema_str" in cols:
return "1.x"
# 0.6.x has embeddings_queue but no schema_str
tables = [
r[0]
for r in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
]
if "embeddings_queue" in tables:
return "0.6.x"
return "unknown"
finally:
conn.close()
def migrate(palace_path: str, dry_run: bool = False):
"""Migrate a palace to the currently installed ChromaDB version."""
import chromadb
palace_path = os.path.expanduser(palace_path)
db_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path):
print(f"\n No palace database found at {db_path}")
return False
print(f"\n{'=' * 60}")
print(" MemPalace Migrate")
print(f"{'=' * 60}\n")
print(f" Palace: {palace_path}")
print(f" Database: {db_path}")
print(f" DB size: {os.path.getsize(db_path) / 1024 / 1024:.1f} MB")
# Detect version
source_version = detect_chromadb_version(db_path)
print(f" Source: ChromaDB {source_version}")
print(f" Target: ChromaDB {chromadb.__version__}")
# Try reading with current chromadb first
try:
client = chromadb.PersistentClient(path=palace_path)
col = client.get_collection("mempalace_drawers")
count = col.count()
print(f"\n Palace is already readable by chromadb {chromadb.__version__}.")
print(f" {count} drawers found. No migration needed.")
return True
except Exception:
print(f"\n Palace is NOT readable by chromadb {chromadb.__version__}.")
print(" Extracting from SQLite directly...")
# Extract all drawers via raw SQL
drawers = extract_drawers_from_sqlite(db_path)
print(f" Extracted {len(drawers)} drawers from SQLite")
if not drawers:
print(" Nothing to migrate.")
return True
# Show summary
wings = defaultdict(lambda: defaultdict(int))
for d in drawers:
w = d["metadata"].get("wing", "?")
r = d["metadata"].get("room", "?")
wings[w][r] += 1
print("\n Summary:")
for wing, rooms in sorted(wings.items()):
total = sum(rooms.values())
print(f" WING: {wing} ({total} drawers)")
for room, count in sorted(rooms.items(), key=lambda x: -x[1]):
print(f" ROOM: {room:30} {count:5}")
if dry_run:
print("\n DRY RUN — no changes made.")
print(f" Would migrate {len(drawers)} drawers.")
return True
# Backup the old palace
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = f"{palace_path}.pre-migrate.{timestamp}"
print(f"\n Backing up to {backup_path}...")
shutil.copytree(palace_path, backup_path)
# Build fresh palace in a temp directory (avoids chromadb reading old state)
import tempfile
temp_palace = tempfile.mkdtemp(prefix="mempalace_migrate_")
print(f" Creating fresh palace in {temp_palace}...")
client = chromadb.PersistentClient(path=temp_palace)
col = client.get_or_create_collection("mempalace_drawers")
# Re-import in batches
batch_size = 500
imported = 0
for i in range(0, len(drawers), batch_size):
batch = drawers[i : i + batch_size]
col.add(
ids=[d["id"] for d in batch],
documents=[d["document"] for d in batch],
metadatas=[d["metadata"] for d in batch],
)
imported += len(batch)
print(f" Imported {imported}/{len(drawers)} drawers...")
# Verify before swapping
final_count = col.count()
del col
del client
# Swap: remove old palace, move new one into place
print(" Swapping old palace for migrated version...")
shutil.rmtree(palace_path)
shutil.move(temp_palace, palace_path)
print("\n Migration complete.")
print(f" Drawers migrated: {final_count}")
print(f" Backup at: {backup_path}")
if final_count != len(drawers):
print(f" WARNING: Expected {len(drawers)}, got {final_count}")
print(f"\n{'=' * 60}\n")
return True