Merge pull request #502 from milla-jovovich/fix/chromadb-version-migration

feat: mempalace migrate — recover palaces from different ChromaDB versions
This commit is contained in:
Ben Sigman
2026-04-10 08:26:45 -07:00
committed by GitHub
9 changed files with 408 additions and 4 deletions
+13
View File
@@ -0,0 +1,13 @@
# Default owners for everything
* @milla-jovovich @bensig @igorls
# Core library
mempalace/ @milla-jovovich @bensig
# CI and workflows
.github/ @bensig
# Plugins and integrations
.claude-plugin/ @bensig
.codex-plugin/ @bensig
integrations/ @bensig
+12
View File
@@ -0,0 +1,12 @@
version: 2
updates:
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "weekly"
open-pull-requests-limit: 5
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
open-pull-requests-limit: 3
+3 -3
View File
@@ -18,7 +18,7 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- run: pip install -e ".[dev]"
- run: python -m pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing --cov-fail-under=80
- run: python -m pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing --cov-fail-under=80 --durations=10
test-windows:
runs-on: windows-latest
@@ -28,7 +28,7 @@ jobs:
with:
python-version: "3.9"
- run: pip install -e ".[dev]"
- run: python -m pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing --cov-fail-under=80
- run: python -m pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing --cov-fail-under=80 --durations=10
test-macos:
runs-on: macos-latest
@@ -38,7 +38,7 @@ jobs:
with:
python-version: "3.9"
- run: pip install -e ".[dev]"
- run: python -m pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing --cov-fail-under=80
- run: python -m pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing --cov-fail-under=80 --durations=10
lint:
runs-on: ubuntu-latest
steps:
+27
View File
@@ -6,3 +6,30 @@ __pycache__/
.pytest_cache/
mempal.yaml
.a5c/
# Environment
.env
.env.*
# OS
.DS_Store
Thumbs.db
# IDEs
.idea/
.vscode/
*.swp
*.swo
*~
# Coverage
htmlcov/
.coverage
coverage.xml
# Virtual environments
.venv/
venv/
# ChromaDB local data
*.sqlite3-journal
+78
View File
@@ -0,0 +1,78 @@
# AGENTS.md
> How to build, test, and contribute to MemPalace.
## Setup
```bash
pip install -e ".[dev]"
```
## Commands
```bash
# Run tests
python -m pytest tests/ -v --ignore=tests/benchmarks
# Run tests with coverage
python -m pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing
# Lint
ruff check .
# Format
ruff format .
# Format check (CI mode)
ruff format --check .
```
## Project structure
```
mempalace/
├── mcp_server.py # MCP server — all read/write tools
├── miner.py # Project file miner
├── convo_miner.py # Conversation transcript miner
├── searcher.py # Semantic search
├── knowledge_graph.py # Temporal entity-relationship graph (SQLite)
├── palace.py # Shared palace operations (ChromaDB access)
├── config.py # Configuration + input validation
├── normalize.py # Transcript format detection + normalization
├── cli.py # CLI dispatcher
├── dialect.py # AAAK compression dialect
├── palace_graph.py # Room traversal + cross-wing tunnels
├── hooks_cli.py # Hook system for auto-save
└── version.py # Single source of truth for version
```
## Conventions
- **Python style**: snake_case for functions/variables, PascalCase for classes
- **Linter**: ruff with E/F/W rules
- **Formatter**: ruff format, double quotes
- **Commits**: conventional commits (`fix:`, `feat:`, `test:`, `docs:`, `ci:`)
- **Tests**: `tests/test_*.py`, fixtures in `tests/conftest.py`
- **Coverage**: 85% threshold (80% on Windows due to ChromaDB file lock cleanup)
## Architecture
```
User → CLI / MCP Server → ChromaDB (vector store) + SQLite (knowledge graph)
Palace structure:
WING (person/project)
└── ROOM (topic)
└── DRAWER (verbatim text chunk)
Knowledge Graph:
ENTITY → PREDICATE → ENTITY (with valid_from / valid_to dates)
```
## Key files for common tasks
- **Adding an MCP tool**: `mempalace/mcp_server.py` — add handler function + TOOLS dict entry
- **Changing search**: `mempalace/searcher.py`
- **Modifying mining**: `mempalace/miner.py` (project files) or `mempalace/convo_miner.py` (transcripts)
- **Input validation**: `mempalace/config.py` — `sanitize_name()` / `sanitize_content()`
- **Tests**: mirror source structure in `tests/test_<module>.py`
+36
View File
@@ -0,0 +1,36 @@
-- MemPalace Knowledge Graph Schema
-- SQLite database at ~/.mempalace/knowledge_graph.db
--
-- Every statement uses IF NOT EXISTS, so this script can be re-run against an
-- existing database without raising "already exists" errors.

-- entities: one row per graph node, keyed by a text id.
-- `type` defaults to 'unknown'; `properties` defaults to the text '{}'
-- (presumably a JSON object serialized as text — confirm against the writer).
CREATE TABLE IF NOT EXISTS entities (
id TEXT PRIMARY KEY,
name TEXT NOT NULL,
type TEXT DEFAULT 'unknown',
properties TEXT DEFAULT '{}'
);

-- triples: one row per subject / predicate / object statement.
-- valid_from / valid_to are nullable TEXT columns bounding when the statement
-- holds (presumably ISO-8601 date strings — confirm against the writer).
-- NOTE(review): subject and object are not declared as foreign keys, so nothing
-- in the schema ties them to entities(id); presumably they hold entity ids and
-- the application keeps them consistent — confirm.
-- source_closet / source_file look like provenance columns — TODO confirm semantics.
CREATE TABLE IF NOT EXISTS triples (
id TEXT PRIMARY KEY,
subject TEXT NOT NULL,
predicate TEXT NOT NULL,
object TEXT NOT NULL,
valid_from TEXT,
valid_to TEXT,
confidence REAL DEFAULT 1.0,
source_closet TEXT,
source_file TEXT
);

-- attributes: time-bounded key/value pairs attached to an entity.
-- NOTE(review): valid_from is nullable yet part of the primary key. In an
-- ordinary rowid table SQLite allows NULL in such a primary key column and
-- treats NULLs as distinct, so this key does NOT prevent several rows for the
-- same (entity_id, key) when valid_from is NULL.
CREATE TABLE IF NOT EXISTS attributes (
entity_id TEXT NOT NULL,
key TEXT NOT NULL,
value TEXT,
valid_from TEXT,
valid_to TEXT,
PRIMARY KEY (entity_id, key, valid_from)
);

-- Indexes
-- One index per triple component, plus a composite index on the validity
-- columns (presumably for time-range lookups — confirm against the queries).
CREATE INDEX IF NOT EXISTS idx_triples_subject ON triples(subject);
CREATE INDEX IF NOT EXISTS idx_triples_object ON triples(object);
CREATE INDEX IF NOT EXISTS idx_triples_predicate ON triples(predicate);
CREATE INDEX IF NOT EXISTS idx_triples_valid ON triples(valid_from, valid_to);
+20
View File
@@ -150,6 +150,14 @@ def cmd_split(args):
sys.argv = old_argv
def cmd_migrate(args):
    """Handle ``mempalace migrate``: recover a palace from another ChromaDB version."""
    from .migrate import migrate

    # An explicit --palace wins; otherwise use the configured default location.
    if args.palace:
        target = os.path.expanduser(args.palace)
    else:
        target = MempalaceConfig().palace_path

    migrate(palace_path=target, dry_run=args.dry_run)
def cmd_status(args):
from .miner import status
@@ -531,6 +539,17 @@ def main():
)
# status
# migrate
p_migrate = sub.add_parser(
"migrate",
help="Migrate palace from a different ChromaDB version (fixes 3.0.0 → 3.1.0 upgrade)",
)
p_migrate.add_argument(
"--dry-run",
action="store_true",
help="Show what would be migrated without changing anything",
)
sub.add_parser("status", help="Show what's been filed")
args = parser.parse_args()
@@ -565,6 +584,7 @@ def main():
"compress": cmd_compress,
"wake-up": cmd_wakeup,
"repair": cmd_repair,
"migrate": cmd_migrate,
"status": cmd_status,
}
dispatch[args.command](args)
+214
View File
@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
mempalace migrate — Recover a palace created with a different ChromaDB version.
Reads documents and metadata directly from the palace's SQLite database
(bypassing ChromaDB's API, which fails on version-mismatched palaces),
then re-imports everything into a fresh palace using the currently installed
ChromaDB version.
This fixes the 3.0.0 → 3.1.0 upgrade path where chromadb was downgraded
from 1.5.x to 0.6.x, breaking the on-disk storage format.
Usage:
mempalace migrate # migrate default palace
mempalace migrate --palace /path/to/palace # migrate specific palace
mempalace migrate --dry-run # show what would be migrated
"""
import os
import shutil
import sqlite3
from collections import defaultdict
from datetime import datetime
def _metadata_value(row):
    """Return the first non-NULL typed value column of a metadata row, or None."""
    for column in ("string_value", "int_value", "float_value"):
        if row[column] is not None:
            return row[column]
    if row["bool_value"] is not None:
        # The column comes back from SQLite as an integer; expose a real bool.
        return bool(row["bool_value"])
    return None


def extract_drawers_from_sqlite(db_path: str) -> list:
    """Read all drawers directly from ChromaDB's SQLite, bypassing the API.

    Only the ``embeddings`` and ``embedding_metadata`` tables are read, so this
    does not depend on which ChromaDB version created the database.

    Embeddings whose ``chroma:document`` value is missing or empty are skipped.
    Metadata keys starting with ``chroma:`` are internal and are not returned.

    Args:
        db_path: Path to the palace's ``chroma.sqlite3`` file.

    Returns:
        List of dicts with 'id', 'document', and 'metadata' keys.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried. The
            connection is closed before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        # One row per embedding id, carrying its document text.
        doc_rows = conn.execute("""
            SELECT e.embedding_id,
                   MAX(CASE WHEN em.key = 'chroma:document' THEN em.string_value END) as document
            FROM embeddings e
            JOIN embedding_metadata em ON em.id = e.id
            GROUP BY e.embedding_id
        """).fetchall()

        # All user metadata in a single pass, instead of one query per drawer.
        meta_rows = conn.execute("""
            SELECT e.embedding_id, em.key,
                   em.string_value, em.int_value, em.float_value, em.bool_value
            FROM embedding_metadata em
            JOIN embeddings e ON e.id = em.id
            WHERE em.key NOT LIKE 'chroma:%'
        """).fetchall()
    finally:
        # Always release the file handle, even when the schema is not what we expect.
        conn.close()

    metadata_by_id = {}
    for mr in meta_rows:
        value = _metadata_value(mr)
        if value is not None:
            metadata_by_id.setdefault(mr["embedding_id"], {})[mr["key"]] = value

    drawers = []
    for row in doc_rows:
        document = row["document"]
        if not document:
            continue
        embedding_id = row["embedding_id"]
        drawers.append(
            {
                "id": embedding_id,
                "document": document,
                "metadata": metadata_by_id.get(embedding_id, {}),
            }
        )
    return drawers
def detect_chromadb_version(db_path: str) -> str:
    """Guess the ChromaDB version that wrote a database from its schema.

    Returns "1.x" when the ``collections`` table has a ``schema_str`` column,
    "0.6.x" when an ``embeddings_queue`` table exists without that column,
    and "unknown" otherwise.
    """
    connection = sqlite3.connect(db_path)
    try:
        # PRAGMA table_info yields one row per column; index 1 is the column name.
        column_names = {
            info[1] for info in connection.execute("PRAGMA table_info(collections)")
        }
        if "schema_str" in column_names:
            return "1.x"

        table_names = {
            name
            for (name,) in connection.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            )
        }
        return "0.6.x" if "embeddings_queue" in table_names else "unknown"
    finally:
        connection.close()
def _print_drawer_summary(drawers: list) -> None:
    """Print drawer counts grouped by wing, then by room (largest rooms first)."""
    wings = defaultdict(lambda: defaultdict(int))
    for d in drawers:
        w = d["metadata"].get("wing", "?")
        r = d["metadata"].get("room", "?")
        wings[w][r] += 1

    print("\n Summary:")
    for wing, rooms in sorted(wings.items()):
        total = sum(rooms.values())
        print(f" WING: {wing} ({total} drawers)")
        for room, count in sorted(rooms.items(), key=lambda x: -x[1]):
            print(f" ROOM: {room:30} {count:5}")


def migrate(palace_path: str, dry_run: bool = False):
    """Migrate a palace to the currently installed ChromaDB version.

    If the installed chromadb can already open the palace, nothing is changed.
    Otherwise the drawers are read straight from ``chroma.sqlite3``, the palace
    directory is copied to ``<palace_path>.pre-migrate.<timestamp>``, a fresh
    palace is built in a temporary directory, and - only once its drawer count
    matches the number extracted - it replaces the old palace.

    Args:
        palace_path: Palace directory; ``~`` is expanded and trailing
            separators are removed.
        dry_run: When True, print what would be migrated and change nothing.

    Returns:
        False when no database exists at ``palace_path`` or when the rebuilt
        palace fails verification (the original palace is left in place);
        True otherwise.
    """
    # Normalising strips a trailing separator. Without it, "palace/" would put the
    # backup at "palace/.pre-migrate.<ts>", inside the directory that is deleted
    # during the swap, destroying the backup together with the original.
    palace_path = os.path.normpath(os.path.expanduser(palace_path))
    db_path = os.path.join(palace_path, "chroma.sqlite3")

    if not os.path.isfile(db_path):
        print(f"\n No palace database found at {db_path}")
        return False

    # Imported only once there is a database to work on.
    import tempfile

    import chromadb

    print(f"\n{'=' * 60}")
    print(" MemPalace Migrate")
    print(f"{'=' * 60}\n")
    print(f" Palace: {palace_path}")
    print(f" Database: {db_path}")
    print(f" DB size: {os.path.getsize(db_path) / 1024 / 1024:.1f} MB")

    # Detect version
    source_version = detect_chromadb_version(db_path)
    print(f" Source: ChromaDB {source_version}")
    print(f" Target: ChromaDB {chromadb.__version__}")

    # Try reading with current chromadb first
    try:
        client = chromadb.PersistentClient(path=palace_path)
        col = client.get_collection("mempalace_drawers")
        count = col.count()
        print(f"\n Palace is already readable by chromadb {chromadb.__version__}.")
        print(f" {count} drawers found. No migration needed.")
        return True
    except Exception:
        # Deliberately broad: any failure to open or read the palace here is
        # treated as "not readable", and we fall back to raw SQLite.
        print(f"\n Palace is NOT readable by chromadb {chromadb.__version__}.")
        print(" Extracting from SQLite directly...")

    # Extract all drawers via raw SQL
    drawers = extract_drawers_from_sqlite(db_path)
    print(f" Extracted {len(drawers)} drawers from SQLite")

    if not drawers:
        print(" Nothing to migrate.")
        return True

    _print_drawer_summary(drawers)

    if dry_run:
        print("\n DRY RUN — no changes made.")
        print(f" Would migrate {len(drawers)} drawers.")
        return True

    # Backup the old palace
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = f"{palace_path}.pre-migrate.{timestamp}"
    print(f"\n Backing up to {backup_path}...")
    shutil.copytree(palace_path, backup_path)

    # Build fresh palace in a temp directory (avoids chromadb reading old state)
    temp_palace = tempfile.mkdtemp(prefix="mempalace_migrate_")
    print(f" Creating fresh palace in {temp_palace}...")
    try:
        client = chromadb.PersistentClient(path=temp_palace)
        col = client.get_or_create_collection("mempalace_drawers")

        # Re-import in batches
        batch_size = 500
        imported = 0
        for i in range(0, len(drawers), batch_size):
            batch = drawers[i : i + batch_size]
            col.add(
                ids=[d["id"] for d in batch],
                documents=[d["document"] for d in batch],
                metadatas=[d["metadata"] for d in batch],
            )
            imported += len(batch)
            print(f" Imported {imported}/{len(drawers)} drawers...")

        final_count = col.count()
        # Drop our references before the directory is moved (presumably so
        # chromadb can release its file handles first).
        del col
        del client
    except BaseException:
        # Do not leave a half-built palace behind; the original is still intact.
        shutil.rmtree(temp_palace, ignore_errors=True)
        raise

    # Verify before swapping: never replace the original with an incomplete copy.
    if final_count != len(drawers):
        print(f" WARNING: Expected {len(drawers)}, got {final_count}")
        print(" Migration aborted. Original palace left in place.")
        print(f" Backup at: {backup_path}")
        shutil.rmtree(temp_palace, ignore_errors=True)
        print(f"\n{'=' * 60}\n")
        return False

    # Swap: remove old palace, move new one into place
    print(" Swapping old palace for migrated version...")
    shutil.rmtree(palace_path)
    shutil.move(temp_palace, palace_path)

    print("\n Migration complete.")
    print(f" Drawers migrated: {final_count}")
    print(f" Backup at: {backup_path}")
    print(f"\n{'=' * 60}\n")
    return True
+5 -1
View File
@@ -54,11 +54,15 @@ packages = ["mempalace"]
[tool.ruff]
line-length = 100
target-version = "py39"
extend-exclude = ["benchmarks"]
[tool.ruff.lint]
select = ["E", "F", "W"]
select = ["E", "F", "W", "C901"]
ignore = ["E501"]
[tool.ruff.lint.mccabe]
max-complexity = 25
[tool.ruff.format]
quote-style = "double"