Merge branch 'main' into fix/issue-347-codex-hook-message-counting

This commit is contained in:
Ben Sigman
2026-04-10 09:23:37 -07:00
committed by GitHub
48 changed files with 6034 additions and 231 deletions
+51
View File
@@ -14,6 +14,7 @@ Commands:
mempalace mine <dir> Mine project files (default)
mempalace mine <dir> --mode convos Mine conversation exports
mempalace search "query" Find anything, exact words
mempalace mcp Show MCP setup command
mempalace wake-up Show L0 + L1 wake-up context
mempalace wake-up --wing my_app Wake-up for a specific project
mempalace status Show what's been filed
@@ -28,6 +29,7 @@ Examples:
import os
import sys
import shlex
import argparse
from pathlib import Path
@@ -148,6 +150,14 @@ def cmd_split(args):
sys.argv = old_argv
def cmd_migrate(args):
"""Migrate palace from a different ChromaDB version."""
from .migrate import migrate
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
migrate(palace_path=palace_path, dry_run=args.dry_run)
def cmd_status(args):
from .miner import status
@@ -202,6 +212,7 @@ def cmd_repair(args):
print(f" Extracted {len(all_ids)} drawers")
# Backup and rebuild
palace_path = palace_path.rstrip(os.sep)
backup_path = palace_path + ".backup"
if os.path.exists(backup_path):
shutil.rmtree(backup_path)
@@ -240,6 +251,27 @@ def cmd_instructions(args):
run_instructions(name=args.name)
def cmd_mcp(args):
"""Show how to wire MemPalace into MCP-capable hosts."""
base_server_cmd = "python -m mempalace.mcp_server"
if args.palace:
resolved_palace = str(Path(args.palace).expanduser())
server_cmd = f"{base_server_cmd} --palace {shlex.quote(resolved_palace)}"
else:
server_cmd = base_server_cmd
print("MemPalace MCP quick setup:")
print(f" claude mcp add mempalace -- {server_cmd}")
print("\nRun the server directly:")
print(f" {server_cmd}")
if not args.palace:
print("\nOptional custom palace:")
print(f" claude mcp add mempalace -- {base_server_cmd} --palace /path/to/palace")
print(f" {base_server_cmd} --palace /path/to/palace")
def cmd_compress(args):
"""Compress drawers in a wing using AAAK Dialect."""
import chromadb
@@ -500,7 +532,24 @@ def main():
help="Rebuild palace vector index from stored data (fixes segfaults after corruption)",
)
# mcp
sub.add_parser(
"mcp",
help="Show MCP setup command for connecting MemPalace to your AI client",
)
# status
# migrate
p_migrate = sub.add_parser(
"migrate",
help="Migrate palace from a different ChromaDB version (fixes 3.0.0 → 3.1.0 upgrade)",
)
p_migrate.add_argument(
"--dry-run",
action="store_true",
help="Show what would be migrated without changing anything",
)
sub.add_parser("status", help="Show what's been filed")
args = parser.parse_args()
@@ -531,9 +580,11 @@ def main():
"mine": cmd_mine,
"split": cmd_split,
"search": cmd_search,
"mcp": cmd_mcp,
"compress": cmd_compress,
"wake-up": cmd_wakeup,
"repair": cmd_repair,
"migrate": cmd_migrate,
"status": cmd_status,
}
dispatch[args.command](args)
+60
View File
@@ -6,8 +6,58 @@ Priority: env vars > config file (~/.mempalace/config.json) > defaults
import json
import os
import re
from pathlib import Path
# ── Input validation ──────────────────────────────────────────────────────────
# Shared sanitizers for wing/room/entity names. Prevents path traversal,
# excessively long strings, and special characters that could cause issues
# in file paths, SQLite, or ChromaDB metadata.
MAX_NAME_LENGTH = 128
_SAFE_NAME_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_ .'-]{0,126}[a-zA-Z0-9]?$")
def sanitize_name(value: str, field_name: str = "name") -> str:
"""Validate and sanitize a wing/room/entity name.
Raises ValueError if the name is invalid.
"""
if not isinstance(value, str) or not value.strip():
raise ValueError(f"{field_name} must be a non-empty string")
value = value.strip()
if len(value) > MAX_NAME_LENGTH:
raise ValueError(f"{field_name} exceeds maximum length of {MAX_NAME_LENGTH} characters")
# Block path traversal
if ".." in value or "/" in value or "\\" in value:
raise ValueError(f"{field_name} contains invalid path characters")
# Block null bytes
if "\x00" in value:
raise ValueError(f"{field_name} contains null bytes")
# Enforce safe character set
if not _SAFE_NAME_RE.match(value):
raise ValueError(f"{field_name} contains invalid characters")
return value
def sanitize_content(value: str, max_length: int = 100_000) -> str:
"""Validate drawer/diary content length."""
if not isinstance(value, str) or not value.strip():
raise ValueError("content must be a non-empty string")
if len(value) > max_length:
raise ValueError(f"content exceeds maximum length of {max_length} characters")
if "\x00" in value:
raise ValueError("content contains null bytes")
return value
DEFAULT_PALACE_PATH = os.path.expanduser("~/.mempalace/palace")
DEFAULT_COLLECTION_NAME = "mempalace_drawers"
@@ -126,6 +176,11 @@ class MempalaceConfig:
def init(self):
"""Create config directory and write default config.json if it doesn't exist."""
self._config_dir.mkdir(parents=True, exist_ok=True)
# Restrict directory permissions to owner only (Unix)
try:
self._config_dir.chmod(0o700)
except (OSError, NotImplementedError):
pass # Windows doesn't support Unix permissions
if not self._config_file.exists():
default_config = {
"palace_path": DEFAULT_PALACE_PATH,
@@ -135,6 +190,11 @@ class MempalaceConfig:
}
with open(self._config_file, "w") as f:
json.dump(default_config, f, indent=2)
# Restrict config file to owner read/write only
try:
self._config_file.chmod(0o600)
except (OSError, NotImplementedError):
pass
return self._config_file
def save_people_map(self, people_map):
+11 -35
View File
@@ -15,9 +15,8 @@ from pathlib import Path
from datetime import datetime
from collections import defaultdict
import chromadb
from .normalize import normalize
from .palace import SKIP_DIRS, get_collection, file_already_mined
# File types that might contain conversations
@@ -28,22 +27,8 @@ CONVO_EXTENSIONS = {
".jsonl",
}
SKIP_DIRS = {
".git",
"node_modules",
"__pycache__",
".venv",
"venv",
"env",
"dist",
"build",
".next",
".mempalace",
"tool-results",
"memory",
}
MIN_CHUNK_SIZE = 30
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
# =============================================================================
@@ -211,23 +196,6 @@ def detect_convo_room(content: str) -> str:
# =============================================================================
def get_collection(palace_path: str):
os.makedirs(palace_path, exist_ok=True)
client = chromadb.PersistentClient(path=palace_path)
try:
return client.get_collection("mempalace_drawers")
except Exception:
return client.create_collection("mempalace_drawers")
def file_already_mined(collection, source_file: str) -> bool:
try:
results = collection.get(where={"source_file": source_file}, limit=1)
return len(results.get("ids", [])) > 0
except Exception:
return False
# =============================================================================
# SCAN FOR CONVERSATION FILES
# =============================================================================
@@ -244,6 +212,14 @@ def scan_convos(convo_dir: str) -> list:
continue
filepath = Path(root) / filename
if filepath.suffix.lower() in CONVO_EXTENSIONS:
# Skip symlinks and oversized files
if filepath.is_symlink():
continue
try:
if filepath.stat().st_size > MAX_FILE_SIZE:
continue
except OSError:
continue
files.append(filepath)
return files
@@ -356,7 +332,7 @@ def mine_convos(
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
if extract_mode == "general":
room_counts[chunk_room] += 1
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.md5((source_file + str(chunk['chunk_index'])).encode(), usedforsecurity=False).hexdigest()[:16]}"
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
try:
collection.add(
documents=[chunk["content"]],
+1 -1
View File
@@ -309,7 +309,7 @@ class EntityRegistry:
def save(self):
self._path.parent.mkdir(parents=True, exist_ok=True)
self._path.write_text(json.dumps(self._data, indent=2))
self._path.write_text(json.dumps(self._data, indent=2), encoding="utf-8")
@staticmethod
def _empty() -> dict:
+1 -1
View File
@@ -158,7 +158,7 @@ def hook_stop(data: dict, harness: str):
if since_last >= SAVE_INTERVAL and exchange_count > 0:
# Update last save point
try:
last_save_file.write_text(str(exchange_count))
last_save_file.write_text(str(exchange_count), encoding="utf-8")
except OSError:
pass
+1
View File
@@ -60,6 +60,7 @@ AI memory system. Store everything, find anything. Local, free, no API key.
mempalace compress Compress palace storage
mempalace status Show palace status
mempalace repair Rebuild vector index
mempalace mcp Show MCP setup command
mempalace hook run Run hook logic (for harness integration)
mempalace instructions <name> Output skill instructions
+83 -77
View File
@@ -50,11 +50,14 @@ class KnowledgeGraph:
def __init__(self, db_path: str = None):
self.db_path = db_path or DEFAULT_KG_PATH
Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
self._connection = None
self._init_db()
def _init_db(self):
conn = self._conn()
conn.executescript("""
PRAGMA journal_mode=WAL;
CREATE TABLE IF NOT EXISTS entities (
id TEXT PRIMARY KEY,
name TEXT NOT NULL,
@@ -84,12 +87,19 @@ class KnowledgeGraph:
CREATE INDEX IF NOT EXISTS idx_triples_valid ON triples(valid_from, valid_to);
""")
conn.commit()
conn.close()
def _conn(self):
conn = sqlite3.connect(self.db_path, timeout=10)
conn.execute("PRAGMA journal_mode=WAL")
return conn
if self._connection is None:
self._connection = sqlite3.connect(self.db_path, timeout=10, check_same_thread=False)
self._connection.execute("PRAGMA journal_mode=WAL")
self._connection.row_factory = sqlite3.Row
return self._connection
def close(self):
"""Close the database connection."""
if self._connection is not None:
self._connection.close()
self._connection = None
def _entity_id(self, name: str) -> str:
return name.lower().replace(" ", "_").replace("'", "")
@@ -101,12 +111,11 @@ class KnowledgeGraph:
eid = self._entity_id(name)
props = json.dumps(properties or {})
conn = self._conn()
conn.execute(
"INSERT OR REPLACE INTO entities (id, name, type, properties) VALUES (?, ?, ?, ?)",
(eid, name, entity_type, props),
)
conn.commit()
conn.close()
with conn:
conn.execute(
"INSERT OR REPLACE INTO entities (id, name, type, properties) VALUES (?, ?, ?, ?)",
(eid, name, entity_type, props),
)
return eid
def add_triple(
@@ -134,38 +143,38 @@ class KnowledgeGraph:
# Auto-create entities if they don't exist
conn = self._conn()
conn.execute("INSERT OR IGNORE INTO entities (id, name) VALUES (?, ?)", (sub_id, subject))
conn.execute("INSERT OR IGNORE INTO entities (id, name) VALUES (?, ?)", (obj_id, obj))
with conn:
conn.execute(
"INSERT OR IGNORE INTO entities (id, name) VALUES (?, ?)", (sub_id, subject)
)
conn.execute("INSERT OR IGNORE INTO entities (id, name) VALUES (?, ?)", (obj_id, obj))
# Check for existing identical triple
existing = conn.execute(
"SELECT id FROM triples WHERE subject=? AND predicate=? AND object=? AND valid_to IS NULL",
(sub_id, pred, obj_id),
).fetchone()
# Check for existing identical triple
existing = conn.execute(
"SELECT id FROM triples WHERE subject=? AND predicate=? AND object=? AND valid_to IS NULL",
(sub_id, pred, obj_id),
).fetchone()
if existing:
conn.close()
return existing[0] # Already exists and still valid
if existing:
return existing["id"] # Already exists and still valid
triple_id = f"t_{sub_id}_{pred}_{obj_id}_{hashlib.md5(f'{valid_from}{datetime.now().isoformat()}'.encode()).hexdigest()[:8]}"
triple_id = f"t_{sub_id}_{pred}_{obj_id}_{hashlib.sha256(f'{valid_from}{datetime.now().isoformat()}'.encode()).hexdigest()[:12]}"
conn.execute(
"""INSERT INTO triples (id, subject, predicate, object, valid_from, valid_to, confidence, source_closet, source_file)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(
triple_id,
sub_id,
pred,
obj_id,
valid_from,
valid_to,
confidence,
source_closet,
source_file,
),
)
conn.commit()
conn.close()
conn.execute(
"""INSERT INTO triples (id, subject, predicate, object, valid_from, valid_to, confidence, source_closet, source_file)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(
triple_id,
sub_id,
pred,
obj_id,
valid_from,
valid_to,
confidence,
source_closet,
source_file,
),
)
return triple_id
def invalidate(self, subject: str, predicate: str, obj: str, ended: str = None):
@@ -176,12 +185,11 @@ class KnowledgeGraph:
ended = ended or date.today().isoformat()
conn = self._conn()
conn.execute(
"UPDATE triples SET valid_to=? WHERE subject=? AND predicate=? AND object=? AND valid_to IS NULL",
(ended, sub_id, pred, obj_id),
)
conn.commit()
conn.close()
with conn:
conn.execute(
"UPDATE triples SET valid_to=? WHERE subject=? AND predicate=? AND object=? AND valid_to IS NULL",
(ended, sub_id, pred, obj_id),
)
# ── Query operations ──────────────────────────────────────────────────
@@ -208,13 +216,13 @@ class KnowledgeGraph:
{
"direction": "outgoing",
"subject": name,
"predicate": row[2],
"object": row[10], # obj_name
"valid_from": row[4],
"valid_to": row[5],
"confidence": row[6],
"source_closet": row[7],
"current": row[5] is None,
"predicate": row["predicate"],
"object": row["obj_name"],
"valid_from": row["valid_from"],
"valid_to": row["valid_to"],
"confidence": row["confidence"],
"source_closet": row["source_closet"],
"current": row["valid_to"] is None,
}
)
@@ -228,18 +236,17 @@ class KnowledgeGraph:
results.append(
{
"direction": "incoming",
"subject": row[10], # sub_name
"predicate": row[2],
"subject": row["sub_name"],
"predicate": row["predicate"],
"object": name,
"valid_from": row[4],
"valid_to": row[5],
"confidence": row[6],
"source_closet": row[7],
"current": row[5] is None,
"valid_from": row["valid_from"],
"valid_to": row["valid_to"],
"confidence": row["confidence"],
"source_closet": row["source_closet"],
"current": row["valid_to"] is None,
}
)
conn.close()
return results
def query_relationship(self, predicate: str, as_of: str = None):
@@ -262,15 +269,14 @@ class KnowledgeGraph:
for row in conn.execute(query, params).fetchall():
results.append(
{
"subject": row[10],
"subject": row["sub_name"],
"predicate": pred,
"object": row[11],
"valid_from": row[4],
"valid_to": row[5],
"current": row[5] is None,
"object": row["obj_name"],
"valid_from": row["valid_from"],
"valid_to": row["valid_to"],
"current": row["valid_to"] is None,
}
)
conn.close()
return results
def timeline(self, entity_name: str = None):
@@ -300,15 +306,14 @@ class KnowledgeGraph:
LIMIT 100
""").fetchall()
conn.close()
return [
{
"subject": r[10],
"predicate": r[2],
"object": r[11],
"valid_from": r[4],
"valid_to": r[5],
"current": r[5] is None,
"subject": r["sub_name"],
"predicate": r["predicate"],
"object": r["obj_name"],
"valid_from": r["valid_from"],
"valid_to": r["valid_to"],
"current": r["valid_to"] is None,
}
for r in rows
]
@@ -317,17 +322,18 @@ class KnowledgeGraph:
def stats(self):
conn = self._conn()
entities = conn.execute("SELECT COUNT(*) FROM entities").fetchone()[0]
triples = conn.execute("SELECT COUNT(*) FROM triples").fetchone()[0]
current = conn.execute("SELECT COUNT(*) FROM triples WHERE valid_to IS NULL").fetchone()[0]
entities = conn.execute("SELECT COUNT(*) as cnt FROM entities").fetchone()["cnt"]
triples = conn.execute("SELECT COUNT(*) as cnt FROM triples").fetchone()["cnt"]
current = conn.execute(
"SELECT COUNT(*) as cnt FROM triples WHERE valid_to IS NULL"
).fetchone()["cnt"]
expired = triples - current
predicates = [
r[0]
r["predicate"]
for r in conn.execute(
"SELECT DISTINCT predicate FROM triples ORDER BY predicate"
).fetchall()
]
conn.close()
return {
"entities": entities,
"triples": triples,
+145 -11
View File
@@ -24,8 +24,9 @@ import json
import logging
import hashlib
from datetime import datetime
from pathlib import Path
from .config import MempalaceConfig
from .config import MempalaceConfig, sanitize_name, sanitize_content
from .version import __version__
from .searcher import search_memories
from .palace_graph import traverse, find_tunnels, graph_stats
@@ -44,7 +45,9 @@ def _parse_args():
metavar="PATH",
help="Path to the palace directory (overrides config file and env var)",
)
args, _ = parser.parse_known_args()
args, unknown = parser.parse_known_args()
if unknown:
logger.debug("Ignoring unknown args: %s", unknown)
return args
@@ -64,16 +67,60 @@ _client_cache = None
_collection_cache = None
# ==================== WRITE-AHEAD LOG ====================
# Every write operation is logged to a JSONL file before execution.
# This provides an audit trail for detecting memory poisoning and
# enables review/rollback of writes from external or untrusted sources.
_WAL_DIR = Path(os.path.expanduser("~/.mempalace/wal"))
_WAL_DIR.mkdir(parents=True, exist_ok=True)
try:
_WAL_DIR.chmod(0o700)
except (OSError, NotImplementedError):
pass
_WAL_FILE = _WAL_DIR / "write_log.jsonl"
def _wal_log(operation: str, params: dict, result: dict = None):
"""Append a write operation to the write-ahead log."""
entry = {
"timestamp": datetime.now().isoformat(),
"operation": operation,
"params": params,
"result": result,
}
try:
with open(_WAL_FILE, "a", encoding="utf-8") as f:
f.write(json.dumps(entry, default=str) + "\n")
try:
_WAL_FILE.chmod(0o600)
except (OSError, NotImplementedError):
pass
except Exception as e:
logger.error(f"WAL write failed: {e}")
_client_cache = None
_collection_cache = None
def _get_client():
"""Return a singleton ChromaDB PersistentClient."""
global _client_cache
if _client_cache is None:
_client_cache = chromadb.PersistentClient(path=_config.palace_path)
return _client_cache
def _get_collection(create=False):
"""Return the ChromaDB collection, caching the client between calls."""
global _client_cache, _collection_cache
global _collection_cache
try:
if _client_cache is None:
_client_cache = chromadb.PersistentClient(path=_config.palace_path)
client = _get_client()
if create:
_collection_cache = _client_cache.get_or_create_collection(_config.collection_name)
_collection_cache = client.get_or_create_collection(_config.collection_name)
elif _collection_cache is None:
_collection_cache = _client_cache.get_collection(_config.collection_name)
_collection_cache = client.get_collection(_config.collection_name)
return _collection_cache
except Exception:
return None
@@ -280,11 +327,30 @@ def tool_add_drawer(
wing: str, room: str, content: str, source_file: str = None, added_by: str = "mcp"
):
"""File verbatim content into a wing/room. Checks for duplicates first."""
try:
wing = sanitize_name(wing, "wing")
room = sanitize_name(room, "room")
content = sanitize_content(content)
except ValueError as e:
return {"success": False, "error": str(e)}
col = _get_collection(create=True)
if not col:
return _no_palace()
drawer_id = f"drawer_{wing}_{room}_{hashlib.md5(content.encode()).hexdigest()[:16]}"
drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((wing + room + content[:100]).encode()).hexdigest()[:24]}"
_wal_log(
"add_drawer",
{
"drawer_id": drawer_id,
"wing": wing,
"room": room,
"added_by": added_by,
"content_length": len(content),
"content_preview": content[:200],
},
)
# Idempotency: if the deterministic ID already exists, return success as a no-op.
try:
@@ -323,6 +389,19 @@ def tool_delete_drawer(drawer_id: str):
existing = col.get(ids=[drawer_id])
if not existing["ids"]:
return {"success": False, "error": f"Drawer not found: {drawer_id}"}
# Log the deletion with the content being removed for audit trail
deleted_content = existing.get("documents", [""])[0] if existing.get("documents") else ""
deleted_meta = existing.get("metadatas", [{}])[0] if existing.get("metadatas") else {}
_wal_log(
"delete_drawer",
{
"drawer_id": drawer_id,
"deleted_meta": deleted_meta,
"content_preview": deleted_content[:200],
},
)
try:
col.delete(ids=[drawer_id])
logger.info(f"Deleted drawer: {drawer_id}")
@@ -344,6 +423,23 @@ def tool_kg_add(
subject: str, predicate: str, object: str, valid_from: str = None, source_closet: str = None
):
"""Add a relationship to the knowledge graph."""
try:
subject = sanitize_name(subject, "subject")
predicate = sanitize_name(predicate, "predicate")
object = sanitize_name(object, "object")
except ValueError as e:
return {"success": False, "error": str(e)}
_wal_log(
"kg_add",
{
"subject": subject,
"predicate": predicate,
"object": object,
"valid_from": valid_from,
"source_closet": source_closet,
},
)
triple_id = _kg.add_triple(
subject, predicate, object, valid_from=valid_from, source_closet=source_closet
)
@@ -352,6 +448,10 @@ def tool_kg_add(
def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = None):
"""Mark a fact as no longer true (set end date)."""
_wal_log(
"kg_invalidate",
{"subject": subject, "predicate": predicate, "object": object, "ended": ended},
)
_kg.invalidate(subject, predicate, object, ended=ended)
return {
"success": True,
@@ -382,6 +482,12 @@ def tool_diary_write(agent_name: str, entry: str, topic: str = "general"):
This is the agent's personal journal — observations, thoughts,
what it worked on, what it noticed, what it thinks matters.
"""
try:
agent_name = sanitize_name(agent_name, "agent_name")
entry = sanitize_content(entry)
except ValueError as e:
return {"success": False, "error": str(e)}
wing = f"wing_{agent_name.lower().replace(' ', '_')}"
room = "diary"
col = _get_collection(create=True)
@@ -389,9 +495,23 @@ def tool_diary_write(agent_name: str, entry: str, topic: str = "general"):
return _no_palace()
now = datetime.now()
entry_id = f"diary_{wing}_{now.strftime('%Y%m%d_%H%M%S')}_{hashlib.md5(entry[:50].encode()).hexdigest()[:8]}"
entry_id = f"diary_{wing}_{now.strftime('%Y%m%d_%H%M%S')}_{hashlib.sha256(entry[:50].encode()).hexdigest()[:12]}"
_wal_log(
"diary_write",
{
"agent_name": agent_name,
"topic": topic,
"entry_id": entry_id,
"entry_preview": entry[:200],
},
)
try:
# TODO: Future versions should expand AAAK before embedding to improve
# semantic search quality. For now, store raw AAAK in metadata so it's
# preserved, and keep the document as-is for embedding (even though
# compressed AAAK degrades embedding quality).
col.add(
ids=[entry_id],
documents=[entry],
@@ -717,17 +837,31 @@ TOOLS = {
}
SUPPORTED_PROTOCOL_VERSIONS = [
"2025-11-25",
"2025-06-18",
"2025-03-26",
"2024-11-05",
]
def handle_request(request):
method = request.get("method", "")
params = request.get("params", {})
req_id = request.get("id")
if method == "initialize":
client_version = params.get("protocolVersion", SUPPORTED_PROTOCOL_VERSIONS[-1])
negotiated = (
client_version
if client_version in SUPPORTED_PROTOCOL_VERSIONS
else SUPPORTED_PROTOCOL_VERSIONS[0]
)
return {
"jsonrpc": "2.0",
"id": req_id,
"result": {
"protocolVersion": "2024-11-05",
"protocolVersion": negotiated,
"capabilities": {"tools": {}},
"serverInfo": {"name": "mempalace", "version": __version__},
},
@@ -747,7 +881,7 @@ def handle_request(request):
}
elif method == "tools/call":
tool_name = params.get("name")
tool_args = params.get("arguments", {})
tool_args = params.get("arguments") or {}
if tool_name not in TOOLS:
return {
"jsonrpc": "2.0",
+214
View File
@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
mempalace migrate — Recover a palace created with a different ChromaDB version.
Reads documents and metadata directly from the palace's SQLite database
(bypassing ChromaDB's API, which fails on version-mismatched palaces),
then re-imports everything into a fresh palace using the currently installed
ChromaDB version.
This fixes the 3.0.0 → 3.1.0 upgrade path where chromadb was downgraded
from 1.5.x to 0.6.x, breaking the on-disk storage format.
Usage:
mempalace migrate # migrate default palace
mempalace migrate --palace /path/to/palace # migrate specific palace
mempalace migrate --dry-run # show what would be migrated
"""
import os
import shutil
import sqlite3
from collections import defaultdict
from datetime import datetime
def extract_drawers_from_sqlite(db_path: str) -> list:
"""Read all drawers directly from ChromaDB's SQLite, bypassing the API.
Works regardless of which ChromaDB version created the database.
Returns list of dicts with 'id', 'document', and 'metadata' keys.
"""
conn = sqlite3.connect(db_path)
conn.row_factory = sqlite3.Row
# Get all embedding IDs and their documents
rows = conn.execute("""
SELECT e.embedding_id,
MAX(CASE WHEN em.key = 'chroma:document' THEN em.string_value END) as document
FROM embeddings e
JOIN embedding_metadata em ON em.id = e.id
GROUP BY e.embedding_id
""").fetchall()
drawers = []
for row in rows:
embedding_id = row["embedding_id"]
document = row["document"]
if not document:
continue
# Get metadata for this embedding
meta_rows = conn.execute(
"""
SELECT em.key, em.string_value, em.int_value, em.float_value, em.bool_value
FROM embedding_metadata em
JOIN embeddings e ON e.id = em.id
WHERE e.embedding_id = ?
AND em.key NOT LIKE 'chroma:%'
""",
(embedding_id,),
).fetchall()
metadata = {}
for mr in meta_rows:
key = mr["key"]
if mr["string_value"] is not None:
metadata[key] = mr["string_value"]
elif mr["int_value"] is not None:
metadata[key] = mr["int_value"]
elif mr["float_value"] is not None:
metadata[key] = mr["float_value"]
elif mr["bool_value"] is not None:
metadata[key] = bool(mr["bool_value"])
drawers.append(
{
"id": embedding_id,
"document": document,
"metadata": metadata,
}
)
conn.close()
return drawers
def detect_chromadb_version(db_path: str) -> str:
"""Detect which ChromaDB version created the database by checking schema."""
conn = sqlite3.connect(db_path)
try:
# 1.x has schema_str column in collections table
cols = [r[1] for r in conn.execute("PRAGMA table_info(collections)").fetchall()]
if "schema_str" in cols:
return "1.x"
# 0.6.x has embeddings_queue but no schema_str
tables = [
r[0]
for r in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
]
if "embeddings_queue" in tables:
return "0.6.x"
return "unknown"
finally:
conn.close()
def migrate(palace_path: str, dry_run: bool = False):
"""Migrate a palace to the currently installed ChromaDB version."""
import chromadb
palace_path = os.path.expanduser(palace_path)
db_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path):
print(f"\n No palace database found at {db_path}")
return False
print(f"\n{'=' * 60}")
print(" MemPalace Migrate")
print(f"{'=' * 60}\n")
print(f" Palace: {palace_path}")
print(f" Database: {db_path}")
print(f" DB size: {os.path.getsize(db_path) / 1024 / 1024:.1f} MB")
# Detect version
source_version = detect_chromadb_version(db_path)
print(f" Source: ChromaDB {source_version}")
print(f" Target: ChromaDB {chromadb.__version__}")
# Try reading with current chromadb first
try:
client = chromadb.PersistentClient(path=palace_path)
col = client.get_collection("mempalace_drawers")
count = col.count()
print(f"\n Palace is already readable by chromadb {chromadb.__version__}.")
print(f" {count} drawers found. No migration needed.")
return True
except Exception:
print(f"\n Palace is NOT readable by chromadb {chromadb.__version__}.")
print(" Extracting from SQLite directly...")
# Extract all drawers via raw SQL
drawers = extract_drawers_from_sqlite(db_path)
print(f" Extracted {len(drawers)} drawers from SQLite")
if not drawers:
print(" Nothing to migrate.")
return True
# Show summary
wings = defaultdict(lambda: defaultdict(int))
for d in drawers:
w = d["metadata"].get("wing", "?")
r = d["metadata"].get("room", "?")
wings[w][r] += 1
print("\n Summary:")
for wing, rooms in sorted(wings.items()):
total = sum(rooms.values())
print(f" WING: {wing} ({total} drawers)")
for room, count in sorted(rooms.items(), key=lambda x: -x[1]):
print(f" ROOM: {room:30} {count:5}")
if dry_run:
print("\n DRY RUN — no changes made.")
print(f" Would migrate {len(drawers)} drawers.")
return True
# Backup the old palace
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = f"{palace_path}.pre-migrate.{timestamp}"
print(f"\n Backing up to {backup_path}...")
shutil.copytree(palace_path, backup_path)
# Build fresh palace in a temp directory (avoids chromadb reading old state)
import tempfile
temp_palace = tempfile.mkdtemp(prefix="mempalace_migrate_")
print(f" Creating fresh palace in {temp_palace}...")
client = chromadb.PersistentClient(path=temp_palace)
col = client.get_or_create_collection("mempalace_drawers")
# Re-import in batches
batch_size = 500
imported = 0
for i in range(0, len(drawers), batch_size):
batch = drawers[i : i + batch_size]
col.add(
ids=[d["id"] for d in batch],
documents=[d["document"] for d in batch],
metadatas=[d["metadata"] for d in batch],
)
imported += len(batch)
print(f" Imported {imported}/{len(drawers)} drawers...")
# Verify before swapping
final_count = col.count()
del col
del client
# Swap: remove old palace, move new one into place
print(" Swapping old palace for migrated version...")
shutil.rmtree(palace_path)
shutil.move(temp_palace, palace_path)
print("\n Migration complete.")
print(f" Drawers migrated: {final_count}")
print(f" Backup at: {backup_path}")
if final_count != len(drawers):
print(f" WARNING: Expected {len(drawers)}, got {final_count}")
print(f"\n{'=' * 60}\n")
return True
+14 -58
View File
@@ -17,6 +17,8 @@ from collections import defaultdict
import chromadb
from .palace import SKIP_DIRS, get_collection, file_already_mined
READABLE_EXTENSIONS = {
".txt",
".md",
@@ -40,32 +42,6 @@ READABLE_EXTENSIONS = {
".toml",
}
SKIP_DIRS = {
".git",
"node_modules",
"__pycache__",
".venv",
"venv",
"env",
"dist",
"build",
".next",
"coverage",
".mempalace",
".ruff_cache",
".mypy_cache",
".pytest_cache",
".cache",
".tox",
".nox",
".idea",
".vscode",
".ipynb_checkpoints",
".eggs",
"htmlcov",
"target",
}
SKIP_FILENAMES = {
"mempalace.yaml",
"mempalace.yml",
@@ -78,6 +54,7 @@ SKIP_FILENAMES = {
CHUNK_SIZE = 800 # chars per drawer
CHUNK_OVERLAP = 100 # overlap between chunks
MIN_CHUNK_SIZE = 50 # skip tiny chunks
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
# =============================================================================
@@ -393,41 +370,11 @@ def chunk_text(content: str, source_file: str) -> list:
# =============================================================================
def get_collection(palace_path: str):
os.makedirs(palace_path, exist_ok=True)
client = chromadb.PersistentClient(path=palace_path)
try:
return client.get_collection("mempalace_drawers")
except Exception:
return client.create_collection("mempalace_drawers")
def file_already_mined(collection, source_file: str) -> bool:
"""Fast check: has this file been filed before and is unchanged?
Compares the stored mtime in drawer metadata against the file's current
mtime. Returns False (needs re-mining) when the file has been modified
since it was last mined, or when no mtime was stored.
"""
try:
results = collection.get(where={"source_file": source_file}, limit=1)
if not results.get("ids"):
return False
stored_meta = results["metadatas"][0] if results.get("metadatas") else {}
stored_mtime = stored_meta.get("source_mtime")
if stored_mtime is None:
return False
current_mtime = os.path.getmtime(source_file)
return float(stored_mtime) == current_mtime
except Exception:
return False
def add_drawer(
collection, wing: str, room: str, content: str, source_file: str, chunk_index: int, agent: str
):
"""Add one drawer to the palace."""
drawer_id = f"drawer_{wing}_{room}_{hashlib.md5((source_file + str(chunk_index)).encode(), usedforsecurity=False).hexdigest()[:16]}"
drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(chunk_index)).encode()).hexdigest()[:24]}"
try:
metadata = {
"wing": wing,
@@ -470,7 +417,7 @@ def process_file(
# Skip if already filed
source_file = str(filepath)
if not dry_run and file_already_mined(collection, source_file):
if not dry_run and file_already_mined(collection, source_file, check_mtime=True):
return 0, None
try:
@@ -562,6 +509,15 @@ def scan_project(
if respect_gitignore and active_matchers and not force_include:
if is_gitignored(filepath, active_matchers, is_dir=False):
continue
# Skip symlinks — prevents following links to /dev/urandom, etc.
if filepath.is_symlink():
continue
# Skip files exceeding size limit
try:
if filepath.stat().st_size > MAX_FILE_SIZE:
continue
except OSError:
continue
files.append(filepath)
return files
+6
View File
@@ -25,6 +25,12 @@ def normalize(filepath: str) -> str:
Load a file and normalize to transcript format if it's a chat export.
Plain text files pass through unchanged.
"""
try:
file_size = os.path.getsize(filepath)
except OSError as e:
raise IOError(f"Could not read {filepath}: {e}")
if file_size > 500 * 1024 * 1024: # 500 MB safety limit
raise IOError(f"File too large ({file_size // (1024*1024)} MB): {filepath}")
try:
with open(filepath, "r", encoding="utf-8", errors="replace") as f:
content = f.read()
+2 -2
View File
@@ -312,7 +312,7 @@ def _generate_aaak_bootstrap(
]
)
(mempalace_dir / "aaak_entities.md").write_text("\n".join(registry_lines))
(mempalace_dir / "aaak_entities.md").write_text("\n".join(registry_lines), encoding="utf-8")
# Critical facts bootstrap (pre-palace — before any mining)
facts_lines = [
@@ -359,7 +359,7 @@ def _generate_aaak_bootstrap(
]
)
(mempalace_dir / "critical_facts.md").write_text("\n".join(facts_lines))
(mempalace_dir / "critical_facts.md").write_text("\n".join(facts_lines), encoding="utf-8")
def run_onboarding(
+71
View File
@@ -0,0 +1,71 @@
"""
palace.py — Shared palace operations.
Consolidates ChromaDB access patterns used by both miners and the MCP server.
"""
import os
import chromadb
SKIP_DIRS = {
".git",
"node_modules",
"__pycache__",
".venv",
"venv",
"env",
"dist",
"build",
".next",
"coverage",
".mempalace",
".ruff_cache",
".mypy_cache",
".pytest_cache",
".cache",
".tox",
".nox",
".idea",
".vscode",
".ipynb_checkpoints",
".eggs",
"htmlcov",
"target",
}
def get_collection(palace_path: str, collection_name: str = "mempalace_drawers"):
"""Get or create the palace ChromaDB collection."""
os.makedirs(palace_path, exist_ok=True)
try:
os.chmod(palace_path, 0o700)
except (OSError, NotImplementedError):
pass
client = chromadb.PersistentClient(path=palace_path)
try:
return client.get_collection(collection_name)
except Exception:
return client.create_collection(collection_name)
def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool:
"""Check if a file has already been filed in the palace.
When check_mtime=True (used by project miner), returns False if the file
has been modified since it was last mined, so it gets re-mined.
When check_mtime=False (used by convo miner), just checks existence.
"""
try:
results = collection.get(where={"source_file": source_file}, limit=1)
if not results.get("ids"):
return False
if check_mtime:
stored_meta = results.get("metadatas", [{}])[0]
stored_mtime = stored_meta.get("source_mtime")
if stored_mtime is None:
return False
current_mtime = os.path.getmtime(source_file)
return float(stored_mtime) == current_mtime
return True
except Exception:
return False
+9 -1
View File
@@ -182,6 +182,10 @@ def split_file(filepath, output_dir, dry_run=False):
Returns list of output paths written (or would be written if dry_run).
"""
path = Path(filepath)
max_size = 500 * 1024 * 1024 # 500 MB safety limit
if path.stat().st_size > max_size:
print(f" SKIP: {path.name} exceeds {max_size // (1024*1024)} MB limit")
return []
lines = path.read_text(errors="replace").splitlines(keepends=True)
boundaries = find_session_boundaries(lines)
@@ -219,7 +223,7 @@ def split_file(filepath, output_dir, dry_run=False):
if dry_run:
print(f" [{i + 1}/{len(boundaries) - 1}] {name} ({len(chunk)} lines)")
else:
out_path.write_text("".join(chunk))
out_path.write_text("".join(chunk), encoding="utf-8")
print(f"{name} ({len(chunk)} lines)")
written.append(out_path)
@@ -266,7 +270,11 @@ def main():
files = sorted(src_dir.glob("*.txt"))
mega_files = []
max_scan_size = 500 * 1024 * 1024 # 500 MB
for f in files:
if f.stat().st_size > max_scan_size:
print(f" SKIP: {f.name} exceeds {max_scan_size // (1024*1024)} MB limit")
continue
lines = f.read_text(errors="replace").splitlines(keepends=True)
boundaries = find_session_boundaries(lines)
if len(boundaries) >= args.min_sessions:
+1 -1
View File
@@ -1,3 +1,3 @@
"""Single source of truth for the MemPalace package version."""
__version__ = "3.0.14"
__version__ = "3.1.0"