Merge branch 'main' into fix/query-sanitizer-prompt-contamination

2026-04-09 08:11:39 -07:00
parent 7509a72502 963c04cf45
commit 725fa2b6f1
79 changed files with 9292 additions and 123 deletions
@@ -1,6 +1,21 @@
 """MemPalace — Give your AI a memory. No API key required."""

-from .cli import main
-from .version import __version__
+import logging
+import os
+import platform
+
+from .cli import main  # noqa: E402
+from .version import __version__  # noqa: E402
+
+# ChromaDB 0.6.x ships a Posthog telemetry client whose capture() signature is
+# incompatible with the bundled posthog library, producing noisy stderr warnings
+# on every client operation ("Failed to send telemetry event … capture() takes
+# 1 positional argument but 3 were given").  Silence just that logger.
+logging.getLogger("chromadb.telemetry.product.posthog").setLevel(logging.CRITICAL)
+
+# ONNX Runtime's CoreML provider segfaults during vector queries on Apple Silicon.
+# Force CPU execution unless the user has explicitly set a preference.
+if platform.machine() == "arm64" and platform.system() == "Darwin":
+    os.environ.setdefault("ORT_DISABLE_COREML", "1")

 __all__ = ["main", "__version__"]
@@ -226,6 +226,20 @@ def cmd_repair(args):
    print(f"\n{'=' * 55}\n")


+def cmd_hook(args):
+    """Forward the ``hook`` subcommand to the hook runner.
+
+    The runner reads JSON from stdin and writes JSON to stdout; the hook name
+    and harness type are taken from the parsed CLI arguments.
+    """
+    from .hooks_cli import run_hook as _run
+
+    hook_name, harness = args.hook, args.harness
+    _run(hook_name=hook_name, harness=harness)
+
+
+def cmd_instructions(args):
+    """Print the skill instructions named by ``args.name`` to stdout."""
+    from .instructions_cli import run_instructions as _emit
+
+    requested = args.name
+    _emit(name=requested)
+
+
 def cmd_compress(args):
    """Compress drawers in a wing using AAAK Dialect."""
    import chromadb
@@ -451,6 +465,35 @@ def main():
        help="Only split files containing at least N sessions (default: 2)",
    )

+    # hook
+    p_hook = sub.add_parser(
+        "hook",
+        help="Run hook logic (reads JSON from stdin, outputs JSON to stdout)",
+    )
+    hook_sub = p_hook.add_subparsers(dest="hook_action")
+    p_hook_run = hook_sub.add_parser("run", help="Execute a hook")
+    p_hook_run.add_argument(
+        "--hook",
+        required=True,
+        choices=["session-start", "stop", "precompact"],
+        help="Hook name to run",
+    )
+    p_hook_run.add_argument(
+        "--harness",
+        required=True,
+        choices=["claude-code", "codex"],
+        help="Harness type (determines stdin JSON format)",
+    )
+
+    # instructions
+    p_instructions = sub.add_parser(
+        "instructions",
+        help="Output skill instructions to stdout",
+    )
+    instructions_sub = p_instructions.add_subparsers(dest="instructions_name")
+    for instr_name in ["init", "search", "mine", "help", "status"]:
+        instructions_sub.add_parser(instr_name, help=f"Output {instr_name} instructions")
+
    # repair
    sub.add_parser(
        "repair",
@@ -466,6 +509,23 @@ def main():
        parser.print_help()
        return

+    # Handle two-level subcommands
+    if args.command == "hook":
+        if not getattr(args, "hook_action", None):
+            p_hook.print_help()
+            return
+        cmd_hook(args)
+        return
+
+    if args.command == "instructions":
+        name = getattr(args, "instructions_name", None)
+        if not name:
+            p_instructions.print_help()
+            return
+        args.name = name
+        cmd_instructions(args)
+        return
+
    dispatch = {
        "init": cmd_init,
        "mine": cmd_mine,
@@ -309,7 +309,7 @@ class EntityRegistry:

    def save(self):
        self._path.parent.mkdir(parents=True, exist_ok=True)
-        self._path.write_text(json.dumps(self._data, indent=2))
+        self._path.write_text(json.dumps(self._data, indent=2), encoding="utf-8")

    @staticmethod
    def _empty() -> dict:
@@ -0,0 +1,226 @@
+"""
+Hook logic for MemPalace — Python implementation of session-start, stop, and precompact hooks.
+
+Reads JSON from stdin, outputs JSON to stdout.
+Supported hooks: session-start, stop, precompact
+Supported harnesses: claude-code, codex (extensible to cursor, gemini, etc.)
+"""
+
+import json
+import os
+import re
+import subprocess
+import sys
+from datetime import datetime
+from pathlib import Path
+
+# Number of human messages since the last save point at which the stop hook
+# blocks with an auto-save instruction.
+SAVE_INTERVAL = 15
+# Directory holding hook.log and the per-session "<session_id>_last_save" files.
+STATE_DIR = Path.home() / ".mempalace" / "hook_state"
+
+# Reason text sent with the stop hook's "block" decision.
+STOP_BLOCK_REASON = (
+    "AUTO-SAVE checkpoint. Save key topics, decisions, quotes, and code "
+    "from this session to your memory system. Organize into appropriate "
+    "categories. Use verbatim quotes where possible. Continue conversation "
+    "after saving."
+)
+
+# Reason text sent with the precompact hook's "block" decision.
+PRECOMPACT_BLOCK_REASON = (
+    "COMPACTION IMMINENT. Save ALL topics, decisions, quotes, code, and "
+    "important context from this session to your memory system. Be thorough "
+    "\u2014 after compaction, detailed context will be lost. Organize into "
+    "appropriate categories. Use verbatim quotes where possible. Save "
+    "everything, then allow compaction to proceed."
+)
+
+
+def _sanitize_session_id(session_id: str) -> str:
+    """Reduce a session id to ASCII letters, digits, ``_`` and ``-``.
+
+    The id ends up in a file name, so every other character (path separators,
+    dots, whitespace, ...) is dropped to prevent path traversal. Returns
+    ``"unknown"`` when nothing is left.
+    """
+    cleaned = re.sub(r"[^a-zA-Z0-9_-]", "", session_id)
+    if not cleaned:
+        return "unknown"
+    return cleaned
+
+
+def _count_human_messages(transcript_path: str) -> int:
+    """Count human messages in a JSONL transcript, skipping command-messages.
+
+    Args:
+        transcript_path: Path to the transcript; ``~`` is expanded.
+
+    Returns:
+        The number of lines whose ``message`` is a dict with ``role == "user"``
+        and whose text does not contain ``<command-message>``. Returns 0 when
+        the path is not a file or cannot be read.
+    """
+    path = Path(transcript_path).expanduser()
+    if not path.is_file():
+        return 0
+    count = 0
+    try:
+        with open(path, encoding="utf-8", errors="replace") as f:
+            for line in f:
+                try:
+                    entry = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                # Valid JSON that is not an object (list, string, number) is skipped.
+                if not isinstance(entry, dict):
+                    continue
+                msg = entry.get("message", {})
+                if not isinstance(msg, dict) or msg.get("role") != "user":
+                    continue
+                content = msg.get("content", "")
+                if isinstance(content, str):
+                    text = content
+                elif isinstance(content, list):
+                    # Only string "text" fields are joined: a block carrying
+                    # text=null (or any non-string) must not raise TypeError
+                    # and take the whole hook down.
+                    text = " ".join(
+                        b["text"]
+                        for b in content
+                        if isinstance(b, dict) and isinstance(b.get("text"), str)
+                    )
+                else:
+                    text = ""
+                if "<command-message>" in text:
+                    continue
+                count += 1
+    except OSError:
+        return 0
+    return count
+
+
+def _log(message: str):
+    """Append a timestamped line to the hook state log file.
+
+    Logging is best-effort: the state directory is created on demand and any
+    OSError while creating it or writing the line is ignored.
+    """
+    try:
+        STATE_DIR.mkdir(parents=True, exist_ok=True)
+        log_path = STATE_DIR / "hook.log"
+        timestamp = datetime.now().strftime("%H:%M:%S")
+        # Pin the encoding so the log does not depend on the platform default.
+        with open(log_path, "a", encoding="utf-8") as f:
+            f.write(f"[{timestamp}] {message}\n")
+    except OSError:
+        pass
+
+
+def _output(data: dict):
+    """Write ``data`` to stdout as pretty-printed JSON (2-space indent, non-ASCII kept)."""
+    rendered = json.dumps(data, indent=2, ensure_ascii=False)
+    print(rendered)
+
+
+def _maybe_auto_ingest():
+    """Start ``mempalace mine`` in the background when MEMPAL_DIR is usable.
+
+    Does nothing unless the MEMPAL_DIR environment variable names an existing
+    directory. The child's stdout and stderr are appended to the hook log.
+    An OSError while opening the log or spawning the process is ignored.
+    """
+    target = os.environ.get("MEMPAL_DIR", "")
+    if not target or not os.path.isdir(target):
+        return
+    command = [sys.executable, "-m", "mempalace", "mine", target]
+    try:
+        with open(STATE_DIR / "hook.log", "a") as sink:
+            # Popen returns immediately; the miner keeps running after the hook exits.
+            subprocess.Popen(command, stdout=sink, stderr=sink)
+    except OSError:
+        pass
+
+
+# Harness names accepted by _parse_harness_input.
+SUPPORTED_HARNESSES = {"claude-code", "codex"}
+
+
+def _parse_harness_input(data: dict, harness: str) -> dict:
+    """Normalize the stdin payload for the given harness.
+
+    Exits with status 1 (after a message on stderr) when ``harness`` is not
+    supported. Otherwise returns a dict with a sanitized ``session_id``, the
+    raw ``stop_hook_active`` value and ``transcript_path`` as a string.
+    """
+    if harness not in SUPPORTED_HARNESSES:
+        print(f"Unknown harness: {harness}", file=sys.stderr)
+        sys.exit(1)
+
+    raw_session = data.get("session_id", "unknown")
+    active_flag = data.get("stop_hook_active", False)
+    transcript = data.get("transcript_path", "")
+    return {
+        "session_id": _sanitize_session_id(str(raw_session)),
+        "stop_hook_active": active_flag,
+        "transcript_path": str(transcript),
+    }
+
+
+def hook_stop(data: dict, harness: str):
+    """Stop hook: block every SAVE_INTERVAL human messages for auto-save.
+
+    Args:
+        data: Parsed stdin JSON from the harness.
+        harness: Harness name, validated by ``_parse_harness_input``.
+
+    Prints ``{"decision": "block", "reason": ...}`` when a checkpoint is due,
+    otherwise ``{}``.
+    """
+    parsed = _parse_harness_input(data, harness)
+    session_id = parsed["session_id"]
+    stop_hook_active = parsed["stop_hook_active"]
+    transcript_path = parsed["transcript_path"]
+
+    # If already in a save cycle, let through (infinite-loop prevention).
+    # The flag may arrive as a bool or a string, hence the str() comparison.
+    if str(stop_hook_active).lower() in ("true", "1", "yes"):
+        _output({})
+        return
+
+    # Count human messages
+    exchange_count = _count_human_messages(transcript_path)
+
+    # Track last save point
+    STATE_DIR.mkdir(parents=True, exist_ok=True)
+    last_save_file = STATE_DIR / f"{session_id}_last_save"
+    last_save = 0
+    if last_save_file.is_file():
+        try:
+            # Read with the same encoding the save point is written with below.
+            last_save = int(last_save_file.read_text(encoding="utf-8").strip())
+        except (ValueError, OSError):
+            last_save = 0
+
+    since_last = exchange_count - last_save
+
+    _log(f"Session {session_id}: {exchange_count} exchanges, {since_last} since last save")
+
+    if since_last >= SAVE_INTERVAL and exchange_count > 0:
+        # Update last save point (best-effort: a failed write only means the
+        # next stop re-triggers the checkpoint).
+        try:
+            last_save_file.write_text(str(exchange_count), encoding="utf-8")
+        except OSError:
+            pass
+
+        _log(f"TRIGGERING SAVE at exchange {exchange_count}")
+
+        # Optional: auto-ingest if MEMPAL_DIR is set
+        _maybe_auto_ingest()
+
+        _output({"decision": "block", "reason": STOP_BLOCK_REASON})
+    else:
+        _output({})
+
+
+def hook_session_start(data: dict, harness: str):
+    """Session-start hook: log the session and make sure the state dir exists.
+
+    Never blocks; always prints an empty JSON object.
+    """
+    sid = _parse_harness_input(data, harness)["session_id"]
+    _log(f"SESSION START for session {sid}")
+
+    # Later hooks keep their per-session files here.
+    STATE_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Empty object = pass through.
+    _output({})
+
+
+def hook_precompact(data: dict, harness: str):
+    """Precompact hook: always block with a comprehensive save instruction.
+
+    When the MEMPAL_DIR environment variable names an existing directory,
+    ``mempalace mine`` is first run synchronously (60 s limit) with its output
+    appended to the hook log. A failed or timed-out ingest never prevents the
+    block decision from being printed.
+    """
+    parsed = _parse_harness_input(data, harness)
+    session_id = parsed["session_id"]
+
+    _log(f"PRE-COMPACT triggered for session {session_id}")
+
+    # Optional: auto-ingest synchronously before compaction (so memories land first)
+    mempal_dir = os.environ.get("MEMPAL_DIR", "")
+    if mempal_dir and os.path.isdir(mempal_dir):
+        try:
+            log_path = STATE_DIR / "hook.log"
+            with open(log_path, "a") as log_f:
+                subprocess.run(
+                    [sys.executable, "-m", "mempalace", "mine", mempal_dir],
+                    stdout=log_f,
+                    stderr=log_f,
+                    timeout=60,
+                )
+        except subprocess.TimeoutExpired:
+            # TimeoutExpired is not an OSError; without this handler a slow
+            # mine would abort the hook before any JSON is written to stdout.
+            _log("PRE-COMPACT auto-ingest timed out after 60s, continuing")
+        except OSError:
+            pass
+
+    # Always block -- compaction = save everything
+    _output({"decision": "block", "reason": PRECOMPACT_BLOCK_REASON})
+
+
+def run_hook(hook_name: str, harness: str):
+    """Main entry point: read stdin JSON, dispatch to hook handler.
+
+    Args:
+        hook_name: One of ``session-start``, ``stop`` or ``precompact``.
+        harness: Harness name passed through to the handler.
+
+    Unparseable stdin, or JSON that is not an object, is logged and replaced
+    with an empty dict so the handler still runs. An unknown hook name prints
+    an error on stderr and exits with status 1.
+    """
+    try:
+        data = json.load(sys.stdin)
+    except (ValueError, EOFError):
+        # ValueError covers json.JSONDecodeError and undecodable stdin bytes
+        # (UnicodeDecodeError).
+        _log("WARNING: Failed to parse stdin JSON, proceeding with empty data")
+        data = {}
+
+    # Handlers call data.get(...); a JSON list/string/number would crash them.
+    if not isinstance(data, dict):
+        _log("WARNING: stdin JSON is not an object, proceeding with empty data")
+        data = {}
+
+    hooks = {
+        "session-start": hook_session_start,
+        "stop": hook_stop,
+        "precompact": hook_precompact,
+    }
+
+    handler = hooks.get(hook_name)
+    if handler is None:
+        print(f"Unknown hook: {hook_name}", file=sys.stderr)
+        sys.exit(1)
+
+    handler(data, harness)
@@ -0,0 +1,105 @@
+# MemPalace
+
+AI memory system. Store everything, find anything. Local, free, no API key.
+
+---
+
+## Slash Commands
+
+| Command              | Description                    |
+|----------------------|--------------------------------|
+| /mempalace:init      | Install and set up MemPalace   |
+| /mempalace:search    | Search your memories           |
+| /mempalace:mine      | Mine projects and conversations|
+| /mempalace:status    | Palace overview and stats      |
+| /mempalace:help      | This help message              |
+
+---
+
+## MCP Tools (19)
+
+### Palace (read)
+- mempalace_status -- Palace status and stats
+- mempalace_list_wings -- List all wings
+- mempalace_list_rooms -- List rooms in a wing
+- mempalace_get_taxonomy -- Get the full taxonomy tree
+- mempalace_search -- Search memories by query
+- mempalace_check_duplicate -- Check if a memory already exists
+- mempalace_get_aaak_spec -- Get the AAAK specification
+
+### Palace (write)
+- mempalace_add_drawer -- Add a new memory (drawer)
+- mempalace_delete_drawer -- Delete a memory (drawer)
+
+### Knowledge Graph
+- mempalace_kg_query -- Query the knowledge graph
+- mempalace_kg_add -- Add a knowledge graph entry
+- mempalace_kg_invalidate -- Invalidate a knowledge graph entry
+- mempalace_kg_timeline -- View knowledge graph timeline
+- mempalace_kg_stats -- Knowledge graph statistics
+
+### Navigation
+- mempalace_traverse -- Traverse the palace structure
+- mempalace_find_tunnels -- Find cross-wing connections
+- mempalace_graph_stats -- Graph connectivity statistics
+
+### Agent Diary
+- mempalace_diary_write -- Write a diary entry
+- mempalace_diary_read -- Read diary entries
+
+---
+
+## CLI Commands
+
+    mempalace init <dir>                  Initialize a new palace
+    mempalace mine <dir>                  Mine a project (default mode)
+    mempalace mine <dir> --mode convos    Mine conversation exports
+    mempalace search "query"              Search your memories
+    mempalace split <dir>                 Split large transcript files
+    mempalace wake-up                     Load palace into context
+    mempalace compress                    Compress palace storage
+    mempalace status                      Show palace status
+    mempalace repair                      Rebuild vector index
+    mempalace hook run                    Run hook logic (for harness integration)
+    mempalace instructions <name>         Output skill instructions
+
+---
+
+## Auto-Save Hooks
+
+- Stop hook -- Automatically saves memories every 15 messages. Counts human
+  messages in the session transcript (skipping command-messages). When the
+  threshold is reached, blocks the AI with a save instruction. Uses
+  ~/.mempalace/hook_state/ to track save points per session. If
+  stop_hook_active is true, passes through to prevent infinite loops.
+
+- PreCompact hook -- Emergency save before context compaction. Always blocks
+  with a comprehensive save instruction because compaction means the AI is
+  about to lose detailed context.
+
+Hooks read JSON from stdin and output JSON to stdout. They can be invoked via:
+
+    echo '{"session_id":"abc","stop_hook_active":false,"transcript_path":"..."}' | mempalace hook run --hook stop --harness claude-code
+
+---
+
+## Architecture
+
+    Wings (projects/people)
+      +-- Rooms (topics)
+            +-- Closets (summaries)
+                  +-- Drawers (verbatim memories)
+
+    Halls connect rooms within a wing.
+    Tunnels connect rooms across wings.
+
+The palace is stored locally using ChromaDB for vector search and SQLite for
+metadata. No cloud services or API keys required.
+
+---
+
+## Getting Started
+
+1. /mempalace:init -- Set up your palace
+2. /mempalace:mine -- Mine a project or conversation
+3. /mempalace:search -- Find what you stored
@@ -0,0 +1,69 @@
+# MemPalace Init
+
+Guide the user through a complete MemPalace setup. Follow each step in order,
+stopping to report errors and attempt remediation before proceeding.
+
+## Step 1: Check Python version
+
+Run `python3 --version` (or `python --version` on Windows) and confirm the
+version is 3.9 or higher. If Python is not found or the version is too old,
+tell the user they need Python 3.9+ installed and stop.
+
+## Step 2: Check if mempalace is already installed
+
+Run `pip show mempalace` to see if the package is already present. If it is,
+report the installed version and skip to Step 4.
+
+## Step 3: Install mempalace
+
+Run `pip install mempalace`.
+
+### Error handling -- pip failures
+
+If `pip install mempalace` fails, try these fallbacks in order:
+
+1. Try `pip3 install mempalace`
+2. Try `python -m pip install mempalace` (or `python3 -m pip install mempalace`)
+3. If the error mentions missing build tools or compilation failures (commonly
+   from chromadb or its native dependencies):
+   - On Linux/macOS: suggest `sudo apt-get install build-essential python3-dev`
+     (Debian/Ubuntu) or `xcode-select --install` (macOS)
+   - On Windows: suggest installing Microsoft C++ Build Tools from
+     https://visualstudio.microsoft.com/visual-cpp-build-tools/
+   - Then retry the install command
+4. If all attempts fail, report the error clearly and stop.
+
+## Step 4: Ask for project directory
+
+Ask the user which project directory they want to initialize with MemPalace.
+Offer the current working directory as the default. Wait for their response
+before continuing.
+
+## Step 5: Initialize the palace
+
+Run `mempalace init <dir>` where `<dir>` is the directory from Step 4.
+
+If this fails, report the error and stop.
+
+## Step 6: Configure MCP server
+
+Run the following command to register the MemPalace MCP server with Claude
+(use `python3` in place of `python` if that is the interpreter found in Step 1):
+
+    claude mcp add mempalace -- python -m mempalace.mcp_server
+
+If this fails, report the error but continue to the next step (MCP
+configuration can be done manually later).
+
+## Step 7: Verify installation
+
+Run `mempalace status` and confirm the output shows a healthy palace.
+
+If the command fails or reports errors, walk the user through troubleshooting
+based on the output.
+
+## Step 8: Show next steps
+
+Tell the user setup is complete and suggest these next actions:
+
+- Use /mempalace:mine to start adding data to their palace
+- Use /mempalace:search to query their palace and retrieve stored knowledge
@@ -0,0 +1,64 @@
+# MemPalace Mine
+
+When the user invokes this skill, follow these steps:
+
+## 1. Ask what to mine
+
+Ask the user what they want to mine and where the source data is located.
+Clarify:
+- Is it a project directory (code, docs, notes)?
+- Is it conversation exports (Claude, ChatGPT, Slack)?
+- Do they want auto-classification (decisions, milestones, problems)?
+
+## 2. Choose the mining mode
+
+There are three mining modes:
+
+### Project mining
+
+    mempalace mine <dir>
+
+Mines code files, documentation, and notes from a project directory.
+
+### Conversation mining
+
+    mempalace mine <dir> --mode convos
+
+Mines conversation exports from Claude, ChatGPT, or Slack into the palace.
+
+### General extraction (auto-classify)
+
+    mempalace mine <dir> --mode convos --extract general
+
+Auto-classifies mined content into decisions, milestones, and problems.
+
+## 3. Optionally split mega-files first
+
+If the source directory contains very large files, suggest splitting them
+before mining:
+
+    mempalace split <dir> [--dry-run]
+
+Use --dry-run first to preview what will be split without making changes.
+
+## 4. Optionally tag with a wing
+
+If the user wants to organize mined content under a specific wing, add the
+--wing flag:
+
+    mempalace mine <dir> --wing <name>
+
+## 5. Show progress and results
+
+Run the selected mining command and display progress as it executes. After
+completion, summarize the results including:
+- Number of items mined
+- Categories or classifications applied
+- Any warnings or skipped files
+
+## 6. Suggest next steps
+
+After mining completes, suggest the user try:
+- /mempalace:search -- search the newly mined content
+- /mempalace:status -- check the current state of their palace
+- Mine more data from additional sources
@@ -0,0 +1,57 @@
+# MemPalace Search
+
+When the user wants to search their MemPalace memories, follow these steps:
+
+## 1. Parse the Search Query
+
+Extract the core search intent from the user's message. Identify any explicit
+or implicit filters:
+- Wing -- a top-level category (e.g., "work", "personal", "research")
+- Room -- a sub-category within a wing
+- Keywords / semantic query -- the actual search terms
+
+## 2. Determine Wing/Room Filters
+
+If the user mentions a specific domain, topic area, or context, map it to the
+appropriate wing and/or room. If unsure, omit filters to search globally. You
+can discover the taxonomy first if needed.
+
+## 3. Use MCP Tools (Preferred)
+
+If MCP tools are available, use them in this priority order:
+
+- mempalace_search(query, wing, room) -- Primary search tool. Pass the semantic
+  query and any wing/room filters.
+- mempalace_list_wings -- Discover all available wings. Use when the user asks
+  what categories exist or you need to resolve a wing name.
+- mempalace_list_rooms(wing) -- List rooms within a specific wing. Use to help
+  the user navigate or to resolve a room name.
+- mempalace_get_taxonomy -- Retrieve the full wing/room/drawer tree. Use when
+  the user wants an overview of their entire memory structure.
+- mempalace_traverse(room) -- Walk the knowledge graph starting from a room.
+  Use when the user wants to explore connections and related memories.
+- mempalace_find_tunnels(wing1, wing2) -- Find cross-wing connections (tunnels)
+  between two wings. Use when the user asks about relationships between
+  different knowledge domains.
+
+## 4. CLI Fallback
+
+If MCP tools are not available, fall back to the CLI:
+
+    mempalace search "query" [--wing X] [--room Y]
+
+## 5. Present Results
+
+When presenting search results:
+- Always include source attribution: wing, room, and drawer for each result
+- Show relevance or similarity scores if available
+- Group results by wing/room when returning multiple hits
+- Quote or summarize the memory content clearly
+
+## 6. Offer Next Steps
+
+After presenting results, offer the user options to go deeper:
+- Drill deeper -- search within a specific room or narrow the query
+- Traverse -- explore the knowledge graph from a related room
+- Check tunnels -- look for cross-wing connections if the topic spans domains
+- Browse taxonomy -- show the full structure for manual exploration
@@ -0,0 +1,49 @@
+# MemPalace Status
+
+Display the current state of the user's memory palace.
+
+## Step 1: Gather Palace Status
+
+Check if MCP tools are available (look for mempalace_status in available tools).
+
+- If MCP is available: Call the mempalace_status tool to retrieve palace state.
+- If MCP is not available: Run the CLI command: mempalace status
+
+## Step 2: Display Wing/Room/Drawer Counts
+
+Present the palace structure counts clearly:
+- Number of wings
+- Number of rooms
+- Number of drawers
+- Total memories stored
+
+Keep the output concise -- use a brief summary format, not verbose tables.
+
+## Step 3: Knowledge Graph Stats (MCP only)
+
+If MCP tools are available, also call:
+- mempalace_kg_stats -- for a knowledge graph overview (triple count, entity
+  count, relationship types)
+- mempalace_graph_stats -- for connectivity information (connected components,
+  average connections per entity)
+
+Present these alongside the palace counts in a unified summary.
+
+## Step 4: Suggest Next Actions
+
+Based on the current state, suggest one relevant action:
+
+- Empty palace (zero memories): Suggest "Try /mempalace:mine to add data from
+  a project directory or conversation exports."
+- Has data but no knowledge graph (memories exist but KG stats show zero
+  triples): Suggest "Consider adding knowledge graph triples for richer
+  queries."
+- Healthy palace (has memories and KG data): Suggest "Use /mempalace:search to
+  query your memories."
+
+## Output Style
+
+- Be concise and informative -- aim for a quick glance, not a report.
+- Use short labels and numbers, not prose paragraphs.
+- If any step fails or a tool is unavailable, note it briefly and continue
+  with what is available.
@@ -0,0 +1,28 @@
+"""
+Instruction text output for MemPalace CLI commands.
+
+Each instruction lives as a .md file in the instructions/ directory
+inside the package. The CLI reads and prints the file content.
+"""
+
+import sys
+from pathlib import Path
+
+# Directory, next to this module, holding one "<name>.md" file per instruction.
+INSTRUCTIONS_DIR = Path(__file__).parent / "instructions"
+
+# Instruction names accepted by run_instructions.
+AVAILABLE = ["init", "search", "mine", "help", "status"]
+
+
+def run_instructions(name: str):
+    """Read and print the instruction .md file for the given name.
+
+    Args:
+        name: One of the names in ``AVAILABLE``.
+
+    Exits with status 1 (after a message on stderr) when the name is unknown
+    or the corresponding file is missing.
+    """
+    if name not in AVAILABLE:
+        print(f"Unknown instructions: {name}", file=sys.stderr)
+        print(f"Available: {', '.join(sorted(AVAILABLE))}", file=sys.stderr)
+        sys.exit(1)
+
+    md_path = INSTRUCTIONS_DIR / f"{name}.md"
+    if not md_path.is_file():
+        print(f"Instructions file not found: {md_path}", file=sys.stderr)
+        sys.exit(1)
+
+    # Decode explicitly as UTF-8 rather than with the platform default.
+    print(md_path.read_text(encoding="utf-8"))
@@ -2,7 +2,7 @@
 """
 MemPalace MCP Server — read/write palace access for Claude Code
 ================================================================
-Install: claude mcp add mempalace -- python -m mempalace.mcp_server
+Install: claude mcp add mempalace -- python -m mempalace.mcp_server [--palace /path/to/palace]

 Tools (read):
  mempalace_status          — total drawers, wing/room breakdown
@@ -17,6 +17,8 @@ Tools (write):
  mempalace_delete_drawer   — remove a drawer by ID
 """

+import argparse
+import os
 import sys
 import json
 import logging
@@ -32,21 +34,50 @@ import chromadb

 from .knowledge_graph import KnowledgeGraph

-_kg = KnowledgeGraph()
-
 logging.basicConfig(level=logging.INFO, format="%(message)s", stream=sys.stderr)
 logger = logging.getLogger("mempalace_mcp")

+
+def _parse_args():
+    """Parse the server's command line, tolerating unrecognized arguments.
+
+    Returns the argparse namespace; ``palace`` is the optional palace
+    directory. Arguments the parser does not know are logged at debug level
+    and otherwise ignored.
+    """
+    cli = argparse.ArgumentParser(description="MemPalace MCP Server")
+    cli.add_argument(
+        "--palace",
+        metavar="PATH",
+        help="Path to the palace directory (overrides config file and env var)",
+    )
+    known, leftover = cli.parse_known_args()
+    if leftover:
+        logger.debug("Ignoring unknown args: %s", leftover)
+    return known
+
+
+_args = _parse_args()
+
+if _args.palace:
+    os.environ["MEMPALACE_PALACE_PATH"] = os.path.abspath(_args.palace)
+
 _config = MempalaceConfig()
+if _args.palace:
+    _kg = KnowledgeGraph(db_path=os.path.join(_config.palace_path, "knowledge_graph.sqlite3"))
+else:
+    _kg = KnowledgeGraph()
+
+
+_client_cache = None
+_collection_cache = None


 def _get_collection(create=False):
-    """Return the ChromaDB collection, or None on failure."""
+    """Return the ChromaDB collection, caching the client between calls."""
+    global _client_cache, _collection_cache
    try:
-        client = chromadb.PersistentClient(path=_config.palace_path)
+        if _client_cache is None:
+            _client_cache = chromadb.PersistentClient(path=_config.palace_path)
        if create:
-            return client.get_or_create_collection(_config.collection_name)
-        return client.get_collection(_config.collection_name)
+            _collection_cache = _client_cache.get_or_create_collection(_config.collection_name)
+        elif _collection_cache is None:
+            _collection_cache = _client_cache.get_collection(_config.collection_name)
+        return _collection_cache
    except Exception:
        return None

@@ -270,19 +301,18 @@ def tool_add_drawer(
    if not col:
        return _no_palace()

-    # Duplicate check
-    dup = tool_check_duplicate(content, threshold=0.9)
-    if dup.get("is_duplicate"):
-        return {
-            "success": False,
-            "reason": "duplicate",
-            "matches": dup["matches"],
-        }
+    drawer_id = f"drawer_{wing}_{room}_{hashlib.md5(content.encode()).hexdigest()[:16]}"

-    drawer_id = f"drawer_{wing}_{room}_{hashlib.md5((content[:100] + datetime.now().isoformat()).encode()).hexdigest()[:16]}"
+    # Idempotency: if the deterministic ID already exists, return success as a no-op.
+    try:
+        existing = col.get(ids=[drawer_id])
+        if existing and existing["ids"]:
+            return {"success": True, "reason": "already_exists", "drawer_id": drawer_id}
+    except Exception:
+        pass

    try:
-        col.add(
+        col.upsert(
            ids=[drawer_id],
            documents=[content],
            metadatas=[
@@ -403,10 +403,22 @@ def get_collection(palace_path: str):


 def file_already_mined(collection, source_file: str) -> bool:
-    """Fast check: has this file been filed before?"""
+    """Fast check: has this file been filed before and is unchanged?
+
+    Compares the stored mtime in drawer metadata against the file's current
+    mtime.  Returns False (needs re-mining) when the file has been modified
+    since it was last mined, or when no mtime was stored.
+    """
    try:
        results = collection.get(where={"source_file": source_file}, limit=1)
-        return len(results.get("ids", [])) > 0
+        if not results.get("ids"):
+            return False
+        stored_meta = results["metadatas"][0] if results.get("metadatas") else {}
+        stored_mtime = stored_meta.get("source_mtime")
+        if stored_mtime is None:
+            return False
+        current_mtime = os.path.getmtime(source_file)
+        return float(stored_mtime) == current_mtime
    except Exception:
        return False

@@ -417,24 +429,26 @@ def add_drawer(
    """Add one drawer to the palace."""
    drawer_id = f"drawer_{wing}_{room}_{hashlib.md5((source_file + str(chunk_index)).encode(), usedforsecurity=False).hexdigest()[:16]}"
    try:
-        collection.add(
+        metadata = {
+            "wing": wing,
+            "room": room,
+            "source_file": source_file,
+            "chunk_index": chunk_index,
+            "added_by": agent,
+            "filed_at": datetime.now().isoformat(),
+        }
+        # Store file mtime so we can detect modifications later.
+        try:
+            metadata["source_mtime"] = os.path.getmtime(source_file)
+        except OSError:
+            pass
+        collection.upsert(
            documents=[content],
            ids=[drawer_id],
-            metadatas=[
-                {
-                    "wing": wing,
-                    "room": room,
-                    "source_file": source_file,
-                    "chunk_index": chunk_index,
-                    "added_by": agent,
-                    "filed_at": datetime.now().isoformat(),
-                }
-            ],
+            metadatas=[metadata],
        )
        return True
-    except Exception as e:
-        if "already exists" in str(e).lower() or "duplicate" in str(e).lower():
-            return False
+    except Exception:
        raise


@@ -451,29 +465,29 @@ def process_file(
    rooms: list,
    agent: str,
    dry_run: bool,
-) -> int:
-    """Read, chunk, route, and file one file. Returns drawer count."""
+) -> tuple:
+    """Read, chunk, route, and file one file. Returns (drawer_count, room_name)."""

    # Skip if already filed
    source_file = str(filepath)
    if not dry_run and file_already_mined(collection, source_file):
-        return 0
+        return 0, None

    try:
        content = filepath.read_text(encoding="utf-8", errors="replace")
    except OSError:
-        return 0
+        return 0, None

    content = content.strip()
    if len(content) < MIN_CHUNK_SIZE:
-        return 0
+        return 0, None

    room = detect_room(filepath, content, rooms, project_path)
    chunks = chunk_text(content, source_file)

    if dry_run:
        print(f"    [DRY RUN] {filepath.name} → room:{room} ({len(chunks)} drawers)")
-        return len(chunks)
+        return len(chunks), room

    drawers_added = 0
    for chunk in chunks:
@@ -489,7 +503,7 @@ def process_file(
        if added:
            drawers_added += 1

-    return drawers_added
+    return drawers_added, room


 # =============================================================================
@@ -608,7 +622,7 @@ def mine(
    room_counts = defaultdict(int)

    for i, filepath in enumerate(files, 1):
-        drawers = process_file(
+        drawers, room = process_file(
            filepath=filepath,
            project_path=project_path,
            collection=collection,
@@ -621,7 +635,6 @@ def mine(
            files_skipped += 1
        else:
            total_drawers += drawers
-            room = detect_room(filepath, "", rooms, project_path)
            room_counts[room] += 1
            if not dry_run:
                print(f"  ✓ [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers}")
@@ -312,7 +312,7 @@ def _generate_aaak_bootstrap(
        ]
    )

-    (mempalace_dir / "aaak_entities.md").write_text("\n".join(registry_lines))
+    (mempalace_dir / "aaak_entities.md").write_text("\n".join(registry_lines), encoding="utf-8")

    # Critical facts bootstrap (pre-palace — before any mining)
    facts_lines = [
@@ -359,7 +359,7 @@ def _generate_aaak_bootstrap(
        ]
    )

-    (mempalace_dir / "critical_facts.md").write_text("\n".join(facts_lines))
+    (mempalace_dir / "critical_facts.md").write_text("\n".join(facts_lines), encoding="utf-8")


 def run_onboarding(
@@ -219,7 +219,7 @@ def split_file(filepath, output_dir, dry_run=False):
        if dry_run:
            print(f"  [{i + 1}/{len(boundaries) - 1}] {name}  ({len(chunk)} lines)")
        else:
-            out_path.write_text("".join(chunk))
+            out_path.write_text("".join(chunk), encoding="utf-8")
            print(f"  ✓ {name}  ({len(chunk)} lines)")

        written.append(out_path)
@@ -1,3 +1,3 @@
 """Single source of truth for the MemPalace package version."""

-__version__ = "3.0.0"
+__version__ = "3.0.14"