Merge branch 'main' into fix/query-sanitizer-prompt-contamination

This commit is contained in:
Ben Sigman
2026-04-09 08:11:39 -07:00
committed by GitHub
79 changed files with 9292 additions and 123 deletions
+17 -2
View File
@@ -1,6 +1,21 @@
"""MemPalace — Give your AI a memory. No API key required."""
from .cli import main
from .version import __version__
import logging
import os
import platform
from .cli import main # noqa: E402
from .version import __version__ # noqa: E402
# ChromaDB 0.6.x ships a Posthog telemetry client whose capture() signature is
# incompatible with the bundled posthog library, producing noisy stderr warnings
# on every client operation ("Failed to send telemetry event … capture() takes
# 1 positional argument but 3 were given"). Silence just that logger.
logging.getLogger("chromadb.telemetry.product.posthog").setLevel(logging.CRITICAL)
# ONNX Runtime's CoreML provider segfaults during vector queries on Apple Silicon.
# Force CPU execution unless the user has explicitly set a preference.
if platform.machine() == "arm64" and platform.system() == "Darwin":
os.environ.setdefault("ORT_DISABLE_COREML", "1")
__all__ = ["main", "__version__"]
+60
View File
@@ -226,6 +226,20 @@ def cmd_repair(args):
print(f"\n{'=' * 55}\n")
def cmd_hook(args):
    """Handle ``hook run``: forward the chosen hook and harness to the hook runner.

    The runner reads its JSON payload from stdin and writes a JSON result to stdout.
    """
    from .hooks_cli import run_hook

    selected_hook = args.hook
    selected_harness = args.harness
    run_hook(hook_name=selected_hook, harness=selected_harness)
def cmd_instructions(args):
    """Handle ``instructions <name>``: print the named skill instructions to stdout."""
    from .instructions_cli import run_instructions

    requested = args.name
    run_instructions(name=requested)
def cmd_compress(args):
"""Compress drawers in a wing using AAAK Dialect."""
import chromadb
@@ -451,6 +465,35 @@ def main():
help="Only split files containing at least N sessions (default: 2)",
)
# hook
p_hook = sub.add_parser(
"hook",
help="Run hook logic (reads JSON from stdin, outputs JSON to stdout)",
)
hook_sub = p_hook.add_subparsers(dest="hook_action")
p_hook_run = hook_sub.add_parser("run", help="Execute a hook")
p_hook_run.add_argument(
"--hook",
required=True,
choices=["session-start", "stop", "precompact"],
help="Hook name to run",
)
p_hook_run.add_argument(
"--harness",
required=True,
choices=["claude-code", "codex"],
help="Harness type (determines stdin JSON format)",
)
# instructions
p_instructions = sub.add_parser(
"instructions",
help="Output skill instructions to stdout",
)
instructions_sub = p_instructions.add_subparsers(dest="instructions_name")
for instr_name in ["init", "search", "mine", "help", "status"]:
instructions_sub.add_parser(instr_name, help=f"Output {instr_name} instructions")
# repair
sub.add_parser(
"repair",
@@ -466,6 +509,23 @@ def main():
parser.print_help()
return
# Handle two-level subcommands
if args.command == "hook":
if not getattr(args, "hook_action", None):
p_hook.print_help()
return
cmd_hook(args)
return
if args.command == "instructions":
name = getattr(args, "instructions_name", None)
if not name:
p_instructions.print_help()
return
args.name = name
cmd_instructions(args)
return
dispatch = {
"init": cmd_init,
"mine": cmd_mine,
+1 -1
View File
@@ -309,7 +309,7 @@ class EntityRegistry:
def save(self):
self._path.parent.mkdir(parents=True, exist_ok=True)
self._path.write_text(json.dumps(self._data, indent=2))
self._path.write_text(json.dumps(self._data, indent=2), encoding="utf-8")
@staticmethod
def _empty() -> dict:
+226
View File
@@ -0,0 +1,226 @@
"""
Hook logic for MemPalace — Python implementation of session-start, stop, and precompact hooks.
Reads JSON from stdin, outputs JSON to stdout.
Supported hooks: session-start, stop, precompact
Supported harnesses: claude-code, codex (extensible to cursor, gemini, etc.)
"""
import json
import os
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path
SAVE_INTERVAL = 15
STATE_DIR = Path.home() / ".mempalace" / "hook_state"
STOP_BLOCK_REASON = (
"AUTO-SAVE checkpoint. Save key topics, decisions, quotes, and code "
"from this session to your memory system. Organize into appropriate "
"categories. Use verbatim quotes where possible. Continue conversation "
"after saving."
)
PRECOMPACT_BLOCK_REASON = (
"COMPACTION IMMINENT. Save ALL topics, decisions, quotes, code, and "
"important context from this session to your memory system. Be thorough "
"\u2014 after compaction, detailed context will be lost. Organize into "
"appropriate categories. Use verbatim quotes where possible. Save "
"everything, then allow compaction to proceed."
)
def _sanitize_session_id(session_id: str) -> str:
"""Only allow alnum, dash, underscore to prevent path traversal."""
sanitized = re.sub(r"[^a-zA-Z0-9_-]", "", session_id)
return sanitized or "unknown"
def _count_human_messages(transcript_path: str) -> int:
"""Count human messages in a JSONL transcript, skipping command-messages."""
path = Path(transcript_path).expanduser()
if not path.is_file():
return 0
count = 0
try:
with open(path, encoding="utf-8", errors="replace") as f:
for line in f:
try:
entry = json.loads(line)
msg = entry.get("message", {})
if isinstance(msg, dict) and msg.get("role") == "user":
content = msg.get("content", "")
if isinstance(content, str):
if "<command-message>" in content:
continue
elif isinstance(content, list):
text = " ".join(
b.get("text", "") for b in content if isinstance(b, dict)
)
if "<command-message>" in text:
continue
count += 1
except (json.JSONDecodeError, AttributeError):
pass
except OSError:
return 0
return count
def _log(message: str):
    """Append a timestamped line to ``hook.log`` in the hook state directory.

    Logging is best-effort: an ``OSError`` while creating the directory or
    writing the file is ignored so a logging problem does not break a hook.

    Args:
        message: Text to record; it is prefixed with the local ``HH:MM:SS`` time.
    """
    try:
        STATE_DIR.mkdir(parents=True, exist_ok=True)
        log_path = STATE_DIR / "hook.log"
        timestamp = datetime.now().strftime("%H:%M:%S")
        # Explicit UTF-8 instead of the platform default, matching the other
        # text I/O in this module, so the write does not depend on the locale.
        with open(log_path, "a", encoding="utf-8") as f:
            f.write(f"[{timestamp}] {message}\n")
    except OSError:
        pass
def _output(data: dict):
"""Print JSON to stdout with consistent formatting (pretty-printed)."""
print(json.dumps(data, indent=2, ensure_ascii=False))
def _maybe_auto_ingest():
"""If MEMPAL_DIR is set and exists, run mempalace mine in background."""
mempal_dir = os.environ.get("MEMPAL_DIR", "")
if mempal_dir and os.path.isdir(mempal_dir):
try:
log_path = STATE_DIR / "hook.log"
with open(log_path, "a") as log_f:
subprocess.Popen(
[sys.executable, "-m", "mempalace", "mine", mempal_dir],
stdout=log_f,
stderr=log_f,
)
except OSError:
pass
SUPPORTED_HARNESSES = {"claude-code", "codex"}
def _parse_harness_input(data: dict, harness: str) -> dict:
"""Parse stdin JSON according to the harness type."""
if harness not in SUPPORTED_HARNESSES:
print(f"Unknown harness: {harness}", file=sys.stderr)
sys.exit(1)
return {
"session_id": _sanitize_session_id(str(data.get("session_id", "unknown"))),
"stop_hook_active": data.get("stop_hook_active", False),
"transcript_path": str(data.get("transcript_path", "")),
}
def hook_stop(data: dict, harness: str):
    """Stop hook: block once every ``SAVE_INTERVAL`` human messages for an auto-save.

    Emits ``{}`` (pass through) or a ``{"decision": "block", ...}`` object on
    stdout. The human-message count at the last triggered save is kept in a
    per-session file under the hook state directory.
    """
    fields = _parse_harness_input(data, harness)
    session_id = fields["session_id"]
    transcript_path = fields["transcript_path"]

    # A truthy flag means a save cycle is already running; blocking again
    # would loop forever, so let the stop through.
    already_saving = str(fields["stop_hook_active"]).lower() in ("true", "1", "yes")
    if already_saving:
        _output({})
        return

    exchange_count = _count_human_messages(transcript_path)

    # Read the message count recorded at the previous save (0 when absent or unreadable).
    STATE_DIR.mkdir(parents=True, exist_ok=True)
    marker = STATE_DIR / f"{session_id}_last_save"
    last_save = 0
    if marker.is_file():
        try:
            last_save = int(marker.read_text().strip())
        except (ValueError, OSError):
            last_save = 0

    since_last = exchange_count - last_save
    _log(f"Session {session_id}: {exchange_count} exchanges, {since_last} since last save")

    save_due = since_last >= SAVE_INTERVAL and exchange_count > 0
    if not save_due:
        _output({})
        return

    # Record the new save point; failing to persist it must not stop the save.
    try:
        marker.write_text(str(exchange_count), encoding="utf-8")
    except OSError:
        pass
    _log(f"TRIGGERING SAVE at exchange {exchange_count}")

    # Optional background ingest when MEMPAL_DIR is configured.
    _maybe_auto_ingest()
    _output({"decision": "block", "reason": STOP_BLOCK_REASON})
def hook_session_start(data: dict, harness: str):
    """Session-start hook: log the session, ensure the state directory exists, never block."""
    fields = _parse_harness_input(data, harness)
    _log(f"SESSION START for session {fields['session_id']}")

    # Make sure the hook state directory is present for this session.
    STATE_DIR.mkdir(parents=True, exist_ok=True)

    # An empty object tells the harness to continue normally.
    _output({})
def hook_precompact(data: dict, harness: str):
    """Precompact hook: always block with a comprehensive save instruction.

    When the ``MEMPAL_DIR`` environment variable names an existing directory,
    ``mempalace mine`` is first run on it synchronously (60-second limit) with
    its output appended to ``hook.log``. The mine step is best-effort: a
    timeout or an ``OSError`` never prevents the block decision from being
    written to stdout.
    """
    parsed = _parse_harness_input(data, harness)
    session_id = parsed["session_id"]

    _log(f"PRE-COMPACT triggered for session {session_id}")

    # Optional: auto-ingest synchronously before compaction (so memories land first)
    mempal_dir = os.environ.get("MEMPAL_DIR", "")
    if mempal_dir and os.path.isdir(mempal_dir):
        try:
            log_path = STATE_DIR / "hook.log"
            with open(log_path, "a") as log_f:
                subprocess.run(
                    [sys.executable, "-m", "mempalace", "mine", mempal_dir],
                    stdout=log_f,
                    stderr=log_f,
                    timeout=60,
                )
        except subprocess.TimeoutExpired:
            # TimeoutExpired is not an OSError; left unhandled it would abort
            # the hook before the block decision below is emitted.
            _log("WARNING: pre-compact mine timed out after 60s, continuing")
        except OSError:
            pass

    # Always block -- compaction = save everything
    _output({"decision": "block", "reason": PRECOMPACT_BLOCK_REASON})
def run_hook(hook_name: str, harness: str):
    """Main entry point: read stdin JSON and dispatch to the matching hook handler.

    Stdin that cannot be decoded or parsed, or that parses to something other
    than a JSON object, is logged and treated as an empty payload so the
    handlers can always call ``data.get``.

    Args:
        hook_name: One of ``"session-start"``, ``"stop"`` or ``"precompact"``.
        harness: Harness identifier passed through to the handler.

    An unknown *hook_name* prints an error to stderr and exits with status 1.
    """
    try:
        data = json.load(sys.stdin)
    except (json.JSONDecodeError, UnicodeDecodeError, EOFError):
        _log("WARNING: Failed to parse stdin JSON, proceeding with empty data")
        data = {}

    # Valid JSON need not be an object (e.g. "[]" or "null"); the handlers
    # expect a dict and would otherwise fail with AttributeError.
    if not isinstance(data, dict):
        _log("WARNING: stdin JSON is not an object, proceeding with empty data")
        data = {}

    hooks = {
        "session-start": hook_session_start,
        "stop": hook_stop,
        "precompact": hook_precompact,
    }

    handler = hooks.get(hook_name)
    if handler is None:
        print(f"Unknown hook: {hook_name}", file=sys.stderr)
        sys.exit(1)

    handler(data, harness)
+105
View File
@@ -0,0 +1,105 @@
# MemPalace
AI memory system. Store everything, find anything. Local, free, no API key.
---
## Slash Commands
| Command | Description |
|----------------------|--------------------------------|
| /mempalace:init | Install and set up MemPalace |
| /mempalace:search | Search your memories |
| /mempalace:mine | Mine projects and conversations|
| /mempalace:status | Palace overview and stats |
| /mempalace:help | This help message |
---
## MCP Tools (19)
### Palace (read)
- mempalace_status -- Palace status and stats
- mempalace_list_wings -- List all wings
- mempalace_list_rooms -- List rooms in a wing
- mempalace_get_taxonomy -- Get the full taxonomy tree
- mempalace_search -- Search memories by query
- mempalace_check_duplicate -- Check if a memory already exists
- mempalace_get_aaak_spec -- Get the AAAK specification
### Palace (write)
- mempalace_add_drawer -- Add a new memory (drawer)
- mempalace_delete_drawer -- Delete a memory (drawer)
### Knowledge Graph
- mempalace_kg_query -- Query the knowledge graph
- mempalace_kg_add -- Add a knowledge graph entry
- mempalace_kg_invalidate -- Invalidate a knowledge graph entry
- mempalace_kg_timeline -- View knowledge graph timeline
- mempalace_kg_stats -- Knowledge graph statistics
### Navigation
- mempalace_traverse -- Traverse the palace structure
- mempalace_find_tunnels -- Find cross-wing connections
- mempalace_graph_stats -- Graph connectivity statistics
### Agent Diary
- mempalace_diary_write -- Write a diary entry
- mempalace_diary_read -- Read diary entries
---
## CLI Commands
mempalace init <dir> Initialize a new palace
mempalace mine <dir> Mine a project (default mode)
mempalace mine <dir> --mode convos Mine conversation exports
mempalace search "query" Search your memories
mempalace split <dir> Split large transcript files
mempalace wake-up Load palace into context
mempalace compress Compress palace storage
mempalace status Show palace status
mempalace repair Rebuild vector index
mempalace hook run Run hook logic (for harness integration)
mempalace instructions <name> Output skill instructions
---
## Auto-Save Hooks
- Stop hook -- Automatically saves memories every 15 messages. Counts human
messages in the session transcript (skipping command-messages). When the
threshold is reached, blocks the AI with a save instruction. Uses
~/.mempalace/hook_state/ to track save points per session. If
stop_hook_active is true, passes through to prevent infinite loops.
- PreCompact hook -- Emergency save before context compaction. Always blocks
with a comprehensive save instruction because compaction means the AI is
about to lose detailed context.
Hooks read JSON from stdin and output JSON to stdout. They can be invoked via:
echo '{"session_id":"abc","stop_hook_active":false,"transcript_path":"..."}' | mempalace hook run --hook stop --harness claude-code
---
## Architecture
Wings (projects/people)
+-- Rooms (topics)
+-- Closets (summaries)
+-- Drawers (verbatim memories)
Halls connect rooms within a wing.
Tunnels connect rooms across wings.
The palace is stored locally using ChromaDB for vector search and SQLite for
metadata. No cloud services or API keys required.
---
## Getting Started
1. /mempalace:init -- Set up your palace
2. /mempalace:mine -- Mine a project or conversation
3. /mempalace:search -- Find what you stored
+69
View File
@@ -0,0 +1,69 @@
# MemPalace Init
Guide the user through a complete MemPalace setup. Follow each step in order,
stopping to report errors and attempt remediation before proceeding.
## Step 1: Check Python version
Run `python3 --version` (or `python --version` on Windows) and confirm the
version is 3.9 or higher. If Python is not found or the version is too old,
tell the user they need Python 3.9+ installed and stop.
## Step 2: Check if mempalace is already installed
Run `pip show mempalace` to see if the package is already present. If it is,
report the installed version and skip to Step 4.
## Step 3: Install mempalace
Run `pip install mempalace`.
### Error handling -- pip failures
If `pip install mempalace` fails, try these fallbacks in order:
1. Try `pip3 install mempalace`
2. Try `python -m pip install mempalace` (or `python3 -m pip install mempalace`)
3. If the error mentions missing build tools or compilation failures (commonly
from chromadb or its native dependencies):
- On Linux/macOS: suggest `sudo apt-get install build-essential python3-dev`
(Debian/Ubuntu) or `xcode-select --install` (macOS)
- On Windows: suggest installing Microsoft C++ Build Tools from
https://visualstudio.microsoft.com/visual-cpp-build-tools/
- Then retry the install command
4. If all attempts fail, report the error clearly and stop.
## Step 4: Ask for project directory
Ask the user which project directory they want to initialize with MemPalace.
Offer the current working directory as the default. Wait for their response
before continuing.
## Step 5: Initialize the palace
Run `mempalace init <dir>` where `<dir>` is the directory from Step 4.
If this fails, report the error and stop.
## Step 6: Configure MCP server
Run the following command to register the MemPalace MCP server with Claude:
claude mcp add mempalace -- python -m mempalace.mcp_server
If this fails, report the error but continue to the next step (MCP
configuration can be done manually later).
## Step 7: Verify installation
Run `mempalace status` and confirm the output shows a healthy palace.
If the command fails or reports errors, walk the user through troubleshooting
based on the output.
## Step 8: Show next steps
Tell the user setup is complete and suggest these next actions:
- Use /mempalace:mine to start adding data to their palace
- Use /mempalace:search to query their palace and retrieve stored knowledge
+64
View File
@@ -0,0 +1,64 @@
# MemPalace Mine
When the user invokes this skill, follow these steps:
## 1. Ask what to mine
Ask the user what they want to mine and where the source data is located.
Clarify:
- Is it a project directory (code, docs, notes)?
- Is it conversation exports (Claude, ChatGPT, Slack)?
- Do they want auto-classification (decisions, milestones, problems)?
## 2. Choose the mining mode
There are three mining modes:
### Project mining
mempalace mine <dir>
Mines code files, documentation, and notes from a project directory.
### Conversation mining
mempalace mine <dir> --mode convos
Mines conversation exports from Claude, ChatGPT, or Slack into the palace.
### General extraction (auto-classify)
mempalace mine <dir> --mode convos --extract general
Auto-classifies mined content into decisions, milestones, and problems.
## 3. Optionally split mega-files first
If the source directory contains very large files, suggest splitting them
before mining:
mempalace split <dir> [--dry-run]
Use --dry-run first to preview what will be split without making changes.
## 4. Optionally tag with a wing
If the user wants to organize mined content under a specific wing, add the
--wing flag:
mempalace mine <dir> --wing <name>
## 5. Show progress and results
Run the selected mining command and display progress as it executes. After
completion, summarize the results including:
- Number of items mined
- Categories or classifications applied
- Any warnings or skipped files
## 6. Suggest next steps
After mining completes, suggest the user try:
- /mempalace:search -- search the newly mined content
- /mempalace:status -- check the current state of their palace
- Mine more data from additional sources
+57
View File
@@ -0,0 +1,57 @@
# MemPalace Search
When the user wants to search their MemPalace memories, follow these steps:
## 1. Parse the Search Query
Extract the core search intent from the user's message. Identify any explicit
or implicit filters:
- Wing -- a top-level category (e.g., "work", "personal", "research")
- Room -- a sub-category within a wing
- Keywords / semantic query -- the actual search terms
## 2. Determine Wing/Room Filters
If the user mentions a specific domain, topic area, or context, map it to the
appropriate wing and/or room. If unsure, omit filters to search globally. You
can discover the taxonomy first if needed.
## 3. Use MCP Tools (Preferred)
If MCP tools are available, use them in this priority order:
- mempalace_search(query, wing, room) -- Primary search tool. Pass the semantic
query and any wing/room filters.
- mempalace_list_wings -- Discover all available wings. Use when the user asks
what categories exist or you need to resolve a wing name.
- mempalace_list_rooms(wing) -- List rooms within a specific wing. Use to help
the user navigate or to resolve a room name.
- mempalace_get_taxonomy -- Retrieve the full wing/room/drawer tree. Use when
the user wants an overview of their entire memory structure.
- mempalace_traverse(room) -- Walk the knowledge graph starting from a room.
Use when the user wants to explore connections and related memories.
- mempalace_find_tunnels(wing1, wing2) -- Find cross-wing connections (tunnels)
between two wings. Use when the user asks about relationships between
different knowledge domains.
## 4. CLI Fallback
If MCP tools are not available, fall back to the CLI:
mempalace search "query" [--wing X] [--room Y]
## 5. Present Results
When presenting search results:
- Always include source attribution: wing, room, and drawer for each result
- Show relevance or similarity scores if available
- Group results by wing/room when returning multiple hits
- Quote or summarize the memory content clearly
## 6. Offer Next Steps
After presenting results, offer the user options to go deeper:
- Drill deeper -- search within a specific room or narrow the query
- Traverse -- explore the knowledge graph from a related room
- Check tunnels -- look for cross-wing connections if the topic spans domains
- Browse taxonomy -- show the full structure for manual exploration
+49
View File
@@ -0,0 +1,49 @@
# MemPalace Status
Display the current state of the user's memory palace.
## Step 1: Gather Palace Status
Check if MCP tools are available (look for mempalace_status in available tools).
- If MCP is available: Call the mempalace_status tool to retrieve palace state.
- If MCP is not available: Run the CLI command: mempalace status
## Step 2: Display Wing/Room/Drawer Counts
Present the palace structure counts clearly:
- Number of wings
- Number of rooms
- Number of drawers
- Total memories stored
Keep the output concise -- use a brief summary format, not verbose tables.
## Step 3: Knowledge Graph Stats (MCP only)
If MCP tools are available, also call:
- mempalace_kg_stats -- for a knowledge graph overview (triple count, entity
count, relationship types)
- mempalace_graph_stats -- for connectivity information (connected components,
average connections per entity)
Present these alongside the palace counts in a unified summary.
## Step 4: Suggest Next Actions
Based on the current state, suggest one relevant action:
- Empty palace (zero memories): Suggest "Try /mempalace:mine to add data from
project directories or conversation exports."
- Has data but no knowledge graph (memories exist but KG stats show zero
triples): Suggest "Consider adding knowledge graph triples for richer
queries."
- Healthy palace (has memories and KG data): Suggest "Use /mempalace:search to
query your memories."
## Output Style
- Be concise and informative -- aim for a quick glance, not a report.
- Use short labels and numbers, not prose paragraphs.
- If any step fails or a tool is unavailable, note it briefly and continue
with what is available.
+28
View File
@@ -0,0 +1,28 @@
"""
Instruction text output for MemPalace CLI commands.
Each instruction lives as a .md file in the instructions/ directory
inside the package. The CLI reads and prints the file content.
"""
import sys
from pathlib import Path
INSTRUCTIONS_DIR = Path(__file__).parent / "instructions"

AVAILABLE = ["init", "search", "mine", "help", "status"]


def run_instructions(name: str):
    """Read and print the instruction .md file for the given name.

    Args:
        name: Instruction set to print; must be one of ``AVAILABLE``.

    An unknown name, or a missing ``<name>.md`` file in ``INSTRUCTIONS_DIR``,
    prints an error to stderr and exits with status 1.
    """
    if name not in AVAILABLE:
        print(f"Unknown instructions: {name}", file=sys.stderr)
        print(f"Available: {', '.join(sorted(AVAILABLE))}", file=sys.stderr)
        sys.exit(1)

    md_path = INSTRUCTIONS_DIR / f"{name}.md"
    if not md_path.is_file():
        print(f"Instructions file not found: {md_path}", file=sys.stderr)
        sys.exit(1)

    # Decode explicitly as UTF-8 rather than with the platform default, in
    # line with the explicit encoding used for text I/O elsewhere in the package.
    print(md_path.read_text(encoding="utf-8"))
+47 -17
View File
@@ -2,7 +2,7 @@
"""
MemPalace MCP Server — read/write palace access for Claude Code
================================================================
Install: claude mcp add mempalace -- python -m mempalace.mcp_server
Install: claude mcp add mempalace -- python -m mempalace.mcp_server [--palace /path/to/palace]
Tools (read):
mempalace_status — total drawers, wing/room breakdown
@@ -17,6 +17,8 @@ Tools (write):
mempalace_delete_drawer — remove a drawer by ID
"""
import argparse
import os
import sys
import json
import logging
@@ -32,21 +34,50 @@ import chromadb
from .knowledge_graph import KnowledgeGraph
_kg = KnowledgeGraph()
logging.basicConfig(level=logging.INFO, format="%(message)s", stream=sys.stderr)
logger = logging.getLogger("mempalace_mcp")
def _parse_args():
parser = argparse.ArgumentParser(description="MemPalace MCP Server")
parser.add_argument(
"--palace",
metavar="PATH",
help="Path to the palace directory (overrides config file and env var)",
)
args, unknown = parser.parse_known_args()
if unknown:
logger.debug("Ignoring unknown args: %s", unknown)
return args
_args = _parse_args()
if _args.palace:
os.environ["MEMPALACE_PALACE_PATH"] = os.path.abspath(_args.palace)
_config = MempalaceConfig()
if _args.palace:
_kg = KnowledgeGraph(db_path=os.path.join(_config.palace_path, "knowledge_graph.sqlite3"))
else:
_kg = KnowledgeGraph()
_client_cache = None
_collection_cache = None
def _get_collection(create=False):
"""Return the ChromaDB collection, or None on failure."""
"""Return the ChromaDB collection, caching the client between calls."""
global _client_cache, _collection_cache
try:
client = chromadb.PersistentClient(path=_config.palace_path)
if _client_cache is None:
_client_cache = chromadb.PersistentClient(path=_config.palace_path)
if create:
return client.get_or_create_collection(_config.collection_name)
return client.get_collection(_config.collection_name)
_collection_cache = _client_cache.get_or_create_collection(_config.collection_name)
elif _collection_cache is None:
_collection_cache = _client_cache.get_collection(_config.collection_name)
return _collection_cache
except Exception:
return None
@@ -270,19 +301,18 @@ def tool_add_drawer(
if not col:
return _no_palace()
# Duplicate check
dup = tool_check_duplicate(content, threshold=0.9)
if dup.get("is_duplicate"):
return {
"success": False,
"reason": "duplicate",
"matches": dup["matches"],
}
drawer_id = f"drawer_{wing}_{room}_{hashlib.md5(content.encode()).hexdigest()[:16]}"
drawer_id = f"drawer_{wing}_{room}_{hashlib.md5((content[:100] + datetime.now().isoformat()).encode()).hexdigest()[:16]}"
# Idempotency: if the deterministic ID already exists, return success as a no-op.
try:
existing = col.get(ids=[drawer_id])
if existing and existing["ids"]:
return {"success": True, "reason": "already_exists", "drawer_id": drawer_id}
except Exception:
pass
try:
col.add(
col.upsert(
ids=[drawer_id],
documents=[content],
metadatas=[
+38 -25
View File
@@ -403,10 +403,22 @@ def get_collection(palace_path: str):
def file_already_mined(collection, source_file: str) -> bool:
"""Fast check: has this file been filed before?"""
"""Fast check: has this file been filed before and is unchanged?
Compares the stored mtime in drawer metadata against the file's current
mtime. Returns False (needs re-mining) when the file has been modified
since it was last mined, or when no mtime was stored.
"""
try:
results = collection.get(where={"source_file": source_file}, limit=1)
return len(results.get("ids", [])) > 0
if not results.get("ids"):
return False
stored_meta = results["metadatas"][0] if results.get("metadatas") else {}
stored_mtime = stored_meta.get("source_mtime")
if stored_mtime is None:
return False
current_mtime = os.path.getmtime(source_file)
return float(stored_mtime) == current_mtime
except Exception:
return False
@@ -417,24 +429,26 @@ def add_drawer(
"""Add one drawer to the palace."""
drawer_id = f"drawer_{wing}_{room}_{hashlib.md5((source_file + str(chunk_index)).encode(), usedforsecurity=False).hexdigest()[:16]}"
try:
collection.add(
metadata = {
"wing": wing,
"room": room,
"source_file": source_file,
"chunk_index": chunk_index,
"added_by": agent,
"filed_at": datetime.now().isoformat(),
}
# Store file mtime so we can detect modifications later.
try:
metadata["source_mtime"] = os.path.getmtime(source_file)
except OSError:
pass
collection.upsert(
documents=[content],
ids=[drawer_id],
metadatas=[
{
"wing": wing,
"room": room,
"source_file": source_file,
"chunk_index": chunk_index,
"added_by": agent,
"filed_at": datetime.now().isoformat(),
}
],
metadatas=[metadata],
)
return True
except Exception as e:
if "already exists" in str(e).lower() or "duplicate" in str(e).lower():
return False
except Exception:
raise
@@ -451,29 +465,29 @@ def process_file(
rooms: list,
agent: str,
dry_run: bool,
) -> int:
"""Read, chunk, route, and file one file. Returns drawer count."""
) -> tuple:
"""Read, chunk, route, and file one file. Returns (drawer_count, room_name)."""
# Skip if already filed
source_file = str(filepath)
if not dry_run and file_already_mined(collection, source_file):
return 0
return 0, None
try:
content = filepath.read_text(encoding="utf-8", errors="replace")
except OSError:
return 0
return 0, None
content = content.strip()
if len(content) < MIN_CHUNK_SIZE:
return 0
return 0, None
room = detect_room(filepath, content, rooms, project_path)
chunks = chunk_text(content, source_file)
if dry_run:
print(f" [DRY RUN] {filepath.name} → room:{room} ({len(chunks)} drawers)")
return len(chunks)
return len(chunks), room
drawers_added = 0
for chunk in chunks:
@@ -489,7 +503,7 @@ def process_file(
if added:
drawers_added += 1
return drawers_added
return drawers_added, room
# =============================================================================
@@ -608,7 +622,7 @@ def mine(
room_counts = defaultdict(int)
for i, filepath in enumerate(files, 1):
drawers = process_file(
drawers, room = process_file(
filepath=filepath,
project_path=project_path,
collection=collection,
@@ -621,7 +635,6 @@ def mine(
files_skipped += 1
else:
total_drawers += drawers
room = detect_room(filepath, "", rooms, project_path)
room_counts[room] += 1
if not dry_run:
print(f" ✓ [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers}")
+2 -2
View File
@@ -312,7 +312,7 @@ def _generate_aaak_bootstrap(
]
)
(mempalace_dir / "aaak_entities.md").write_text("\n".join(registry_lines))
(mempalace_dir / "aaak_entities.md").write_text("\n".join(registry_lines), encoding="utf-8")
# Critical facts bootstrap (pre-palace — before any mining)
facts_lines = [
@@ -359,7 +359,7 @@ def _generate_aaak_bootstrap(
]
)
(mempalace_dir / "critical_facts.md").write_text("\n".join(facts_lines))
(mempalace_dir / "critical_facts.md").write_text("\n".join(facts_lines), encoding="utf-8")
def run_onboarding(
+1 -1
View File
@@ -219,7 +219,7 @@ def split_file(filepath, output_dir, dry_run=False):
if dry_run:
print(f" [{i + 1}/{len(boundaries) - 1}] {name} ({len(chunk)} lines)")
else:
out_path.write_text("".join(chunk))
out_path.write_text("".join(chunk), encoding="utf-8")
print(f"{name} ({len(chunk)} lines)")
written.append(out_path)
+1 -1
View File
@@ -1,3 +1,3 @@
"""Single source of truth for the MemPalace package version."""
__version__ = "3.0.0"
__version__ = "3.0.14"