Merge branch 'MemPalace:develop' into feat/add-i18n-hindi
This commit is contained in:
@@ -2,7 +2,6 @@
|
||||
|
||||
import logging
|
||||
|
||||
from .cli import main # noqa: E402
|
||||
from .version import __version__ # noqa: E402
|
||||
|
||||
# ChromaDB 0.6.x ships a Posthog telemetry client whose capture() signature is
|
||||
@@ -25,4 +24,4 @@ logging.getLogger("chromadb.telemetry.product.posthog").setLevel(logging.CRITICA
|
||||
# intact, so the real fix is upgrading chromadb to 1.5.4+, which #581
|
||||
# proposes. See #397 for the history of this line.
|
||||
|
||||
__all__ = ["main", "__version__"]
|
||||
__all__ = ["__version__"]
|
||||
|
||||
@@ -27,6 +27,11 @@ class BaseCollection(ABC):
|
||||
) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
def update(self, **kwargs: Any) -> None:
    """Update existing records. Must raise if any ID is missing.

    Abstract hook: this base version only raises ``NotImplementedError``.
    Keyword arguments are accepted as-is; their meaning is left to the
    concrete backend.
    """
    raise NotImplementedError
|
||||
|
||||
@abstractmethod
def query(self, **kwargs: Any) -> Dict[str, Any]:
    """Run a query and return the result as a mapping.

    Abstract hook: this base version only raises ``NotImplementedError``.
    """
    raise NotImplementedError
|
||||
|
||||
@@ -55,6 +55,9 @@ class ChromaCollection(BaseCollection):
|
||||
def upsert(self, *, documents, ids, metadatas=None):
    """Forward *documents*, *ids* and *metadatas* unchanged to the wrapped collection's ``upsert``."""
    self._collection.upsert(documents=documents, ids=ids, metadatas=metadatas)
|
||||
|
||||
def update(self, **kwargs):
    """Forward all keyword arguments unchanged to the wrapped collection's ``update``."""
    self._collection.update(**kwargs)
|
||||
|
||||
def query(self, **kwargs):
    """Forward all keyword arguments to the wrapped collection's ``query`` and return its result."""
    return self._collection.query(**kwargs)
|
||||
|
||||
@@ -71,6 +74,44 @@ class ChromaCollection(BaseCollection):
|
||||
class ChromaBackend:
|
||||
"""Factory for MemPalace's default ChromaDB backend."""
|
||||
|
||||
def __init__(self):
    """Start with an empty per-instance client cache."""
    # Per-instance client cache: palace_path -> chromadb.PersistentClient
    self._clients: dict = {}
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _client(self, palace_path: str):
    """Look up the PersistentClient cached for *palace_path*, building it on first use.

    On a cache miss the BLOB seq_id fix-up runs before the client is
    created, and the new client is stored for later calls.
    """
    client = self._clients.get(palace_path)
    if client is None:
        _fix_blob_seq_ids(palace_path)
        client = chromadb.PersistentClient(path=palace_path)
        self._clients[palace_path] = client
    return client
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public static helpers (for callers that manage their own caching)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
def make_client(palace_path: str):
    """Build a brand-new PersistentClient for *palace_path*, bypassing any cache.

    The BLOB seq_id fix-up runs first. Meant for long-lived callers
    (e.g. mcp_server) that maintain their own inode/mtime-based client
    cache.
    """
    _fix_blob_seq_ids(palace_path)
    fresh_client = chromadb.PersistentClient(path=palace_path)
    return fresh_client
|
||||
|
||||
@staticmethod
def backend_version() -> str:
    """Return the installed chromadb package version string (``chromadb.__version__``)."""
    return chromadb.__version__
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Collection lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_collection(self, palace_path: str, collection_name: str, create: bool = False):
|
||||
if not create and not os.path.isdir(palace_path):
|
||||
raise FileNotFoundError(palace_path)
|
||||
@@ -82,10 +123,30 @@ class ChromaBackend:
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
|
||||
_fix_blob_seq_ids(palace_path)
|
||||
client = chromadb.PersistentClient(path=palace_path)
|
||||
client = self._client(palace_path)
|
||||
if create:
|
||||
collection = client.get_or_create_collection(collection_name)
|
||||
collection = client.get_or_create_collection(
|
||||
collection_name, metadata={"hnsw:space": "cosine"}
|
||||
)
|
||||
else:
|
||||
collection = client.get_collection(collection_name)
|
||||
return ChromaCollection(collection)
|
||||
|
||||
def get_or_create_collection(
    self, palace_path: str, collection_name: str
) -> "ChromaCollection":
    """Convenience wrapper: call ``get_collection`` with ``create=True``."""
    wrapped = self.get_collection(palace_path, collection_name, create=True)
    return wrapped
|
||||
|
||||
def delete_collection(self, palace_path: str, collection_name: str) -> None:
    """Remove *collection_name* from the palace stored at *palace_path*.

    Uses the cached client for that path; nothing is returned.
    """
    client = self._client(palace_path)
    client.delete_collection(collection_name)
|
||||
|
||||
def create_collection(
    self, palace_path: str, collection_name: str, hnsw_space: str = "cosine"
) -> "ChromaCollection":
    """Create (never get-or-create) *collection_name* in the palace at *palace_path*.

    The collection's ``hnsw:space`` metadata is set to *hnsw_space*
    (``"cosine"`` unless overridden). The new collection is returned wrapped
    in a ``ChromaCollection``.
    """
    client = self._client(palace_path)
    space_metadata = {"hnsw:space": hnsw_space}
    raw_collection = client.create_collection(collection_name, metadata=space_metadata)
    return ChromaCollection(raw_collection)
|
||||
|
||||
+82
-14
@@ -36,18 +36,62 @@ from pathlib import Path
|
||||
from .config import MempalaceConfig
|
||||
|
||||
|
||||
_MEMPALACE_PROJECT_FILES = ("mempalace.yaml", "entities.json")


def _ensure_mempalace_files_gitignored(project_dir) -> bool:
    """If project_dir is a git repo, ensure MemPalace's per-project files
    are listed in .gitignore so they don't get committed by accident.

    Returns True if .gitignore was updated, False otherwise (not a git
    repo, or every file is already listed). Issue #185: `mempalace init`
    writes mempalace.yaml + entities.json into the project root, where they
    previously had no protection against being staged into git.

    The existing .gitignore is only inspected, never rewritten: missing
    entries are appended, so whatever bytes are already there stay intact.
    """
    from pathlib import Path

    project_path = Path(project_dir).expanduser().resolve()
    if not (project_path / ".git").exists():
        return False

    gitignore = project_path / ".gitignore"
    if gitignore.exists():
        # Decode explicitly instead of relying on the locale default, and
        # tolerate stray non-UTF-8 bytes: the text is only used for a
        # membership check, so a replacement character is harmless while a
        # UnicodeDecodeError would abort the whole `init`.
        existing = gitignore.read_text(encoding="utf-8", errors="replace")
    else:
        existing = ""

    existing_lines = {line.strip() for line in existing.splitlines()}
    missing = [p for p in _MEMPALACE_PROJECT_FILES if p not in existing_lines]
    if not missing:
        return False

    # Start on a fresh line when the current file lacks a trailing newline.
    prefix = "" if not existing or existing.endswith("\n") else "\n"
    block = prefix + "\n# MemPalace per-project files (issue #185)\n" + "\n".join(missing) + "\n"
    with open(gitignore, "a", encoding="utf-8") as f:
        f.write(block)
    print(f" Added {', '.join(missing)} to {gitignore.name}")
    return True
|
||||
|
||||
|
||||
def cmd_init(args):
|
||||
import json
|
||||
from pathlib import Path
|
||||
from .entity_detector import scan_for_detection, detect_entities, confirm_entities
|
||||
from .room_detector_local import detect_rooms_local
|
||||
|
||||
cfg = MempalaceConfig()
|
||||
|
||||
# Resolve entity-detection languages: --lang overrides config.
|
||||
lang_arg = getattr(args, "lang", None)
|
||||
if lang_arg:
|
||||
languages = [s.strip() for s in lang_arg.split(",") if s.strip()] or ["en"]
|
||||
cfg.set_entity_languages(languages)
|
||||
else:
|
||||
languages = cfg.entity_languages
|
||||
languages_tuple = tuple(languages)
|
||||
|
||||
# Pass 1: auto-detect people and projects from file content
|
||||
print(f"\n Scanning for entities in: {args.dir}")
|
||||
if languages_tuple != ("en",):
|
||||
print(f" Languages: {', '.join(languages_tuple)}")
|
||||
files = scan_for_detection(args.dir)
|
||||
if files:
|
||||
print(f" Reading {len(files)} files...")
|
||||
detected = detect_entities(files)
|
||||
detected = detect_entities(files, languages=languages_tuple)
|
||||
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
|
||||
if total > 0:
|
||||
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
|
||||
@@ -62,7 +106,10 @@ def cmd_init(args):
|
||||
|
||||
# Pass 2: detect rooms from folder structure
|
||||
detect_rooms_local(project_dir=args.dir, yes=getattr(args, "yes", False))
|
||||
MempalaceConfig().init()
|
||||
cfg.init()
|
||||
|
||||
# Pass 3: protect git repos from accidentally committing per-project files
|
||||
_ensure_mempalace_files_gitignored(args.dir)
|
||||
|
||||
|
||||
def cmd_mine(args):
|
||||
@@ -156,7 +203,11 @@ def cmd_migrate(args):
|
||||
from .migrate import migrate
|
||||
|
||||
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
|
||||
migrate(palace_path=palace_path, dry_run=args.dry_run, confirm=getattr(args, "yes", False))
|
||||
migrate(
|
||||
palace_path=palace_path,
|
||||
dry_run=args.dry_run,
|
||||
confirm=getattr(args, "yes", False),
|
||||
)
|
||||
|
||||
|
||||
def cmd_status(args):
|
||||
@@ -168,8 +219,8 @@ def cmd_status(args):
|
||||
|
||||
def cmd_repair(args):
|
||||
"""Rebuild palace vector index from SQLite metadata."""
|
||||
import chromadb
|
||||
import shutil
|
||||
from .backends.chroma import ChromaBackend
|
||||
from .migrate import confirm_destructive_action, contains_palace_database
|
||||
|
||||
palace_path = os.path.abspath(
|
||||
@@ -189,10 +240,11 @@ def cmd_repair(args):
|
||||
print(f"{'=' * 55}\n")
|
||||
print(f" Palace: {palace_path}")
|
||||
|
||||
backend = ChromaBackend()
|
||||
|
||||
# Try to read existing drawers
|
||||
try:
|
||||
client = chromadb.PersistentClient(path=palace_path)
|
||||
col = client.get_collection("mempalace_drawers")
|
||||
col = backend.get_collection(palace_path, "mempalace_drawers")
|
||||
total = col.count()
|
||||
print(f" Drawers found: {total}")
|
||||
except Exception as e:
|
||||
@@ -239,8 +291,8 @@ def cmd_repair(args):
|
||||
shutil.copytree(palace_path, backup_path)
|
||||
|
||||
print(" Rebuilding collection...")
|
||||
client.delete_collection("mempalace_drawers")
|
||||
new_col = client.create_collection("mempalace_drawers")
|
||||
backend.delete_collection(palace_path, "mempalace_drawers")
|
||||
new_col = backend.create_collection(palace_path, "mempalace_drawers")
|
||||
|
||||
filed = 0
|
||||
for i in range(0, len(all_ids), batch_size):
|
||||
@@ -293,7 +345,7 @@ def cmd_mcp(args):
|
||||
|
||||
def cmd_compress(args):
|
||||
"""Compress drawers in a wing using AAAK Dialect."""
|
||||
import chromadb
|
||||
from .backends.chroma import ChromaBackend
|
||||
from .dialect import Dialect
|
||||
|
||||
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
|
||||
@@ -313,9 +365,9 @@ def cmd_compress(args):
|
||||
dialect = Dialect()
|
||||
|
||||
# Connect to palace
|
||||
backend = ChromaBackend()
|
||||
try:
|
||||
client = chromadb.PersistentClient(path=palace_path)
|
||||
col = client.get_collection("mempalace_drawers")
|
||||
col = backend.get_collection(palace_path, "mempalace_drawers")
|
||||
except Exception:
|
||||
print(f"\n No palace found at {palace_path}")
|
||||
print(" Run: mempalace init <dir> then mempalace mine <dir>")
|
||||
@@ -328,7 +380,11 @@ def cmd_compress(args):
|
||||
offset = 0
|
||||
while True:
|
||||
try:
|
||||
kwargs = {"include": ["documents", "metadatas"], "limit": _BATCH, "offset": offset}
|
||||
kwargs = {
|
||||
"include": ["documents", "metadatas"],
|
||||
"limit": _BATCH,
|
||||
"offset": offset,
|
||||
}
|
||||
if where:
|
||||
kwargs["where"] = where
|
||||
batch = col.get(**kwargs)
|
||||
@@ -386,7 +442,7 @@ def cmd_compress(args):
|
||||
# Store compressed versions (unless dry-run)
|
||||
if not args.dry_run:
|
||||
try:
|
||||
comp_col = client.get_or_create_collection("mempalace_compressed")
|
||||
comp_col = backend.get_or_create_collection(palace_path, "mempalace_compressed")
|
||||
for doc_id, compressed, meta, stats in compressed_entries:
|
||||
comp_meta = dict(meta)
|
||||
comp_meta["compression_ratio"] = round(stats["size_ratio"], 1)
|
||||
@@ -431,7 +487,19 @@ def main():
|
||||
p_init = sub.add_parser("init", help="Detect rooms from your folder structure")
|
||||
p_init.add_argument("dir", help="Project directory to set up")
|
||||
p_init.add_argument(
|
||||
"--yes", action="store_true", help="Auto-accept all detected entities (non-interactive)"
|
||||
"--yes",
|
||||
action="store_true",
|
||||
help="Auto-accept all detected entities (non-interactive)",
|
||||
)
|
||||
p_init.add_argument(
|
||||
"--lang",
|
||||
default=None,
|
||||
help=(
|
||||
"Comma-separated language codes for entity detection "
|
||||
"(e.g. 'en' or 'en,pt-br'). Defaults to value from config "
|
||||
"(MEMPALACE_ENTITY_LANGUAGES env var or config.json), or 'en'. "
|
||||
"When given, the value is also persisted to config.json."
|
||||
),
|
||||
)
|
||||
|
||||
# mine
|
||||
|
||||
@@ -0,0 +1,351 @@
|
||||
"""
|
||||
closet_llm.py — Generate closets via a user-configured LLM for richer indexing.
|
||||
|
||||
The regex-based closet extraction catches action verbs, headers, and proper
|
||||
nouns — but misses implicit topics, foreign-language content, and contextual
|
||||
references. An LLM reads everything and produces better closets.
|
||||
|
||||
This module is **OPTIONAL and opt-in**. Regex closets are always created by
|
||||
the miner; this path regenerates them afterward using whatever LLM the user
|
||||
chooses. Core memory operations remain API-free by design (see CLAUDE.md,
|
||||
"Local-first, zero API").
|
||||
|
||||
## Bring-your-own-LLM configuration
|
||||
|
||||
The endpoint is any OpenAI-compatible Chat Completions URL:
|
||||
|
||||
LLM_ENDPOINT=http://localhost:11434/v1 # Ollama
|
||||
LLM_ENDPOINT=http://localhost:8000/v1 # vLLM, llama.cpp
|
||||
LLM_ENDPOINT=https://api.openai.com/v1
|
||||
LLM_ENDPOINT=https://openrouter.ai/api/v1
|
||||
LLM_ENDPOINT=https://api.anthropic.com/v1 # when proxied through a compat layer
|
||||
|
||||
Set:
|
||||
LLM_ENDPOINT — base URL (required)
|
||||
LLM_KEY — bearer token (optional; local inference usually doesn't need it)
|
||||
LLM_MODEL — model name (required), e.g. "gpt-4o-mini", "llama3:8b", "qwen2.5:7b"
|
||||
|
||||
Or pass flags on the CLI (flags win over env):
|
||||
|
||||
python -m mempalace.closet_llm \\
|
||||
--palace ~/.mempalace/palace \\
|
||||
--endpoint http://localhost:11434/v1 \\
|
||||
--model llama3:8b
|
||||
|
||||
No vendor lock-in. No hidden dependency on any specific provider. Zero deps
|
||||
added to pyproject — uses stdlib urllib.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from .palace import (
|
||||
NORMALIZE_VERSION,
|
||||
get_closets_collection,
|
||||
get_collection,
|
||||
mine_lock,
|
||||
purge_file_closets,
|
||||
upsert_closet_lines,
|
||||
)
|
||||
|
||||
MAX_CONTENT_CHARS = 30000
|
||||
MAX_OUTPUT_TOKENS = 1500
|
||||
HTTP_TIMEOUT_S = 60
|
||||
|
||||
PROMPT_TEMPLATE = """You are reading content filed in a memory palace. Generate a
|
||||
topic-dense index that will be used to find this content later when someone searches.
|
||||
|
||||
Source: {source_file}
|
||||
Wing: {wing} | Room: {room}
|
||||
|
||||
CONTENT:
|
||||
{content}
|
||||
|
||||
---
|
||||
|
||||
Output a JSON object with EXACTLY these fields:
|
||||
|
||||
{{
|
||||
"topics": ["distinctive_word_or_phrase_1", "topic_2", ...],
|
||||
"quotes": ["[Speaker] verbatim quote", ...],
|
||||
"summary": "2-3 sentences describing what this content is about."
|
||||
}}
|
||||
|
||||
RULES:
|
||||
- Topics: 8-15 entries. Include proper nouns (names, places, projects),
|
||||
distinctive technical terms, and key concepts. NOT generic words like
|
||||
"conversation" or "discussion".
|
||||
- Quotes: 2-5 entries. EXACT verbatim from the content, not paraphrased.
|
||||
Attribute with [Speaker] prefix if speaker is identifiable.
|
||||
- Summary: mention WHO, WHAT, and WHY. No filler.
|
||||
- Write in the same language as the content.
|
||||
- Output valid JSON only. No code fences. No commentary.
|
||||
"""
|
||||
|
||||
|
||||
class LLMConfig:
    """Connection settings for the LLM endpoint; explicit arguments beat env vars."""

    def __init__(
        self,
        endpoint: Optional[str] = None,
        key: Optional[str] = None,
        model: Optional[str] = None,
    ):
        env = os.environ
        base_url = endpoint if endpoint else env.get("LLM_ENDPOINT", "")
        # Trailing slashes are dropped so the request path can be appended cleanly.
        self.endpoint = base_url.rstrip("/")
        self.key = key if key else env.get("LLM_KEY", "")
        self.model = model if model else env.get("LLM_MODEL", "")

    def missing(self) -> list:
        """List the required settings that are still unset (empty when complete)."""
        # The key is deliberately absent: local inference servers
        # (Ollama, vLLM) often don't require one.
        required = (
            ("LLM_ENDPOINT (or --endpoint)", self.endpoint),
            ("LLM_MODEL (or --model)", self.model),
        )
        return [label for label, value in required if not value]
|
||||
|
||||
|
||||
def _call_llm(cfg: LLMConfig, source_file: str, wing: str, room: str, content: str):
    """Single LLM call via OpenAI-compatible /chat/completions.

    The prompt is built from PROMPT_TEMPLATE with the source name capped at
    100 characters and the content capped at MAX_CONTENT_CHARS. The request
    is a POST to ``{cfg.endpoint}/chat/completions``; a bearer header is
    sent only when ``cfg.key`` is set.

    Up to three attempts are made. HTTP 429/503 (and other errors whose
    message mentions "rate") are retried with 1s then 2s backoff; every
    other HTTP, network or parsing failure gives up immediately.

    Returns (parsed_json_dict_or_None, usage_dict_or_None). A reply that is
    valid JSON but not an object is treated as a failure, so callers can
    rely on ``parsed`` being a dict whenever it is not None.
    """
    try:
        from mempalace.i18n import t

        lang_instruction = t("aaak.instruction")
    except Exception:
        # Localisation is optional here; fall back to no extra instruction.
        lang_instruction = ""

    prompt = PROMPT_TEMPLATE.format(
        source_file=source_file[:100],
        wing=wing,
        room=room,
        content=content[:MAX_CONTENT_CHARS],
    )
    if lang_instruction and "english" not in lang_instruction.lower():
        prompt += f"\n\nLanguage instruction: {lang_instruction}"

    body = json.dumps(
        {
            "model": cfg.model,
            "max_tokens": MAX_OUTPUT_TOKENS,
            "messages": [{"role": "user", "content": prompt}],
        }
    ).encode("utf-8")

    headers = {"Content-Type": "application/json"}
    if cfg.key:
        headers["Authorization"] = f"Bearer {cfg.key}"

    url = f"{cfg.endpoint}/chat/completions"

    for attempt in range(3):
        try:
            req = urllib.request.Request(url, data=body, headers=headers, method="POST")
            with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT_S) as resp:
                raw = resp.read().decode("utf-8")
            payload = json.loads(raw)

            text = payload["choices"][0]["message"]["content"].strip()
            # Models often wrap JSON in a markdown fence despite the prompt.
            text = re.sub(r"^```(?:json)?\s*", "", text)
            text = re.sub(r"\s*```$", "", text)
            parsed = json.loads(text)
        except json.JSONDecodeError:
            return None, None
        except urllib.error.HTTPError as e:
            # 429 / 503 = retry with backoff
            if e.code in (429, 503) and attempt < 2:
                time.sleep(2**attempt)
                continue
            return None, None
        except Exception as e:
            if "rate" in str(e).lower() and attempt < 2:
                time.sleep(2**attempt)
                continue
            return None, None
        else:
            # json.loads accepts any JSON value; only an object satisfies the
            # documented contract (callers go on to call parsed.get(...)).
            if not isinstance(parsed, dict):
                return None, None
            usage = payload.get("usage")
            return parsed, usage if isinstance(usage, dict) else None
    return None, None
|
||||
|
||||
|
||||
def _parsed_to_closet_lines(parsed, drawer_ids, entities_str):
    """Convert LLM's JSON output to closet pointer lines.

    Each line has the form ``<text>|<entities_str>|→<drawer refs>``, where
    the refs are the first three *drawer_ids* joined by commas. Lines are
    emitted for up to 15 topics, then up to 5 quotes, then the summary
    (truncated to 200 characters) when one is present.

    The LLM reply is untrusted: a missing, null or non-list ``topics`` /
    ``quotes`` field contributes no lines, and a non-dict *parsed* yields an
    empty list instead of raising.
    """
    if not isinstance(parsed, dict):
        return []

    drawer_ref = ",".join(drawer_ids[:3])

    def _entries(field, cap):
        # Only a real list is usable: null would fail on slicing and a bare
        # string would be split into one line per character.
        value = parsed.get(field)
        return value[:cap] if isinstance(value, list) else []

    lines = [f"{topic}|{entities_str}|→{drawer_ref}" for topic in _entries("topics", 15)]
    lines.extend(f"{quote}|{entities_str}|→{drawer_ref}" for quote in _entries("quotes", 5))

    summary = parsed.get("summary")
    if summary:
        # str() keeps the 200-character cap meaningful for non-string values.
        lines.append(f"{str(summary)[:200]}|{entities_str}|→{drawer_ref}")

    return lines
|
||||
|
||||
|
||||
def regenerate_closets(
    palace_path,
    wing=None,
    sample=0,
    dry_run=False,
    cfg: Optional[LLMConfig] = None,
):
    """Rebuild closets with the configured LLM for richer topic extraction.

    Drawers are read from the palace and grouped by source file (optionally
    restricted to *wing* and to the first *sample* sources). Each group's
    text is sent to the endpoint; on success the source's existing closets
    are purged and replaced by the LLM-derived lines. When a call fails the
    source is counted as failed and its current closets are left alone.

    Returns a dict: an error description when configuration is missing,
    ``{"processed": 0}`` for an empty palace, otherwise processed/failed
    counts plus token totals. With *dry_run* the work list is printed and
    nothing is sent or written.
    """
    cfg = LLMConfig() if cfg is None else cfg
    absent = cfg.missing()
    if absent:
        print("Error: missing configuration: " + ", ".join(absent))
        print("Set env vars LLM_ENDPOINT / LLM_MODEL (and optionally LLM_KEY),")
        print("or pass --endpoint / --model / --key on the CLI.")
        return {"error": "missing-config", "missing": absent}

    drawers_col = get_collection(palace_path, create=False)
    closets_col = get_closets_collection(palace_path)

    drawer_count = drawers_col.count()
    if drawer_count == 0:
        print("No drawers in palace.")
        return {"processed": 0}

    snapshot = drawers_col.get(limit=drawer_count, include=["documents", "metadatas"])

    # Group drawers per source file; the first drawer's metadata represents the group.
    grouped = {}
    rows = zip(snapshot["ids"], snapshot["documents"], snapshot["metadatas"])
    for drawer_id, text, md in rows:
        src = md.get("source_file", "unknown")
        if wing and md.get("wing", "") != wing:
            continue
        bucket = grouped.setdefault(src, {"drawer_ids": [], "content": [], "meta": md})
        bucket["drawer_ids"].append(drawer_id)
        bucket["content"].append(text)

    sources = list(grouped)
    if sample > 0:
        sources = sources[:sample]

    print(
        f"Regenerating closets for {len(sources)} source files via {cfg.endpoint} ({cfg.model})..."
    )
    if dry_run:
        print("DRY RUN — no changes will be written")

    processed = failed = 0
    total_input = total_output = 0

    for i, source in enumerate(sources, 1):
        group = grouped[source]
        content = "\n\n".join(group["content"])
        md = group["meta"]
        w = md.get("wing", "")
        r = md.get("room", "")
        entities = md.get("entities", "")
        # basename (not a split on "/") so Windows-style paths are handled too.
        label = os.path.basename(source)

        if dry_run:
            print(f" [{i}/{len(sources)}] {label} ({len(content)} chars)")
            continue

        parsed, usage = _call_llm(cfg, source, w, r, content)
        if not parsed:
            failed += 1
            print(f" [{i}/{len(sources)}] ✗ {label} — LLM failed")
            continue

        if usage:
            total_input += usage.get("prompt_tokens", 0)
            total_output += usage.get("completion_tokens", 0)

        lines = _parsed_to_closet_lines(parsed, group["drawer_ids"], entities)
        closet_id_base = f"closet_{w}_{r}_{label[:30]}"
        closet_meta = {
            "wing": w,
            "room": r,
            "source_file": source,
            "generated_by": f"llm:{cfg.model}",
            "filed_at": datetime.now().isoformat(),
            "entities": entities,
            # Version stamp keeps the miner's stale-drawer gate from treating
            # these closets as leftovers and rebuilding over them.
            "normalize_version": NORMALIZE_VERSION,
        }

        # Hold the per-source lock across purge + upsert so a concurrent mine
        # of the same file cannot interleave and leave mixed regex/LLM lines.
        with mine_lock(source):
            purge_file_closets(closets_col, source)
            upsert_closet_lines(closets_col, closet_id_base, lines, closet_meta)

        processed += 1
        n_topics = len(parsed.get("topics", []))
        print(f" [{i}/{len(sources)}] ✓ {label} — {n_topics} topics")

    print(f"\nDone. {processed} regenerated, {failed} failed.")
    if total_input or total_output:
        print(f"Tokens: {total_input:,} in + {total_output:,} out (cost depends on provider)")

    return {
        "processed": processed,
        "failed": failed,
        "input_tokens": total_input,
        "output_tokens": total_output,
    }
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(
        description="Regenerate closets via a user-configured LLM (OpenAI-compatible API)"
    )
    cli.add_argument(
        "--palace",
        default=os.path.expanduser("~/.mempalace/palace"),
        help="Path to the palace",
    )
    cli.add_argument("--wing", default=None, help="Limit to one wing")
    cli.add_argument("--sample", type=int, default=0, help="Only process first N source files")
    cli.add_argument("--dry-run", action="store_true", help="List work without calling the LLM")

    # The three connection flags share one shape: optional, default None so
    # that LLMConfig falls back to the matching environment variable.
    for flag, help_text in (
        ("--endpoint", "LLM base URL (overrides $LLM_ENDPOINT), e.g. http://localhost:11434/v1"),
        ("--key", "LLM bearer token (overrides $LLM_KEY). Optional for local inference."),
        ("--model", 'LLM model name (overrides $LLM_MODEL), e.g. "gpt-4o-mini" or "llama3:8b"'),
    ):
        cli.add_argument(flag, default=None, help=help_text)

    opts = cli.parse_args()
    regenerate_closets(
        opts.palace,
        wing=opts.wing,
        sample=opts.sample,
        dry_run=opts.dry_run,
        cfg=LLMConfig(endpoint=opts.endpoint, key=opts.key, model=opts.model),
    )
|
||||
@@ -47,6 +47,30 @@ def sanitize_name(value: str, field_name: str = "name") -> str:
|
||||
return value
|
||||
|
||||
|
||||
def sanitize_kg_value(value: str, field_name: str = "value") -> str:
    """Check a knowledge-graph entity name (subject or object) and return it stripped.

    Looser than sanitize_name: punctuation such as commas, colons and
    parentheses, common in natural-language KG values, is accepted. The
    only rejections are empty/non-string input, strings longer than
    MAX_NAME_LENGTH after stripping, and strings containing a null byte;
    each raises ValueError naming *field_name*.

    Not meant for wing/room names (filesystem constraints apply there) or
    for predicates (which should stay simple relationship identifiers).
    """
    if not isinstance(value, str):
        raise ValueError(f"{field_name} must be a non-empty string")

    cleaned = value.strip()
    if not cleaned:
        raise ValueError(f"{field_name} must be a non-empty string")
    if len(cleaned) > MAX_NAME_LENGTH:
        raise ValueError(f"{field_name} exceeds maximum length of {MAX_NAME_LENGTH} characters")
    if "\x00" in cleaned:
        raise ValueError(f"{field_name} contains null bytes")
    return cleaned
|
||||
|
||||
|
||||
def sanitize_content(value: str, max_length: int = 100_000) -> str:
|
||||
"""Validate drawer/diary content length."""
|
||||
if not isinstance(value, str) or not value.strip():
|
||||
@@ -173,6 +197,42 @@ class MempalaceConfig:
|
||||
"""Mapping of hall names to keyword lists."""
|
||||
return self._file_config.get("hall_keywords", DEFAULT_HALL_KEYWORDS)
|
||||
|
||||
@property
def entity_languages(self):
    """Language codes whose entity-detection patterns should be applied.

    Lookup order: the ``MEMPALACE_ENTITY_LANGUAGES`` env var (then the
    ``MEMPAL_ENTITY_LANGUAGES`` spelling), read as a comma-separated list;
    then a non-empty list under ``entity_languages`` in ``config.json``;
    finally ``["en"]``.
    """
    raw = os.environ.get("MEMPALACE_ENTITY_LANGUAGES") or os.environ.get(
        "MEMPAL_ENTITY_LANGUAGES"
    )
    if raw:
        codes = [part.strip() for part in raw.split(",") if part.strip()]
        # An env value made only of separators/blanks still means "default".
        return codes if codes else ["en"]

    stored = self._file_config.get("entity_languages")
    if stored and isinstance(stored, list):
        return list(map(str, stored))
    return ["en"]
|
||||
|
||||
def set_entity_languages(self, languages):
    """Store the entity-detection language list in ``config.json``.

    Entries are stripped and blank ones dropped; an empty result becomes
    ``["en"]``. The in-memory config is updated first, then the file is
    written and its mode set to 0o600, both best-effort (OS errors are
    ignored). Returns the normalized list.
    """
    cleaned = []
    for code in languages:
        if code and code.strip():
            cleaned.append(code.strip())
    if not cleaned:
        cleaned = ["en"]

    self._file_config["entity_languages"] = cleaned
    self._config_dir.mkdir(parents=True, exist_ok=True)

    try:
        with open(self._config_file, "w", encoding="utf-8") as handle:
            json.dump(self._file_config, handle, indent=2, ensure_ascii=False)
    except OSError:
        # Persisting is best-effort; the in-memory value is already updated.
        pass

    try:
        self._config_file.chmod(0o600)
    except (OSError, NotImplementedError):
        # Not every platform/filesystem can restrict permissions.
        pass

    return cleaned
|
||||
|
||||
@property
|
||||
def hook_silent_save(self):
|
||||
"""Whether the stop hook saves directly (True) or blocks for MCP calls (False)."""
|
||||
@@ -227,4 +287,8 @@ class MempalaceConfig:
|
||||
self._config_dir.mkdir(parents=True, exist_ok=True)
|
||||
with open(self._people_map_file, "w") as f:
|
||||
json.dump(people_map, f, indent=2)
|
||||
try:
|
||||
self._people_map_file.chmod(0o600)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
return self._people_map_file
|
||||
|
||||
+95
-29
@@ -16,7 +16,33 @@ from datetime import datetime
|
||||
from collections import defaultdict
|
||||
|
||||
from .normalize import normalize
|
||||
from .palace import SKIP_DIRS, get_collection, file_already_mined
|
||||
from .palace import (
|
||||
NORMALIZE_VERSION,
|
||||
SKIP_DIRS,
|
||||
file_already_mined,
|
||||
get_collection,
|
||||
mine_lock,
|
||||
)
|
||||
|
||||
|
||||
# Cached hall keywords — avoids re-reading config per drawer
_HALL_KEYWORDS_CACHE = None


def _detect_hall_cached(content: str) -> str:
    """Pick the hall whose keywords match *content* best, using the cached keyword map.

    The keyword map is loaded from MempalaceConfig on first use and then
    reused. Only the first 3000 characters, lower-cased, are examined. A
    hall scores one point per keyword found as a substring; the highest
    positive score wins (earliest hall in the map on ties) and
    ``"general"`` is returned when nothing matches.
    """
    global _HALL_KEYWORDS_CACHE
    if _HALL_KEYWORDS_CACHE is None:
        from .config import MempalaceConfig

        _HALL_KEYWORDS_CACHE = MempalaceConfig().hall_keywords

    haystack = content[:3000].lower()
    winner, top_score = "general", 0
    for hall, keywords in _HALL_KEYWORDS_CACHE.items():
        hits = sum(1 for kw in keywords if kw in haystack)
        # Strict comparison keeps the earliest hall on equal scores.
        if hits > top_score:
            winner, top_score = hall, hits
    return winner
|
||||
|
||||
|
||||
# File types that might contain conversations
|
||||
@@ -51,6 +77,7 @@ def _register_file(collection, source_file: str, wing: str, agent: str):
|
||||
"added_by": agent,
|
||||
"filed_at": datetime.now().isoformat(),
|
||||
"ingest_mode": "registry",
|
||||
"normalize_version": NORMALIZE_VERSION,
|
||||
}
|
||||
],
|
||||
)
|
||||
@@ -272,6 +299,63 @@ def scan_convos(convo_dir: str) -> list:
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extract_mode):
    """Under the source file's lock: purge its old drawers, then upsert *chunks*.

    The lock (mine_lock) serializes agents working on the same file; the
    purge-before-insert step keeps drawers written under an older normalize
    version from surviving next to the new ones.

    Returns ``(drawers_added, room_counts_delta, skipped)``. *skipped* is
    True when, once the lock is held, the file turns out to be already
    mined; in that case nothing is written. ``room_counts_delta`` is only
    populated in ``"general"`` extract mode, where each chunk may carry its
    own room in ``memory_type``.
    """
    delta: dict = defaultdict(int)
    added = 0
    per_chunk_rooms = extract_mode == "general"

    with mine_lock(source_file):
        # Check again now that we hold the lock: another agent may have
        # completed this file meanwhile. A stale-version record does not
        # count as mined, so such files continue to the rebuild below.
        if file_already_mined(collection, source_file):
            return 0, delta, True

        # Best-effort removal of whatever this source left behind, so old-
        # and new-schema drawers never coexist for one file.
        try:
            collection.delete(where={"source_file": source_file})
        except Exception:
            pass

        for chunk in chunks:
            if per_chunk_rooms:
                target_room = chunk.get("memory_type", room)
                delta[target_room] += 1
            else:
                target_room = room

            digest = hashlib.sha256((source_file + str(chunk["chunk_index"])).encode()).hexdigest()
            drawer_id = f"drawer_{wing}_{target_room}_{digest[:24]}"
            try:
                metadata = {
                    "wing": wing,
                    "room": target_room,
                    "hall": _detect_hall_cached(chunk["content"]),
                    "source_file": source_file,
                    "chunk_index": chunk["chunk_index"],
                    "added_by": agent,
                    "filed_at": datetime.now().isoformat(),
                    "ingest_mode": "convos",
                    "extract_mode": extract_mode,
                    "normalize_version": NORMALIZE_VERSION,
                }
                collection.upsert(
                    documents=[chunk["content"]],
                    ids=[drawer_id],
                    metadatas=[metadata],
                )
            except Exception as e:
                # Duplicate-id reports are tolerated; anything else propagates.
                if "already exists" not in str(e).lower():
                    raise
            else:
                added += 1

    return added, delta, False
|
||||
|
||||
|
||||
def mine_convos(
|
||||
convo_dir: str,
|
||||
palace_path: str,
|
||||
@@ -375,34 +459,16 @@ def mine_convos(
|
||||
if extract_mode != "general":
|
||||
room_counts[room] += 1
|
||||
|
||||
# File each chunk
|
||||
drawers_added = 0
|
||||
for chunk in chunks:
|
||||
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
|
||||
if extract_mode == "general":
|
||||
room_counts[chunk_room] += 1
|
||||
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
|
||||
try:
|
||||
collection.upsert(
|
||||
documents=[chunk["content"]],
|
||||
ids=[drawer_id],
|
||||
metadatas=[
|
||||
{
|
||||
"wing": wing,
|
||||
"room": chunk_room,
|
||||
"source_file": source_file,
|
||||
"chunk_index": chunk["chunk_index"],
|
||||
"added_by": agent,
|
||||
"filed_at": datetime.now().isoformat(),
|
||||
"ingest_mode": "convos",
|
||||
"extract_mode": extract_mode,
|
||||
}
|
||||
],
|
||||
)
|
||||
drawers_added += 1
|
||||
except Exception as e:
|
||||
if "already exists" not in str(e).lower():
|
||||
raise
|
||||
# Lock + purge stale + file fresh chunks. Lock serializes concurrent
|
||||
# agents; purge removes pre-v2 drawers so the schema bump applies.
|
||||
drawers_added, room_delta, skipped = _file_chunks_locked(
|
||||
collection, source_file, chunks, wing, room, agent, extract_mode
|
||||
)
|
||||
if skipped:
|
||||
files_skipped += 1
|
||||
continue
|
||||
for r, n in room_delta.items():
|
||||
room_counts[r] += n
|
||||
|
||||
total_drawers += drawers_added
|
||||
print(f" ✓ [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers_added}")
|
||||
|
||||
+3
-5
@@ -27,7 +27,7 @@ import os
|
||||
import time
|
||||
from collections import defaultdict
|
||||
|
||||
import chromadb
|
||||
from .backends.chroma import ChromaBackend
|
||||
|
||||
|
||||
COLLECTION_NAME = "mempalace_drawers"
|
||||
@@ -130,8 +130,7 @@ def dedup_source_group(col, drawer_ids, threshold=DEFAULT_THRESHOLD, dry_run=Tru
|
||||
def show_stats(palace_path=None):
|
||||
"""Show duplication statistics without making changes."""
|
||||
palace_path = palace_path or _get_palace_path()
|
||||
client = chromadb.PersistentClient(path=palace_path)
|
||||
col = client.get_collection(COLLECTION_NAME)
|
||||
col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
|
||||
|
||||
groups = get_source_groups(col)
|
||||
|
||||
@@ -163,8 +162,7 @@ def dedup_palace(
|
||||
print(" MemPalace Deduplicator")
|
||||
print(f"{'=' * 55}")
|
||||
|
||||
client = chromadb.PersistentClient(path=palace_path)
|
||||
col = client.get_collection(COLLECTION_NAME)
|
||||
col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
|
||||
|
||||
print(f" Palace: {palace_path}")
|
||||
print(f" Drawers: {col.count():,}")
|
||||
|
||||
@@ -158,6 +158,8 @@ _FLAG_SIGNALS = {
|
||||
}
|
||||
|
||||
# Common filler/stop words to strip from topic extraction
|
||||
_ALPHA_RE = re.compile(r"[^a-zA-Z]")
|
||||
|
||||
_STOP_WORDS = {
|
||||
"the",
|
||||
"a",
|
||||
@@ -360,7 +362,7 @@ class Dialect:
|
||||
return cls(
|
||||
entities=config.get("entities", {}),
|
||||
skip_names=config.get("skip_names", []),
|
||||
lang=config.get("lang"),
|
||||
lang=config.get("lang", "en"),
|
||||
)
|
||||
|
||||
def save_config(self, config_path: str):
|
||||
@@ -541,7 +543,7 @@ class Dialect:
|
||||
# Fallback: find capitalized words that look like names (2+ chars, not sentence-start)
|
||||
words = text.split()
|
||||
for i, w in enumerate(words):
|
||||
clean = re.sub(r"[^a-zA-Z]", "", w)
|
||||
clean = _ALPHA_RE.sub("", w)
|
||||
if (
|
||||
len(clean) >= 2
|
||||
and clean[0].isupper()
|
||||
|
||||
@@ -0,0 +1,209 @@
|
||||
"""
|
||||
diary_ingest.py — Ingest daily summary files into the palace.
|
||||
|
||||
Architecture:
|
||||
- ONE drawer per (wing, day) — full verbatim content, upserted as the day grows.
|
||||
- Closets pack topics up to CLOSET_CHAR_LIMIT, never split mid-topic.
|
||||
- A forced re-ingest (``--force``) purges the file's existing closets before
|
||||
rebuilding, so a shorter day never leaves orphans behind.
|
||||
- Only new entries are processed by default (tracks entry count in a state
|
||||
file under ``~/.mempalace/state/`` — never inside the user's diary dir).
|
||||
- Per-file ``mine_lock`` so concurrent ingest from two terminals can't race.
|
||||
- Entities extracted and stamped on metadata for filterable search.
|
||||
|
||||
Usage:
|
||||
python -m mempalace.diary_ingest --dir ~/daily_summaries --palace ~/.mempalace/palace
|
||||
python -m mempalace.diary_ingest --dir ~/daily_summaries --palace ~/.mempalace/palace --force
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from .miner import _extract_entities_for_metadata
|
||||
from .palace import (
|
||||
build_closet_lines,
|
||||
get_closets_collection,
|
||||
get_collection,
|
||||
mine_lock,
|
||||
purge_file_closets,
|
||||
upsert_closet_lines,
|
||||
)
|
||||
|
||||
DIARY_ENTRY_RE = re.compile(r"^## .+", re.MULTILINE)
|
||||
|
||||
|
||||
def _state_file_for(palace_path: str, diary_dir: Path) -> Path:
    """Locate the ingest state file for one (palace, diary directory) pair.

    The file lives under ``~/.mempalace/state`` (created on demand), never in
    the diary directory itself. Its name embeds the first 24 hex characters
    of ``sha256("<palace_path>|<diary_dir>")``, so each distinct pair maps to
    its own file.
    """
    hasher = hashlib.sha256()
    hasher.update(f"{palace_path}|{diary_dir}".encode())
    fingerprint = hasher.hexdigest()[:24]

    directory = Path(os.path.expanduser("~"), ".mempalace", "state")
    directory.mkdir(parents=True, exist_ok=True)
    return directory.joinpath(f"diary_ingest_{fingerprint}.json")
|
||||
|
||||
|
||||
def _split_entries(text):
    """Break diary text into ``(header, body)`` tuples, one per ``## `` entry.

    Anything before the first header is discarded. Both the header and the
    body are stripped of surrounding whitespace.
    """
    found_headers = DIARY_ENTRY_RE.findall(text)
    # split() returns the preamble first, then exactly one body per header.
    bodies = DIARY_ENTRY_RE.split(text)[1:]
    return [(head.strip(), body.strip()) for head, body in zip(found_headers, bodies)]
|
||||
|
||||
|
||||
def _diary_drawer_id(wing: str, date_str: str) -> str:
    """Build the stable drawer ID for one ``(wing, date)`` pair.

    The wing is part of the hashed key, so two diaries (e.g. 'work' vs
    'personal') that share a date get different IDs.
    """
    key_bytes = f"{wing}|{date_str}".encode()
    short_digest = hashlib.sha256(key_bytes).hexdigest()[:24]
    return "drawer_diary_" + short_digest
|
||||
|
||||
|
||||
def _diary_closet_id_base(wing: str, date_str: str) -> str:
    """Build the closet ID prefix for one ``(wing, date)`` pair.

    Uses the same ``sha256("<wing>|<date>")[:24]`` suffix as the drawer ID,
    under a ``closet_diary_`` prefix.
    """
    key_bytes = f"{wing}|{date_str}".encode()
    short_digest = hashlib.sha256(key_bytes).hexdigest()[:24]
    return "closet_diary_" + short_digest
|
||||
|
||||
|
||||
def _load_diary_state(state_file, force):
    """Return the persisted per-file ingest state, or ``{}`` if unusable.

    A forced run, a missing file, unreadable or corrupt JSON, or a top-level
    value that is not an object all yield an empty state. Per-file records
    that are not objects are dropped so later ``.get`` calls cannot fail.
    """
    if force or not state_file.exists():
        return {}
    try:
        loaded = json.loads(state_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {}
    if not isinstance(loaded, dict):
        return {}
    return {key: rec for key, rec in loaded.items() if isinstance(rec, dict)}


def ingest_diaries(
    diary_dir,
    palace_path,
    wing="diary",
    force=False,
):
    """Ingest daily summary files into the palace.

    Each ``YYYY-MM-DD*.md`` file gets ONE drawer keyed by ``(wing, date)``
    holding the full file text, plus closets built from its ``## `` entries.
    ``force=True`` ignores saved state, purges the file's existing closets
    and rebuilds every entry; the default incremental mode only builds
    closets for entries appended since the last run.

    Args:
        diary_dir: Directory containing the ``*.md`` diary files.
        palace_path: Palace location passed to the collection getters.
        wing: Wing name stamped on metadata and mixed into the IDs.
        force: Rebuild everything regardless of saved state.

    Returns:
        ``{"days_updated": int, "closets_created": int}``.
    """
    diary_dir = Path(diary_dir).expanduser().resolve()
    if not diary_dir.exists():
        print(f"Diary directory not found: {diary_dir}")
        return {"days_updated": 0, "closets_created": 0}

    diary_files = sorted(diary_dir.glob("*.md"))
    if not diary_files:
        print(f"No .md files in {diary_dir}")
        return {"days_updated": 0, "closets_created": 0}

    state_file = _state_file_for(str(palace_path), diary_dir)
    state = _load_diary_state(state_file, force)

    drawers_col = get_collection(palace_path)
    closets_col = get_closets_collection(palace_path)

    days_updated = 0
    closets_created = 0

    for diary_path in diary_files:
        text = diary_path.read_text(encoding="utf-8", errors="replace")
        if len(text.strip()) < 50:
            continue

        date_match = re.match(r"(\d{4}-\d{2}-\d{2})", diary_path.stem)
        if not date_match:
            continue
        date_str = date_match.group(1)

        # Skip if content hasn't changed. Prefer the stored content hash so
        # a same-length edit is still detected; records written before the
        # hash existed fall back to the size comparison.
        state_key = f"{wing}|{diary_path.name}"
        prev = state.get(state_key) or {}
        curr_size = len(text)
        curr_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
        if not force:
            prev_hash = prev.get("content_hash")
            if prev_hash is not None:
                unchanged = prev_hash == curr_hash
            else:
                unchanged = prev.get("size", 0) == curr_size
            if unchanged:
                continue

        now_iso = datetime.now(timezone.utc).isoformat()
        drawer_id = _diary_drawer_id(wing, date_str)
        entities = _extract_entities_for_metadata(text)
        source_file = str(diary_path)

        # Serialize per source — two terminals running ingest at once must
        # not interleave the upsert + closet-rebuild.
        with mine_lock(source_file):
            drawer_meta = {
                "date": date_str,
                "wing": wing,
                "room": "daily",
                "source_file": source_file,
                "source_session": "daily_diary",
                "filed_at": now_iso,
            }
            if entities:
                drawer_meta["entities"] = entities
            drawers_col.upsert(
                documents=[text],
                ids=[drawer_id],
                metadatas=[drawer_meta],
            )

            # On a force rebuild, wipe the file's existing closets up front,
            # even when the rebuilt day yields no closet lines; otherwise
            # closets from a longer prior run would be left orphaned.
            if force:
                purge_file_closets(closets_col, source_file)

            entries = _split_entries(text)
            prev_entry_count = prev.get("entry_count", 0)
            if not isinstance(prev_entry_count, int) or prev_entry_count < 0:
                # A damaged counter must not break slicing; reprocess all.
                prev_entry_count = 0
            new_entries = entries if force else entries[prev_entry_count:]

            all_lines = []
            for header, body in new_entries:
                entry_text = f"{header}\n{body}"
                all_lines.extend(
                    build_closet_lines(source_file, [drawer_id], entry_text, wing, "daily")
                )

            if all_lines:
                closet_id_base = _diary_closet_id_base(wing, date_str)
                closet_meta = {
                    "date": date_str,
                    "wing": wing,
                    "room": "daily",
                    "source_file": source_file,
                    "filed_at": now_iso,
                }
                if entities:
                    closet_meta["entities"] = entities
                n = upsert_closet_lines(closets_col, closet_id_base, all_lines, closet_meta)
                closets_created += n

        state[state_key] = {
            "size": curr_size,
            "content_hash": curr_hash,
            "entry_count": len(entries),
            "ingested_at": now_iso,
        }
        days_updated += 1

    state_file.write_text(json.dumps(state, indent=2), encoding="utf-8")
    if days_updated:
        print(f"Diary: {days_updated} days updated, {closets_created} new closets")

    return {"days_updated": days_updated, "closets_created": closets_created}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command-line entry point: parse the flags and run one ingest pass.
    cli = argparse.ArgumentParser(description="Ingest daily summaries into the palace")
    cli.add_argument("--dir", required=True, help="Path to daily_summaries directory")
    cli.add_argument("--palace", default=os.path.expanduser("~/.mempalace/palace"))
    cli.add_argument("--wing", default="diary")
    cli.add_argument("--force", action="store_true")
    opts = cli.parse_args()

    ingest_diaries(opts.dir, opts.palace, wing=opts.wing, force=opts.force)
|
||||
+152
-415
@@ -9,391 +9,70 @@ Two-pass approach:
|
||||
Used by mempalace init before mining begins.
|
||||
The confirmed entity map feeds the miner as the taxonomy.
|
||||
|
||||
Multi-language support:
|
||||
All lexical patterns (person verbs, pronouns, dialogue markers, project
|
||||
verbs, stopwords, and the candidate-extraction character class) live in
|
||||
the ``entity`` section of ``mempalace/i18n/<lang>.json``. Every public
|
||||
function accepts a ``languages`` tuple and applies the union of the
|
||||
requested locales' patterns. The default is ``("en",)`` — existing
|
||||
English-only callers behave exactly as before.
|
||||
|
||||
To add a new language: add an ``entity`` section to that locale's JSON.
|
||||
No code changes required.
|
||||
|
||||
Usage:
|
||||
from entity_detector import detect_entities, confirm_entities
|
||||
candidates = detect_entities(file_paths)
|
||||
from mempalace.entity_detector import detect_entities, confirm_entities
|
||||
candidates = detect_entities(file_paths) # English only
|
||||
candidates = detect_entities(paths, languages=("en", "pt-br"))
|
||||
confirmed = confirm_entities(candidates) # interactive review
|
||||
"""
|
||||
|
||||
import re
|
||||
import os
|
||||
import functools
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
from mempalace.i18n import get_entity_patterns
|
||||
|
||||
# ==================== SIGNAL PATTERNS ====================
|
||||
|
||||
# Person signals — things people do
|
||||
PERSON_VERB_PATTERNS = [
|
||||
r"\b{name}\s+said\b",
|
||||
r"\b{name}\s+asked\b",
|
||||
r"\b{name}\s+told\b",
|
||||
r"\b{name}\s+replied\b",
|
||||
r"\b{name}\s+laughed\b",
|
||||
r"\b{name}\s+smiled\b",
|
||||
r"\b{name}\s+cried\b",
|
||||
r"\b{name}\s+felt\b",
|
||||
r"\b{name}\s+thinks?\b",
|
||||
r"\b{name}\s+wants?\b",
|
||||
r"\b{name}\s+loves?\b",
|
||||
r"\b{name}\s+hates?\b",
|
||||
r"\b{name}\s+knows?\b",
|
||||
r"\b{name}\s+decided\b",
|
||||
r"\b{name}\s+pushed\b",
|
||||
r"\b{name}\s+wrote\b",
|
||||
r"\bhey\s+{name}\b",
|
||||
r"\bthanks?\s+{name}\b",
|
||||
r"\bhi\s+{name}\b",
|
||||
r"\bdear\s+{name}\b",
|
||||
]
|
||||
# ==================== LANGUAGE-AWARE PATTERN LOADING ====================
|
||||
|
||||
# Person signals — pronouns resolving nearby
|
||||
PRONOUN_PATTERNS = [
|
||||
r"\bshe\b",
|
||||
r"\bher\b",
|
||||
r"\bhers\b",
|
||||
r"\bhe\b",
|
||||
r"\bhim\b",
|
||||
r"\bhis\b",
|
||||
r"\bthey\b",
|
||||
r"\bthem\b",
|
||||
r"\btheir\b",
|
||||
]
|
||||
|
||||
# Person signals — dialogue markers
|
||||
DIALOGUE_PATTERNS = [
|
||||
r"^>\s*{name}[:\s]", # > Speaker: ...
|
||||
r"^{name}:\s", # Speaker: ...
|
||||
r"^\[{name}\]", # [Speaker]
|
||||
r'"{name}\s+said',
|
||||
]
|
||||
def _normalize_langs(languages) -> tuple:
    """Coerce a language input into a non-empty hashable tuple.

    Accepts ``None``, a single code string, or any iterable of codes.
    String codes are stripped of surrounding whitespace; empty codes and
    duplicates are dropped, keeping first-seen order, so equivalent inputs
    (``"en, hi"`` split on commas, ``("en", "en")``) give the same tuple.
    Falls back to ``("en",)`` when nothing usable remains.
    """
    if not languages:
        return ("en",)
    if isinstance(languages, str):
        languages = (languages,)

    cleaned = []
    for code in languages:
        if isinstance(code, str):
            code = code.strip()
        if code and code not in cleaned:
            cleaned.append(code)
    return tuple(cleaned) or ("en",)
|
||||
|
||||
# Project signals — things projects have/do
|
||||
PROJECT_VERB_PATTERNS = [
|
||||
r"\bbuilding\s+{name}\b",
|
||||
r"\bbuilt\s+{name}\b",
|
||||
r"\bship(?:ping|ped)?\s+{name}\b",
|
||||
r"\blaunch(?:ing|ed)?\s+{name}\b",
|
||||
r"\bdeploy(?:ing|ed)?\s+{name}\b",
|
||||
r"\binstall(?:ing|ed)?\s+{name}\b",
|
||||
r"\bthe\s+{name}\s+architecture\b",
|
||||
r"\bthe\s+{name}\s+pipeline\b",
|
||||
r"\bthe\s+{name}\s+system\b",
|
||||
r"\bthe\s+{name}\s+repo\b",
|
||||
r"\b{name}\s+v\d+\b", # MemPal v2
|
||||
r"\b{name}\.py\b", # mempalace.py
|
||||
r"\b{name}-core\b", # mempal-core (hyphen only, not underscore)
|
||||
r"\b{name}-local\b",
|
||||
r"\bimport\s+{name}\b",
|
||||
r"\bpip\s+install\s+{name}\b",
|
||||
]
|
||||
|
||||
# Words that are almost certainly NOT entities
|
||||
STOPWORDS = {
|
||||
"the",
|
||||
"a",
|
||||
"an",
|
||||
"and",
|
||||
"or",
|
||||
"but",
|
||||
"in",
|
||||
"on",
|
||||
"at",
|
||||
"to",
|
||||
"for",
|
||||
"of",
|
||||
"with",
|
||||
"by",
|
||||
"from",
|
||||
"as",
|
||||
"is",
|
||||
"was",
|
||||
"are",
|
||||
"were",
|
||||
"be",
|
||||
"been",
|
||||
"being",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"will",
|
||||
"would",
|
||||
"could",
|
||||
"should",
|
||||
"may",
|
||||
"might",
|
||||
"must",
|
||||
"shall",
|
||||
"can",
|
||||
"this",
|
||||
"that",
|
||||
"these",
|
||||
"those",
|
||||
"it",
|
||||
"its",
|
||||
"they",
|
||||
"them",
|
||||
"their",
|
||||
"we",
|
||||
"our",
|
||||
"you",
|
||||
"your",
|
||||
"i",
|
||||
"my",
|
||||
"me",
|
||||
"he",
|
||||
"she",
|
||||
"his",
|
||||
"her",
|
||||
"who",
|
||||
"what",
|
||||
"when",
|
||||
"where",
|
||||
"why",
|
||||
"how",
|
||||
"which",
|
||||
"if",
|
||||
"then",
|
||||
"so",
|
||||
"not",
|
||||
"no",
|
||||
"yes",
|
||||
"ok",
|
||||
"okay",
|
||||
"just",
|
||||
"very",
|
||||
"really",
|
||||
"also",
|
||||
"already",
|
||||
"still",
|
||||
"even",
|
||||
"only",
|
||||
"here",
|
||||
"there",
|
||||
"now",
|
||||
"then",
|
||||
"too",
|
||||
"up",
|
||||
"out",
|
||||
"about",
|
||||
"like",
|
||||
"use",
|
||||
"get",
|
||||
"got",
|
||||
"make",
|
||||
"made",
|
||||
"take",
|
||||
"put",
|
||||
"come",
|
||||
"go",
|
||||
"see",
|
||||
"know",
|
||||
"think",
|
||||
"true",
|
||||
"false",
|
||||
"none",
|
||||
"null",
|
||||
"new",
|
||||
"old",
|
||||
"all",
|
||||
"any",
|
||||
"some",
|
||||
"true",
|
||||
"false",
|
||||
"return",
|
||||
"print",
|
||||
"def",
|
||||
"class",
|
||||
"import",
|
||||
"from",
|
||||
# Common capitalized words in prose that aren't entities
|
||||
"step",
|
||||
"usage",
|
||||
"run",
|
||||
"check",
|
||||
"find",
|
||||
"add",
|
||||
"get",
|
||||
"set",
|
||||
"list",
|
||||
"args",
|
||||
"dict",
|
||||
"str",
|
||||
"int",
|
||||
"bool",
|
||||
"path",
|
||||
"file",
|
||||
"type",
|
||||
"name",
|
||||
"note",
|
||||
"example",
|
||||
"option",
|
||||
"result",
|
||||
"error",
|
||||
"warning",
|
||||
"info",
|
||||
"every",
|
||||
"each",
|
||||
"more",
|
||||
"less",
|
||||
"next",
|
||||
"last",
|
||||
"first",
|
||||
"second",
|
||||
"stack",
|
||||
"layer",
|
||||
"mode",
|
||||
"test",
|
||||
"stop",
|
||||
"start",
|
||||
"copy",
|
||||
"move",
|
||||
"source",
|
||||
"target",
|
||||
"output",
|
||||
"input",
|
||||
"data",
|
||||
"item",
|
||||
"key",
|
||||
"value",
|
||||
"returns",
|
||||
"raises",
|
||||
"yields",
|
||||
"none",
|
||||
"self",
|
||||
"cls",
|
||||
"kwargs",
|
||||
# Common sentence-starting / abstract words that aren't entities
|
||||
"world",
|
||||
"well",
|
||||
"want",
|
||||
"topic",
|
||||
"choose",
|
||||
"social",
|
||||
"cars",
|
||||
"phones",
|
||||
"healthcare",
|
||||
"ex",
|
||||
"machina",
|
||||
"deus",
|
||||
"human",
|
||||
"humans",
|
||||
"people",
|
||||
"things",
|
||||
"something",
|
||||
"nothing",
|
||||
"everything",
|
||||
"anything",
|
||||
"someone",
|
||||
"everyone",
|
||||
"anyone",
|
||||
"way",
|
||||
"time",
|
||||
"day",
|
||||
"life",
|
||||
"place",
|
||||
"thing",
|
||||
"part",
|
||||
"kind",
|
||||
"sort",
|
||||
"case",
|
||||
"point",
|
||||
"idea",
|
||||
"fact",
|
||||
"sense",
|
||||
"question",
|
||||
"answer",
|
||||
"reason",
|
||||
"number",
|
||||
"version",
|
||||
"system",
|
||||
# Greetings and filler words at sentence starts
|
||||
"hey",
|
||||
"hi",
|
||||
"hello",
|
||||
"thanks",
|
||||
"thank",
|
||||
"right",
|
||||
"let",
|
||||
"ok",
|
||||
# UI/action words that appear in how-to content
|
||||
"click",
|
||||
"hit",
|
||||
"press",
|
||||
"tap",
|
||||
"drag",
|
||||
"drop",
|
||||
"open",
|
||||
"close",
|
||||
"save",
|
||||
"load",
|
||||
"launch",
|
||||
"install",
|
||||
"download",
|
||||
"upload",
|
||||
"scroll",
|
||||
"select",
|
||||
"enter",
|
||||
"submit",
|
||||
"cancel",
|
||||
"confirm",
|
||||
"delete",
|
||||
"copy",
|
||||
"paste",
|
||||
"type",
|
||||
"write",
|
||||
"read",
|
||||
"search",
|
||||
"find",
|
||||
"show",
|
||||
"hide",
|
||||
# Common filesystem/technical capitalized words
|
||||
"desktop",
|
||||
"documents",
|
||||
"downloads",
|
||||
"users",
|
||||
"home",
|
||||
"library",
|
||||
"applications",
|
||||
"system",
|
||||
"preferences",
|
||||
"settings",
|
||||
"terminal",
|
||||
# Abstract/topic words
|
||||
"actor",
|
||||
"vector",
|
||||
"remote",
|
||||
"control",
|
||||
"duration",
|
||||
"fetch",
|
||||
# Abstract concepts that appear as subjects but aren't entities
|
||||
"agents",
|
||||
"tools",
|
||||
"others",
|
||||
"guards",
|
||||
"ethics",
|
||||
"regulation",
|
||||
"learning",
|
||||
"thinking",
|
||||
"memory",
|
||||
"language",
|
||||
"intelligence",
|
||||
"technology",
|
||||
"society",
|
||||
"culture",
|
||||
"future",
|
||||
"history",
|
||||
"science",
|
||||
"model",
|
||||
"models",
|
||||
"network",
|
||||
"networks",
|
||||
"training",
|
||||
"inference",
|
||||
}
|
||||
@functools.lru_cache(maxsize=32)
def _get_stopwords(languages: tuple) -> frozenset:
    """Return the ``stopwords`` entry for *languages* as a frozenset.

    The result is memoized per language tuple, so the argument must be
    hashable.
    """
    return frozenset(get_entity_patterns(languages)["stopwords"])
|
||||
|
||||
|
||||
# ==================== BACKWARD-COMPAT MODULE CONSTANTS ====================
|
||||
#
|
||||
# These mirror the old module-level constants so existing imports keep working.
|
||||
# They reflect the English defaults and are populated at import time from
|
||||
# ``mempalace/i18n/en.json``. Callers that need multi-language behavior should
|
||||
# pass the ``languages`` parameter to the public functions below.
|
||||
|
||||
_EN = get_entity_patterns(("en",))
|
||||
|
||||
PERSON_VERB_PATTERNS = list(_EN["person_verb_patterns"])
|
||||
PRONOUN_PATTERNS = list(_EN["pronoun_patterns"])
|
||||
PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE) if PRONOUN_PATTERNS else None
|
||||
DIALOGUE_PATTERNS = list(_EN["dialogue_patterns"])
|
||||
PROJECT_VERB_PATTERNS = list(_EN["project_verb_patterns"])
|
||||
STOPWORDS = set(_EN["stopwords"])
|
||||
|
||||
|
||||
# ==================== EXTENSION POINTS (not language-scoped) ====================
|
||||
|
||||
# For entity detection — prose only, no code files
|
||||
# Code files have too many capitalized names (classes, functions) that aren't entities
|
||||
@@ -440,55 +119,107 @@ SKIP_DIRS = {
|
||||
# ==================== CANDIDATE EXTRACTION ====================
|
||||
|
||||
|
||||
def extract_candidates(text: str) -> dict:
|
||||
def extract_candidates(text: str, languages=("en",)) -> dict:
|
||||
"""
|
||||
Extract all capitalized proper noun candidates from text.
|
||||
Returns {name: frequency} for names appearing 3+ times.
|
||||
"""
|
||||
# Find all capitalized words (not at sentence start — harder, so we use frequency as filter)
|
||||
raw = re.findall(r"\b([A-Z][a-z]{1,19})\b", text)
|
||||
|
||||
counts = defaultdict(int)
|
||||
for word in raw:
|
||||
if word.lower() not in STOPWORDS and len(word) > 1:
|
||||
Each language contributes its own character-class pattern (e.g. ASCII
|
||||
for English, Latin+diacritics for pt-br, Cyrillic for Russian,
|
||||
Devanagari for Hindi). Matches from all languages are unioned.
|
||||
"""
|
||||
langs = _normalize_langs(languages)
|
||||
patterns = get_entity_patterns(langs)
|
||||
stopwords = _get_stopwords(langs)
|
||||
|
||||
counts: defaultdict = defaultdict(int)
|
||||
|
||||
# Single-word candidates — one pattern per language
|
||||
for raw_pat in patterns["candidate_patterns"]:
|
||||
try:
|
||||
rx = re.compile(rf"\b({raw_pat})\b")
|
||||
except re.error:
|
||||
continue
|
||||
for word in rx.findall(text):
|
||||
if word.lower() in stopwords:
|
||||
continue
|
||||
if len(word) < 2:
|
||||
continue
|
||||
counts[word] += 1
|
||||
|
||||
# Also find multi-word proper nouns (e.g. "Memory Palace", "Claude Code")
|
||||
multi = re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b", text)
|
||||
for phrase in multi:
|
||||
if not any(w.lower() in STOPWORDS for w in phrase.split()):
|
||||
# Multi-word candidates — one pattern per language
|
||||
for raw_pat in patterns["multi_word_patterns"]:
|
||||
try:
|
||||
rx = re.compile(rf"\b({raw_pat})\b")
|
||||
except re.error:
|
||||
continue
|
||||
for phrase in rx.findall(text):
|
||||
if any(w.lower() in stopwords for w in phrase.split()):
|
||||
continue
|
||||
counts[phrase] += 1
|
||||
|
||||
# Filter: must appear at least 3 times to be a candidate
|
||||
return {name: count for name, count in counts.items() if count >= 3}
|
||||
|
||||
|
||||
# ==================== SIGNAL SCORING ====================
|
||||
|
||||
|
||||
def _build_patterns(name: str) -> dict:
|
||||
"""Pre-compile all regex patterns for a single entity name."""
|
||||
@functools.lru_cache(maxsize=256)
|
||||
def _build_patterns(name: str, languages: tuple = ("en",)) -> dict:
|
||||
"""Pre-compile all regex patterns for a single entity name, per language set."""
|
||||
n = re.escape(name)
|
||||
langs = _normalize_langs(languages)
|
||||
sources = get_entity_patterns(langs)
|
||||
|
||||
def _compile_each(raw_patterns, flags=re.IGNORECASE):
|
||||
compiled = []
|
||||
for p in raw_patterns:
|
||||
try:
|
||||
compiled.append(re.compile(p.format(name=n), flags))
|
||||
except (re.error, KeyError, IndexError):
|
||||
continue
|
||||
return compiled
|
||||
|
||||
direct_sources = sources.get("direct_address_patterns") or []
|
||||
direct_compiled = []
|
||||
for raw in direct_sources:
|
||||
try:
|
||||
direct_compiled.append(re.compile(raw.format(name=n), re.IGNORECASE))
|
||||
except (re.error, KeyError, IndexError):
|
||||
continue
|
||||
|
||||
return {
|
||||
"dialogue": [
|
||||
re.compile(p.format(name=n), re.MULTILINE | re.IGNORECASE) for p in DIALOGUE_PATTERNS
|
||||
],
|
||||
"person_verbs": [re.compile(p.format(name=n), re.IGNORECASE) for p in PERSON_VERB_PATTERNS],
|
||||
"project_verbs": [
|
||||
re.compile(p.format(name=n), re.IGNORECASE) for p in PROJECT_VERB_PATTERNS
|
||||
],
|
||||
"direct": re.compile(rf"\bhey\s+{n}\b|\bthanks?\s+{n}\b|\bhi\s+{n}\b", re.IGNORECASE),
|
||||
"dialogue": _compile_each(sources["dialogue_patterns"], re.MULTILINE | re.IGNORECASE),
|
||||
"person_verbs": _compile_each(sources["person_verb_patterns"]),
|
||||
"project_verbs": _compile_each(sources["project_verb_patterns"]),
|
||||
"direct": direct_compiled,
|
||||
"versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
|
||||
"code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
|
||||
}
|
||||
|
||||
|
||||
def score_entity(name: str, text: str, lines: list) -> dict:
|
||||
@functools.lru_cache(maxsize=32)
def _pronoun_re(languages: tuple):
    """Build one case-insensitive regex alternating every pronoun pattern.

    Returns ``None`` when the language set defines no pronoun patterns or
    when the combined expression fails to compile. Memoized per language
    tuple.
    """
    fragments = get_entity_patterns(_normalize_langs(languages)).get("pronoun_patterns")
    if fragments:
        try:
            return re.compile("|".join(fragments), re.IGNORECASE)
        except re.error:
            pass
    return None
|
||||
|
||||
|
||||
def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict:
|
||||
"""
|
||||
Score a candidate entity as person vs project.
|
||||
Returns scores and the signals that fired.
|
||||
"""
|
||||
patterns = _build_patterns(name)
|
||||
langs = _normalize_langs(languages)
|
||||
patterns = _build_patterns(name, langs)
|
||||
pronoun_re = _pronoun_re(langs)
|
||||
person_score = 0
|
||||
project_score = 0
|
||||
person_signals = []
|
||||
@@ -511,24 +242,25 @@ def score_entity(name: str, text: str, lines: list) -> dict:
|
||||
person_signals.append(f"'{name} ...' action ({matches}x)")
|
||||
|
||||
# Pronoun proximity — pronouns within 3 lines of the name
|
||||
name_lower = name.lower()
|
||||
name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
|
||||
pronoun_hits = 0
|
||||
for idx in name_line_indices:
|
||||
window_text = " ".join(lines[max(0, idx - 2) : idx + 3]).lower()
|
||||
for pronoun_pattern in PRONOUN_PATTERNS:
|
||||
if re.search(pronoun_pattern, window_text):
|
||||
if pronoun_re is not None:
|
||||
name_lower = name.lower()
|
||||
name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
|
||||
pronoun_hits = 0
|
||||
for idx in name_line_indices:
|
||||
window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
|
||||
if pronoun_re.search(window_text):
|
||||
pronoun_hits += 1
|
||||
break
|
||||
if pronoun_hits > 0:
|
||||
person_score += pronoun_hits * 2
|
||||
person_signals.append(f"pronoun nearby ({pronoun_hits}x)")
|
||||
if pronoun_hits > 0:
|
||||
person_score += pronoun_hits * 2
|
||||
person_signals.append(f"pronoun nearby ({pronoun_hits}x)")
|
||||
|
||||
# Direct address
|
||||
direct = len(patterns["direct"].findall(text))
|
||||
if direct > 0:
|
||||
person_score += direct * 4
|
||||
person_signals.append(f"addressed directly ({direct}x)")
|
||||
direct_hits = 0
|
||||
for rx in patterns["direct"]:
|
||||
direct_hits += len(rx.findall(text))
|
||||
if direct_hits > 0:
|
||||
person_score += direct_hits * 4
|
||||
person_signals.append(f"addressed directly ({direct_hits}x)")
|
||||
|
||||
# --- Project signals ---
|
||||
|
||||
@@ -629,13 +361,15 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
|
||||
# ==================== MAIN DETECT ====================
|
||||
|
||||
|
||||
def detect_entities(file_paths: list, max_files: int = 10) -> dict:
|
||||
def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) -> dict:
|
||||
"""
|
||||
Scan files and detect entity candidates.
|
||||
|
||||
Args:
|
||||
file_paths: List of Path objects to scan
|
||||
max_files: Max files to read (for speed)
|
||||
languages: Tuple of language codes whose entity patterns should be
|
||||
applied (union). Defaults to ``("en",)``.
|
||||
|
||||
Returns:
|
||||
{
|
||||
@@ -644,6 +378,8 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
|
||||
"uncertain":[...entity dicts...],
|
||||
}
|
||||
"""
|
||||
langs = _normalize_langs(languages)
|
||||
|
||||
# Collect text from files
|
||||
all_text = []
|
||||
all_lines = []
|
||||
@@ -666,7 +402,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
|
||||
combined_text = "\n".join(all_text)
|
||||
|
||||
# Extract candidates
|
||||
candidates = extract_candidates(combined_text)
|
||||
candidates = extract_candidates(combined_text, languages=langs)
|
||||
|
||||
if not candidates:
|
||||
return {"people": [], "projects": [], "uncertain": []}
|
||||
@@ -677,7 +413,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
|
||||
uncertain = []
|
||||
|
||||
for name, frequency in sorted(candidates.items(), key=lambda x: x[1], reverse=True):
|
||||
scores = score_entity(name, combined_text, all_lines)
|
||||
scores = score_entity(name, combined_text, all_lines, languages=langs)
|
||||
entity = classify_entity(name, frequency, scores)
|
||||
|
||||
if entity["type"] == "person":
|
||||
@@ -841,13 +577,14 @@ if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python entity_detector.py <directory>")
|
||||
print("Usage: python entity_detector.py <directory> [lang1,lang2,...]")
|
||||
sys.exit(1)
|
||||
|
||||
project_dir = sys.argv[1]
|
||||
print(f"Scanning: {project_dir}")
|
||||
langs = tuple(sys.argv[2].split(",")) if len(sys.argv) >= 3 else ("en",)
|
||||
print(f"Scanning: {project_dir} (languages: {', '.join(langs)})")
|
||||
files = scan_for_detection(project_dir)
|
||||
print(f"Reading {len(files)} files...")
|
||||
detected = detect_entities(files)
|
||||
detected = detect_entities(files, languages=langs)
|
||||
confirmed = confirm_entities(detected)
|
||||
print("Confirmed entities:", confirmed)
|
||||
|
||||
@@ -178,6 +178,12 @@ def _wikipedia_lookup(word: str) -> dict:
|
||||
Look up a word via Wikipedia REST API.
|
||||
Returns inferred type (person/place/concept/unknown) + confidence + summary.
|
||||
Free, no API key, handles disambiguation pages.
|
||||
|
||||
**Privacy warning:** This function makes an outbound HTTPS request to
|
||||
en.wikipedia.org, sending the queried word over the network. It should
|
||||
only be called when the caller has explicitly opted in via
|
||||
``allow_network=True`` in :meth:`EntityRegistry.research`. The default
|
||||
behaviour of ``research()`` is local-only (no network calls).
|
||||
"""
|
||||
try:
|
||||
url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(word)}"
|
||||
@@ -244,13 +250,14 @@ def _wikipedia_lookup(word: str) -> dict:
|
||||
|
||||
except urllib.error.HTTPError as e:
|
||||
if e.code == 404:
|
||||
# Not in Wikipedia — strong signal it's a proper noun (unusual name, nickname)
|
||||
# Not in Wikipedia — this tells us nothing definitive about
|
||||
# the word. Return "unknown" so the caller can decide.
|
||||
return {
|
||||
"inferred_type": "person",
|
||||
"confidence": 0.70,
|
||||
"inferred_type": "unknown",
|
||||
"confidence": 0.3,
|
||||
"wiki_summary": None,
|
||||
"wiki_title": None,
|
||||
"note": "not found in Wikipedia — likely a proper noun or unusual name",
|
||||
"note": "not found in Wikipedia",
|
||||
}
|
||||
return {"inferred_type": "unknown", "confidence": 0.0, "wiki_summary": None}
|
||||
except (urllib.error.URLError, OSError, json.JSONDecodeError, KeyError):
|
||||
@@ -309,7 +316,15 @@ class EntityRegistry:
|
||||
|
||||
def save(self):
|
||||
self._path.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
self._path.parent.chmod(0o700)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
self._path.write_text(json.dumps(self._data, indent=2), encoding="utf-8")
|
||||
try:
|
||||
self._path.chmod(0o600)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def _empty() -> dict:
|
||||
@@ -502,20 +517,41 @@ class EntityRegistry:
|
||||
|
||||
# ── Research unknown words ───────────────────────────────────────────────
|
||||
|
||||
def research(self, word: str, auto_confirm: bool = False) -> dict:
|
||||
def research(self, word: str, auto_confirm: bool = False, allow_network: bool = False) -> dict:
|
||||
"""
|
||||
Research an unknown word via Wikipedia.
|
||||
Caches result. If auto_confirm=False, marks as unconfirmed (needs user review).
|
||||
Returns the lookup result.
|
||||
Research an unknown word.
|
||||
|
||||
By default this is **local-only**: it checks the wiki cache and
|
||||
returns ``"unknown"`` for uncached words. Pass
|
||||
``allow_network=True`` to explicitly opt in to an outbound
|
||||
Wikipedia lookup. This design honours the project's
|
||||
*local-first, zero API* and *privacy by architecture* principles
|
||||
— no data leaves the machine unless the caller requests it.
|
||||
|
||||
Caches result. If *auto_confirm* is ``False``, marks the entry
|
||||
as unconfirmed (needs user review).
|
||||
"""
|
||||
# Already cached?
|
||||
cache = self._data.setdefault("wiki_cache", {})
|
||||
# Check cache (read-only — no mutation when allow_network is False)
|
||||
cache = self._data.get("wiki_cache", {})
|
||||
if word in cache:
|
||||
return cache[word]
|
||||
|
||||
if not allow_network:
|
||||
return {
|
||||
"inferred_type": "unknown",
|
||||
"confidence": 0.0,
|
||||
"wiki_summary": None,
|
||||
"wiki_title": None,
|
||||
"word": word,
|
||||
"confirmed": False,
|
||||
"note": "network lookup disabled — pass allow_network=True to query Wikipedia",
|
||||
}
|
||||
|
||||
# Network path — ensure wiki_cache key exists before writing
|
||||
cache = self._data.setdefault("wiki_cache", {})
|
||||
result = _wikipedia_lookup(word)
|
||||
result["word"] = word
|
||||
result["confirmed"] = auto_confirm
|
||||
result.setdefault("word", word)
|
||||
result.setdefault("confirmed", auto_confirm)
|
||||
|
||||
cache[word] = result
|
||||
self.save()
|
||||
@@ -547,15 +583,19 @@ class EntityRegistry:
|
||||
|
||||
# ── Learn from sessions ──────────────────────────────────────────────────
|
||||
|
||||
def learn_from_text(self, text: str, min_confidence: float = 0.75) -> list:
|
||||
def learn_from_text(self, text: str, min_confidence: float = 0.75, languages=("en",)) -> list:
|
||||
"""
|
||||
Scan session text for new entity candidates.
|
||||
Returns list of newly discovered candidates for review.
|
||||
|
||||
``languages`` is forwarded to entity detection — pass the user's
|
||||
configured ``MempalaceConfig().entity_languages`` to match the
|
||||
locales used at ``mempalace init`` time.
|
||||
"""
|
||||
from mempalace.entity_detector import extract_candidates, score_entity, classify_entity
|
||||
|
||||
lines = text.splitlines()
|
||||
candidates = extract_candidates(text)
|
||||
candidates = extract_candidates(text, languages=languages)
|
||||
new_candidates = []
|
||||
|
||||
for name, frequency in candidates.items():
|
||||
@@ -563,7 +603,7 @@ class EntityRegistry:
|
||||
if name in self.people or name in self.projects:
|
||||
continue
|
||||
|
||||
scores = score_entity(name, text, lines)
|
||||
scores = score_entity(name, text, lines, languages=languages)
|
||||
entity = classify_entity(name, frequency, scores)
|
||||
|
||||
if entity["type"] == "person" and entity["confidence"] >= min_confidence:
|
||||
|
||||
+13
-1
@@ -49,9 +49,15 @@ def export_palace(palace_path: str, output_dir: str, format: str = "markdown") -
|
||||
return {"wings": 0, "rooms": 0, "drawers": 0}
|
||||
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
try:
|
||||
os.chmod(output_dir, 0o700)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
|
||||
# Track which room files have been opened (so we can append vs overwrite)
|
||||
opened_rooms: set[tuple[str, str]] = set()
|
||||
# Track which wing directories have been created and chmoded
|
||||
created_wing_dirs: set[str] = set()
|
||||
# Track stats per wing: {wing: {room: count}}
|
||||
wing_stats: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
|
||||
total_drawers = 0
|
||||
@@ -82,7 +88,13 @@ def export_palace(palace_path: str, output_dir: str, format: str = "markdown") -
|
||||
for wing, rooms in batch_grouped.items():
|
||||
safe_wing = _safe_path_component(wing)
|
||||
wing_dir = os.path.join(output_dir, safe_wing)
|
||||
os.makedirs(wing_dir, exist_ok=True)
|
||||
if wing_dir not in created_wing_dirs:
|
||||
os.makedirs(wing_dir, exist_ok=True)
|
||||
try:
|
||||
os.chmod(wing_dir, 0o700)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
created_wing_dirs.add(wing_dir)
|
||||
|
||||
for room, drawers in rooms.items():
|
||||
safe_room = _safe_path_component(room)
|
||||
|
||||
@@ -0,0 +1,335 @@
|
||||
"""
|
||||
fact_checker.py — Verify text against known facts in the palace.
|
||||
|
||||
Checks AI responses, diary entries, and new content against the entity
|
||||
registry and knowledge graph for three classes of issue:
|
||||
|
||||
* similar_name — text mentions a name that's one/two edits
|
||||
away from *another* registered name, raising
|
||||
the possibility of a typo or mix-up.
|
||||
* relationship_mismatch — text asserts a role between two entities
|
||||
(e.g. "Bob is Alice's brother") while the KG
|
||||
records a *different* current role for the
|
||||
same subject/object pair.
|
||||
* stale_fact — text asserts a fact that the KG marks closed
|
||||
(``valid_to`` in the past).
|
||||
|
||||
Purely offline. Inputs: entity_registry JSON + KG SQLite. No network.
|
||||
|
||||
Usage:
|
||||
from mempalace.fact_checker import check_text
|
||||
issues = check_text("Bob is Alice's brother", palace_path)
|
||||
|
||||
# CLI
|
||||
python -m mempalace.fact_checker "Bob is Alice's brother" \\
|
||||
--palace ~/.mempalace/palace
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# Share miner's mtime-cached registry loader so we don't double-read
|
||||
# ~/.mempalace/known_entities.json on every check_text call.
|
||||
from .miner import _load_known_entities_raw
|
||||
|
||||
|
||||
# Narrow detection patterns — parse "X is Y's Z" and "X's Z is Y".
|
||||
# Names are captured greedily as word sequences (letters + optional
|
||||
# capitalized follow-ons) so simple multi-token names still work.
|
||||
# Relationship words are constrained to sane lengths to avoid matching
|
||||
# arbitrary filler.
|
||||
_RELATIONSHIP_PATTERNS = [
|
||||
# "Bob is Alice's brother" → subject=Bob, possessor=Alice, role=brother
|
||||
re.compile(r"\b([A-Z][\w-]+)\s+is\s+([A-Z][\w-]+)'s\s+([a-z]{3,20})\b"),
|
||||
# "Alice's brother is Bob" → possessor=Alice, role=brother, subject=Bob
|
||||
re.compile(r"\b([A-Z][\w-]+)'s\s+([a-z]{3,20})\s+is\s+([A-Z][\w-]+)\b"),
|
||||
]
|
||||
|
||||
|
||||
def check_text(text: str, palace_path: str = None, config=None) -> list:
    """Return a list of issues detected in ``text``.

    Empty list means "no contradictions found" — absence of evidence, not
    evidence of absence. The detector is deliberately conservative:
    every issue is anchored to a specific KG fact or registry entry.

    Args:
        text: The text to check. Empty or ``None`` yields ``[]``.
        palace_path: Palace directory holding the knowledge graph. When
            ``None``, ``config.palace_path`` is used.
        config: Optional config object exposing ``palace_path``. Only
            consulted (and only constructed) when ``palace_path`` is
            ``None``.

    Returns:
        A list of issue dicts from the entity-confusion check followed by
        those from the KG-contradiction check.
    """
    # Nothing to check — return before touching config or the filesystem.
    if not text:
        return []

    if palace_path is None:
        if config is None:
            # Imported lazily so callers that pass an explicit path never
            # pay for (or depend on) config loading.
            from .config import MempalaceConfig

            config = MempalaceConfig()
        palace_path = config.palace_path

    entity_names_raw = _load_known_entities_raw()

    issues: list = []
    issues.extend(_check_entity_confusion(text, entity_names_raw))
    issues.extend(_check_kg_contradictions(text, palace_path))
    return issues
|
||||
|
||||
|
||||
# ── entity-name confusion ────────────────────────────────────────────
|
||||
|
||||
|
||||
def _flatten_names(entity_names_raw: dict) -> set:
|
||||
"""Flatten a ``{category: [names]}`` or ``{category: {name: meta}}``
|
||||
registry into a set of names."""
|
||||
flat: set = set()
|
||||
for cat in entity_names_raw.values():
|
||||
if isinstance(cat, list):
|
||||
flat.update(str(n) for n in cat if n)
|
||||
elif isinstance(cat, dict):
|
||||
flat.update(str(k) for k in cat.keys() if k)
|
||||
return flat
|
||||
|
||||
|
||||
def _check_entity_confusion(text: str, entity_names_raw: dict) -> list:
    """Flag names mentioned in the text that are edit-distance ≤ 2 from
    a *different* registered name — a common typo / mix-up pattern.

    Only names that actually appear in ``text`` (whole-word,
    case-insensitive match) are compared against the rest of the
    registry, so the cost is O(m × n) distance computations at most,
    where m is the number of mentioned names and n the registry size.

    Args:
        text: The text being checked.
        entity_names_raw: Raw registry mapping, flattened via
            ``_flatten_names``.

    Returns:
        A list of ``similar_name`` issue dicts, in a reproducible order
        (sorted by mentioned name, then by registry name). Each unordered
        pair of names (compared case-insensitively) is reported once.
    """
    all_names = _flatten_names(entity_names_raw)
    if not all_names:
        return []

    # Sort once so the report order does not depend on set iteration
    # order, which varies between interpreter runs for strings.
    candidates = sorted(all_names)

    # Which names from the registry actually appear in the text?
    mentioned = [
        name
        for name in candidates
        if re.search(r"\b" + re.escape(name) + r"\b", text, re.IGNORECASE)
    ]
    if not mentioned:
        return []

    mentioned_set = set(mentioned)  # O(1) membership in the inner loop
    lowered = {name: name.lower() for name in candidates}

    issues: list = []
    seen_pairs: set = set()
    for name_a in mentioned:
        a_lower = lowered[name_a]
        for name_b in candidates:
            if name_b == name_a:
                continue
            b_lower = lowered[name_b]
            # Dedupe by unordered pair so we don't double-report.
            pair_key = tuple(sorted((a_lower, b_lower)))
            if pair_key in seen_pairs:
                continue
            # Only flag when name_b is a *different* registry entry that
            # was NOT mentioned — otherwise both names in the text is
            # just the user writing about two people.
            if name_b in mentioned_set:
                seen_pairs.add(pair_key)
                continue
            # Edit distance is never smaller than the length difference,
            # so pairs more than 2 characters apart cannot qualify.
            if abs(len(a_lower) - len(b_lower)) > 2:
                continue
            distance = _edit_distance(a_lower, b_lower)
            if 0 < distance <= 2:
                issues.append(
                    {
                        "type": "similar_name",
                        "detail": (
                            f"'{name_a}' mentioned — did you mean "
                            f"'{name_b}'? (edit distance {distance})"
                        ),
                        "names": [name_a, name_b],
                        "distance": distance,
                    }
                )
                seen_pairs.add(pair_key)
    return issues
|
||||
|
||||
|
||||
# ── KG contradictions ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _extract_claims(text: str) -> list:
    """Return the structured relationship claims found in ``text``.

    Two surface forms are recognised, "X is Y's Z" and "X's Z is Y".
    Both produce the same claim: ``X`` holds role ``Z`` with respect to
    ``Y``. Each claim is a dict with keys ``subject`` (X), ``predicate``
    (Z, lower-cased), ``object`` (Y) and ``span`` (the matched text).
    Entity names keep their original casing. Claims are grouped by
    pattern: all matches of the first form, then all of the second.
    """
    found: list = []
    for position, pattern in enumerate(_RELATIONSHIP_PATTERNS):
        # The first pattern leads with the subject; the other leads with
        # the possessor.
        subject_leads = position == 0
        for hit in pattern.finditer(text):
            first, second, third = hit.groups()
            if subject_leads:
                who, owner, relation = first, second, third
            else:
                owner, relation, who = first, second, third
            record = {
                "subject": who,
                "predicate": relation.lower(),
                "object": owner,
                "span": hit.group(0),
            }
            found.append(record)
    return found
|
||||
|
||||
|
||||
def _check_kg_contradictions(text: str, palace_path: str) -> list:
    """Compare each claim in ``text`` against the KG.

    For every claim ``(subject, predicate, object)`` parsed from the
    text, look up the subject's outgoing KG triples:

    * ``relationship_mismatch`` fires when the KG records a current fact
      about the same ``(subject, object)`` pair but with a *different*
      predicate — e.g. text says "brother" but KG says "husband".
    * ``stale_fact`` fires when the KG has the exact ``(subject,
      predicate, object)`` triple only in closed form: a non-current
      fact whose ``valid_to`` is in the past, with no current fact
      recording the same triple. A relationship that was closed and
      later re-opened is still current and is not reported.

    Args:
        text: The text to scan for claims.
        palace_path: Palace directory; the KG database is expected at
            ``<palace_path>/knowledge_graph.sqlite3``.

    Returns:
        A list of issue dicts. Empty when no claims parse, when the KG
        cannot be opened, or when nothing contradicts the KG.
    """
    claims = _extract_claims(text)
    if not claims:
        return []

    try:
        from .knowledge_graph import KnowledgeGraph

        # KG lives alongside the palace collection. It must be opened via
        # ``db_path`` — KnowledgeGraph has no ``palace_path`` kwarg, and a
        # TypeError here would be swallowed below, silently disabling
        # every KG check.
        kg = KnowledgeGraph(db_path=os.path.join(palace_path, "knowledge_graph.sqlite3"))
    except Exception:
        # Deliberate best-effort: KG unavailable (brand-new palace,
        # corrupted DB, etc.) — skip the KG checks entirely.
        return []

    # Loop-invariant: today's UTC date, compared as an ISO string against
    # the stored ``valid_to`` values.
    today_iso = datetime.now(timezone.utc).date().isoformat()

    issues: list = []
    for claim in claims:
        subject = claim["subject"]
        claim_pred = claim["predicate"]
        claim_obj = claim["object"]
        try:
            facts = kg.query_entity(subject, direction="outgoing")
        except Exception:
            # Best-effort per claim: a failed lookup skips this claim only.
            continue
        if not facts:
            continue

        current_facts = [f for f in facts if f.get("current")]

        # Mismatch: KG fact about same (subject, object) pair but different predicate.
        for fact in current_facts:
            if not _objects_match(fact.get("object"), claim_obj):
                continue
            kg_pred = (fact.get("predicate") or "").lower()
            if kg_pred and kg_pred != claim_pred:
                issues.append(
                    {
                        "type": "relationship_mismatch",
                        "detail": (
                            f"Text says '{claim['span']}' but KG records "
                            f"{subject} {kg_pred} {fact.get('object')}"
                        ),
                        "entity": subject,
                        "claim": {
                            "predicate": claim_pred,
                            "object": claim_obj,
                        },
                        "kg_fact": {
                            "predicate": kg_pred,
                            "object": fact.get("object"),
                        },
                    }
                )

        # If the KG currently holds this exact triple, the claim is
        # current no matter how many older closed copies exist.
        still_current = any(
            (f.get("predicate") or "").lower() == claim_pred
            and _objects_match(f.get("object"), claim_obj)
            for f in current_facts
        )
        if still_current:
            continue

        # Stale fact: exact match on (subject, predicate, object) but KG
        # closed the window in the past.
        for fact in facts:
            if fact.get("current"):
                continue
            kg_pred = (fact.get("predicate") or "").lower()
            if kg_pred != claim_pred:
                continue
            if not _objects_match(fact.get("object"), claim_obj):
                continue
            valid_to = fact.get("valid_to")
            if valid_to and str(valid_to) < today_iso:
                issues.append(
                    {
                        "type": "stale_fact",
                        "detail": (
                            f"Text says '{claim['span']}' but KG marks "
                            f"this fact closed on {valid_to}"
                        ),
                        "entity": subject,
                        "valid_to": valid_to,
                    }
                )

    return issues
|
||||
|
||||
|
||||
def _objects_match(kg_obj, claim_obj: str) -> bool:
|
||||
if kg_obj is None or not claim_obj:
|
||||
return False
|
||||
return str(kg_obj).strip().lower() == claim_obj.strip().lower()
|
||||
|
||||
|
||||
# ── Levenshtein helper (tight iterative version) ─────────────────────
|
||||
|
||||
|
||||
def _edit_distance(s1: str, s2: str) -> int:
|
||||
"""Levenshtein distance. O(len(s1) * len(s2)) time, O(len(s2)) space."""
|
||||
if len(s1) < len(s2):
|
||||
s1, s2 = s2, s1
|
||||
if not s2:
|
||||
return len(s1)
|
||||
prev = list(range(len(s2) + 1))
|
||||
for i, c1 in enumerate(s1):
|
||||
curr = [i + 1]
|
||||
for j, c2 in enumerate(s2):
|
||||
curr.append(
|
||||
min(
|
||||
prev[j + 1] + 1,
|
||||
curr[j] + 1,
|
||||
prev[j] + (0 if c1 == c2 else 1),
|
||||
)
|
||||
)
|
||||
prev = curr
|
||||
return prev[-1]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: check one piece of text and report issues
    # as JSON. Exit status 0 = nothing found, 1 = at least one issue.
    import argparse
    import json
    import sys

    parser = argparse.ArgumentParser(
        description="Check text against known facts in the MemPalace palace.",
        epilog="Exits 0 when no issues found, 1 when one or more issues detected.",
    )
    parser.add_argument("text", nargs="?", help="Text to check (or use --stdin).")
    parser.add_argument(
        "--palace",
        # None lets check_text() fall back to the configured palace path
        # instead of overriding it with a hard-coded location.
        # NOTE(review): assumes the config default is ~/.mempalace/palace,
        # so un-configured users see no change — confirm.
        default=None,
        help="Path to the palace directory (default: the configured palace path).",
    )
    parser.add_argument("--stdin", action="store_true", help="Read text from stdin.")
    args = parser.parse_args()

    if args.stdin:
        text_in = sys.stdin.read()
    elif args.text:
        text_in = args.text
    else:
        # parser.error() prints usage and exits, so text_in is always
        # bound below.
        parser.error("Provide text as argument or use --stdin.")

    found = check_text(text_in, palace_path=args.palace)
    if found:
        # Keep non-ASCII entity names readable in the report.
        print(json.dumps(found, indent=2, ensure_ascii=False))
        sys.exit(1)
    print("No contradictions found.")
|
||||
+105
-41
@@ -18,18 +18,22 @@ SAVE_INTERVAL = 15
|
||||
STATE_DIR = Path.home() / ".mempalace" / "hook_state"
|
||||
|
||||
STOP_BLOCK_REASON = (
|
||||
"AUTO-SAVE checkpoint. Save key topics, decisions, quotes, and code "
|
||||
"from this session to your memory system. Organize into appropriate "
|
||||
"categories. Use verbatim quotes where possible. Continue conversation "
|
||||
"after saving."
|
||||
"AUTO-SAVE checkpoint (MemPalace). Save this session's key content:\n"
|
||||
"1. mempalace_diary_write — AAAK-compressed session summary\n"
|
||||
"2. mempalace_add_drawer — verbatim quotes, decisions, code snippets\n"
|
||||
"3. mempalace_kg_add — entity relationships (optional)\n"
|
||||
"Do NOT write to Claude Code's native auto-memory (.md files). "
|
||||
"Continue conversation after saving."
|
||||
)
|
||||
|
||||
PRECOMPACT_BLOCK_REASON = (
|
||||
"COMPACTION IMMINENT. Save ALL topics, decisions, quotes, code, and "
|
||||
"important context from this session to your memory system. Be thorough "
|
||||
"\u2014 after compaction, detailed context will be lost. Organize into "
|
||||
"appropriate categories. Use verbatim quotes where possible. Save "
|
||||
"everything, then allow compaction to proceed."
|
||||
"COMPACTION IMMINENT (MemPalace). Save ALL session content before context is lost:\n"
|
||||
"1. mempalace_diary_write — thorough AAAK-compressed session summary\n"
|
||||
"2. mempalace_add_drawer — ALL verbatim quotes, decisions, code, context\n"
|
||||
"3. mempalace_kg_add — entity relationships (optional)\n"
|
||||
"Be thorough \u2014 after compaction, detailed context will be lost. "
|
||||
"Do NOT write to Claude Code's native auto-memory (.md files). "
|
||||
"Save everything to MemPalace, then allow compaction to proceed."
|
||||
)
|
||||
|
||||
|
||||
@@ -39,9 +43,32 @@ def _sanitize_session_id(session_id: str) -> str:
|
||||
return sanitized or "unknown"
|
||||
|
||||
|
||||
def _validate_transcript_path(transcript_path: str) -> Path:
|
||||
"""Validate and resolve a transcript path, rejecting paths outside expected roots.
|
||||
|
||||
Returns a resolved Path if valid, or None if the path should be rejected.
|
||||
Accepted paths must:
|
||||
- Have a .jsonl or .json extension
|
||||
- Not contain '..' after resolution (path traversal prevention)
|
||||
"""
|
||||
if not transcript_path:
|
||||
return None
|
||||
path = Path(transcript_path).expanduser().resolve()
|
||||
if path.suffix not in (".jsonl", ".json"):
|
||||
return None
|
||||
# Reject if the original input contained '..' traversal components
|
||||
if ".." in Path(transcript_path).parts:
|
||||
return None
|
||||
return path
|
||||
|
||||
|
||||
def _count_human_messages(transcript_path: str) -> int:
|
||||
"""Count human messages in a JSONL transcript, skipping command-messages."""
|
||||
path = Path(transcript_path).expanduser()
|
||||
path = _validate_transcript_path(transcript_path)
|
||||
if path is None:
|
||||
if transcript_path:
|
||||
_log(f"WARNING: transcript_path rejected by validator: {transcript_path!r}")
|
||||
return 0
|
||||
if not path.is_file():
|
||||
return 0
|
||||
count = 0
|
||||
@@ -78,14 +105,30 @@ def _count_human_messages(transcript_path: str) -> int:
|
||||
return count
|
||||
|
||||
|
||||
_state_dir_initialized = False
|
||||
|
||||
|
||||
def _log(message: str):
|
||||
"""Append to hook state log file."""
|
||||
global _state_dir_initialized
|
||||
try:
|
||||
STATE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
if not _state_dir_initialized:
|
||||
STATE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
STATE_DIR.chmod(0o700)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
_state_dir_initialized = True
|
||||
log_path = STATE_DIR / "hook.log"
|
||||
is_new = not log_path.exists()
|
||||
timestamp = datetime.now().strftime("%H:%M:%S")
|
||||
with open(log_path, "a") as f:
|
||||
f.write(f"[{timestamp}] {message}\n")
|
||||
if is_new:
|
||||
try:
|
||||
log_path.chmod(0o600)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
@@ -95,20 +138,53 @@ def _output(data: dict):
|
||||
print(json.dumps(data, indent=2, ensure_ascii=False))
|
||||
|
||||
|
||||
def _maybe_auto_ingest():
|
||||
"""If MEMPAL_DIR is set and exists, run mempalace mine in background."""
|
||||
def _get_mine_dir(transcript_path: str = "") -> str:
|
||||
"""Determine directory to mine from MEMPAL_DIR or transcript path."""
|
||||
mempal_dir = os.environ.get("MEMPAL_DIR", "")
|
||||
if mempal_dir and os.path.isdir(mempal_dir):
|
||||
try:
|
||||
log_path = STATE_DIR / "hook.log"
|
||||
with open(log_path, "a") as log_f:
|
||||
subprocess.Popen(
|
||||
[sys.executable, "-m", "mempalace", "mine", mempal_dir],
|
||||
stdout=log_f,
|
||||
stderr=log_f,
|
||||
)
|
||||
except OSError:
|
||||
pass
|
||||
return mempal_dir
|
||||
if transcript_path:
|
||||
path = Path(transcript_path).expanduser()
|
||||
if path.is_file():
|
||||
return str(path.parent)
|
||||
return ""
|
||||
|
||||
|
||||
def _maybe_auto_ingest(transcript_path: str = ""):
    """Start ``mempalace mine`` in the background when there is a directory to mine.

    The directory comes from ``_get_mine_dir``; when it returns nothing
    this is a no-op. The child's stdout and stderr are appended to
    ``hook.log`` in the hook state directory. The call does not wait for
    the child, and OS-level failures are ignored.
    """
    target = _get_mine_dir(transcript_path)
    if target:
        command = [sys.executable, "-m", "mempalace", "mine", target]
        try:
            STATE_DIR.mkdir(parents=True, exist_ok=True)
            with open(STATE_DIR / "hook.log", "a") as sink:
                subprocess.Popen(command, stdout=sink, stderr=sink)
        except OSError:
            # Best-effort: ingestion must never break the hook.
            pass
|
||||
|
||||
|
||||
def _mine_sync(transcript_path: str = ""):
    """Run ``mempalace mine`` and wait for it (precompact path).

    Used before compaction so mined data lands first. The directory comes
    from ``_get_mine_dir``; when it returns nothing this is a no-op.
    Output is appended to ``hook.log`` in the hook state directory. The
    run is capped at 60 seconds; a timeout or OS-level failure is ignored.
    """
    target = _get_mine_dir(transcript_path)
    if target:
        command = [sys.executable, "-m", "mempalace", "mine", target]
        try:
            STATE_DIR.mkdir(parents=True, exist_ok=True)
            with open(STATE_DIR / "hook.log", "a") as sink:
                subprocess.run(command, stdout=sink, stderr=sink, timeout=60)
        except (OSError, subprocess.TimeoutExpired):
            # Best-effort: a slow or failed mine must not block the hook.
            pass
|
||||
|
||||
|
||||
SUPPORTED_HARNESSES = {"claude-code", "codex"}
|
||||
@@ -165,7 +241,7 @@ def hook_stop(data: dict, harness: str):
|
||||
_log(f"TRIGGERING SAVE at exchange {exchange_count}")
|
||||
|
||||
# Optional: auto-ingest if MEMPAL_DIR is set
|
||||
_maybe_auto_ingest()
|
||||
_maybe_auto_ingest(transcript_path)
|
||||
|
||||
_output({"decision": "block", "reason": STOP_BLOCK_REASON})
|
||||
else:
|
||||
@@ -187,29 +263,17 @@ def hook_session_start(data: dict, harness: str):
|
||||
|
||||
|
||||
def hook_precompact(data: dict, harness: str):
|
||||
"""Precompact hook: always block with comprehensive save instruction."""
|
||||
"""Precompact hook: mine transcript synchronously, then allow compaction."""
|
||||
parsed = _parse_harness_input(data, harness)
|
||||
session_id = parsed["session_id"]
|
||||
transcript_path = parsed["transcript_path"]
|
||||
|
||||
_log(f"PRE-COMPACT triggered for session {session_id}")
|
||||
|
||||
# Optional: auto-ingest synchronously before compaction (so memories land first)
|
||||
mempal_dir = os.environ.get("MEMPAL_DIR", "")
|
||||
if mempal_dir and os.path.isdir(mempal_dir):
|
||||
try:
|
||||
log_path = STATE_DIR / "hook.log"
|
||||
with open(log_path, "a") as log_f:
|
||||
subprocess.run(
|
||||
[sys.executable, "-m", "mempalace", "mine", mempal_dir],
|
||||
stdout=log_f,
|
||||
stderr=log_f,
|
||||
timeout=60,
|
||||
)
|
||||
except OSError:
|
||||
pass
|
||||
# Mine synchronously so data lands before compaction proceeds
|
||||
_mine_sync(transcript_path)
|
||||
|
||||
# Always block -- compaction = save everything
|
||||
_output({"decision": "block", "reason": PRECOMPACT_BLOCK_REASON})
|
||||
_output({})
|
||||
|
||||
|
||||
def run_hook(hook_name: str, harness: str):
|
||||
|
||||
@@ -7,6 +7,10 @@ Usage:
|
||||
print(t("cli.mine_start", path="/docs")) # "Extraction de /docs..."
|
||||
print(t("terms.wing")) # "aile"
|
||||
print(t("aaak.instruction")) # AAAK compression instruction in French
|
||||
|
||||
Each locale JSON may include an ``entity`` section with patterns used by
|
||||
``mempalace.entity_detector``. See ``get_entity_patterns`` for the merge rules
|
||||
and the README section "Adding a new language" for the schema.
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -16,6 +20,9 @@ _LANG_DIR = Path(__file__).parent
|
||||
_strings: dict = {}
|
||||
_current_lang: str = "en"
|
||||
|
||||
# Cache: tuple(langs) -> merged entity pattern dict
|
||||
_entity_cache: dict = {}
|
||||
|
||||
|
||||
def available_languages() -> list[str]:
|
||||
"""Return list of available language codes."""
|
||||
@@ -72,5 +79,112 @@ def get_regex() -> dict:
|
||||
return _strings.get("regex", {})
|
||||
|
||||
|
||||
def _load_entity_section(lang: str) -> dict:
    """Load the raw ``entity`` section for one language.

    Reads ``<lang>.json`` from the locale directory and returns its
    ``entity`` mapping.

    Args:
        lang: Language code, used as the JSON file's base name.

    Returns:
        The ``entity`` dict, or ``{}`` when the code is not a plain file
        name, the file is missing or unreadable, it is not valid UTF-8
        JSON, the JSON root is not an object, or ``entity`` is absent or
        not an object.
    """
    # Language codes can come from user input; refuse anything that could
    # point outside the locale directory (e.g. "../../secrets").
    if (
        not isinstance(lang, str)
        or not lang
        or "/" in lang
        or "\\" in lang
        or lang in (".", "..")
    ):
        return {}

    lang_file = _LANG_DIR / f"{lang}.json"
    try:
        data = json.loads(lang_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}

    if not isinstance(data, dict):
        return {}
    section = data.get("entity")
    return section if isinstance(section, dict) else {}
|
||||
|
||||
|
||||
def get_entity_patterns(languages=("en",)) -> dict:
    """Return merged entity detection patterns for the requested languages.

    Entity detection patterns live under each locale's ``entity`` section.
    This function merges them into a single dict for consumption by
    ``mempalace.entity_detector``.

    Merge rules:
      - List fields (person_verb_patterns, pronoun_patterns, dialogue_patterns,
        project_verb_patterns) are concatenated in the order of ``languages``,
        with duplicates removed while preserving first occurrence.
      - ``stopwords`` is the lower-cased set union across all languages,
        returned as a sorted list.
      - ``candidate_patterns`` and ``multi_word_patterns`` are returned as
        lists (one entry per distinct language that declares one) since
        they use different character classes; callers run each pattern
        independently and union the matches.
      - ``direct_address_patterns`` is a list of per-language alternation
        patterns (not concatenated — each is applied separately).

    If ``languages`` is empty or no requested language declares entity data,
    English is used as a fallback so callers always get a working config.

    Args:
        languages: Iterable of language codes. A single code given as a
            bare string (``"fr"``) is treated as ``("fr",)``. A language
            listed more than once is merged only once.

    Returns:
        The merged pattern dict. Results are cached per ``languages``
        tuple and the cached object itself is returned, so callers must
        treat it as read-only.
    """
    if isinstance(languages, str):
        # A bare "fr" would otherwise be iterated character by character.
        languages = (languages,)
    if not languages:
        languages = ("en",)
    key = tuple(languages)
    if key in _entity_cache:
        return _entity_cache[key]

    # Load each distinct language once, in first-seen order, keeping only
    # those that actually declare entity data.
    sections = [s for s in map(_load_entity_section, dict.fromkeys(key)) if s]
    if not sections:
        # Fallback: load English directly (may itself be empty).
        sections = [_load_entity_section("en")]

    candidate_patterns: list[str] = []
    multi_word_patterns: list[str] = []
    direct_address: list[str] = []
    person_verbs: list[str] = []
    pronouns: list[str] = []
    dialogue: list[str] = []
    project_verbs: list[str] = []
    stopwords: set = set()

    # (JSON field, accumulator) pairs: one optional pattern per language ...
    single_fields = (
        ("candidate_pattern", candidate_patterns),
        ("multi_word_pattern", multi_word_patterns),
        ("direct_address_pattern", direct_address),
    )
    # ... versus a list of patterns per language.
    list_fields = (
        ("person_verb_patterns", person_verbs),
        ("pronoun_patterns", pronouns),
        ("dialogue_patterns", dialogue),
        ("project_verb_patterns", project_verbs),
    )

    for section in sections:
        for field, bucket in single_fields:
            if section.get(field):
                bucket.append(section[field])
        for field, bucket in list_fields:
            bucket.extend(section.get(field) or [])
        stopwords.update(w.lower() for w in section.get("stopwords") or [])

    merged = {
        "candidate_patterns": candidate_patterns,
        "multi_word_patterns": multi_word_patterns,
        "person_verb_patterns": _dedupe(person_verbs),
        "pronoun_patterns": _dedupe(pronouns),
        "dialogue_patterns": _dedupe(dialogue),
        "direct_address_patterns": direct_address,
        "project_verb_patterns": _dedupe(project_verbs),
        "stopwords": sorted(stopwords),
    }
    _entity_cache[key] = merged
    return merged
|
||||
|
||||
|
||||
def _dedupe(items: list) -> list:
|
||||
"""Remove duplicates while preserving first-occurrence order."""
|
||||
seen = set()
|
||||
out = []
|
||||
for item in items:
|
||||
if item not in seen:
|
||||
seen.add(item)
|
||||
out.append(item)
|
||||
return out
|
||||
|
||||
|
||||
# Auto-load English on import
|
||||
load_lang("en")
|
||||
|
||||
@@ -40,5 +40,107 @@
|
||||
"stop_words": "the this that these those some many most each every other only such very will would could should must shall yeah okay also even then now already still back done make take give know think want need going come find work added saved session summary conversation topics source about once just really actually here there where good great better thank please sorry right wrong true false",
|
||||
"quote_pattern": "\"([^\"]{20,200})\"",
|
||||
"action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
|
||||
},
|
||||
"entity": {
|
||||
"candidate_pattern": "[A-Z][a-z]{1,19}",
|
||||
"multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
|
||||
"person_verb_patterns": [
|
||||
"\\b{name}\\s+said\\b",
|
||||
"\\b{name}\\s+asked\\b",
|
||||
"\\b{name}\\s+told\\b",
|
||||
"\\b{name}\\s+replied\\b",
|
||||
"\\b{name}\\s+laughed\\b",
|
||||
"\\b{name}\\s+smiled\\b",
|
||||
"\\b{name}\\s+cried\\b",
|
||||
"\\b{name}\\s+felt\\b",
|
||||
"\\b{name}\\s+thinks?\\b",
|
||||
"\\b{name}\\s+wants?\\b",
|
||||
"\\b{name}\\s+loves?\\b",
|
||||
"\\b{name}\\s+hates?\\b",
|
||||
"\\b{name}\\s+knows?\\b",
|
||||
"\\b{name}\\s+decided\\b",
|
||||
"\\b{name}\\s+pushed\\b",
|
||||
"\\b{name}\\s+wrote\\b",
|
||||
"\\bhey\\s+{name}\\b",
|
||||
"\\bthanks?\\s+{name}\\b",
|
||||
"\\bhi\\s+{name}\\b",
|
||||
"\\bdear\\s+{name}\\b"
|
||||
],
|
||||
"pronoun_patterns": [
|
||||
"\\bshe\\b",
|
||||
"\\bher\\b",
|
||||
"\\bhers\\b",
|
||||
"\\bhe\\b",
|
||||
"\\bhim\\b",
|
||||
"\\bhis\\b",
|
||||
"\\bthey\\b",
|
||||
"\\bthem\\b",
|
||||
"\\btheir\\b"
|
||||
],
|
||||
"dialogue_patterns": [
|
||||
"^>\\s*{name}[:\\s]",
|
||||
"^{name}:\\s",
|
||||
"^\\[{name}\\]",
|
||||
"\"{name}\\s+said"
|
||||
],
|
||||
"direct_address_pattern": "\\bhey\\s+{name}\\b|\\bthanks?\\s+{name}\\b|\\bhi\\s+{name}\\b",
|
||||
"project_verb_patterns": [
|
||||
"\\bbuilding\\s+{name}\\b",
|
||||
"\\bbuilt\\s+{name}\\b",
|
||||
"\\bship(?:ping|ped)?\\s+{name}\\b",
|
||||
"\\blaunch(?:ing|ed)?\\s+{name}\\b",
|
||||
"\\bdeploy(?:ing|ed)?\\s+{name}\\b",
|
||||
"\\binstall(?:ing|ed)?\\s+{name}\\b",
|
||||
"\\bthe\\s+{name}\\s+architecture\\b",
|
||||
"\\bthe\\s+{name}\\s+pipeline\\b",
|
||||
"\\bthe\\s+{name}\\s+system\\b",
|
||||
"\\bthe\\s+{name}\\s+repo\\b",
|
||||
"\\b{name}\\s+v\\d+\\b",
|
||||
"\\b{name}\\.py\\b",
|
||||
"\\b{name}-core\\b",
|
||||
"\\b{name}-local\\b",
|
||||
"\\bimport\\s+{name}\\b",
|
||||
"\\bpip\\s+install\\s+{name}\\b"
|
||||
],
|
||||
"stopwords": [
|
||||
"the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
|
||||
"for", "of", "with", "by", "from", "as", "is", "was", "are", "were",
|
||||
"be", "been", "being", "have", "has", "had", "do", "does", "did",
|
||||
"will", "would", "could", "should", "may", "might", "must", "shall", "can",
|
||||
"this", "that", "these", "those", "it", "its", "they", "them", "their",
|
||||
"we", "our", "you", "your", "i", "my", "me", "he", "she", "his", "her",
|
||||
"who", "what", "when", "where", "why", "how", "which",
|
||||
"if", "then", "so", "not", "no", "yes", "ok", "okay",
|
||||
"just", "very", "really", "also", "already", "still", "even", "only",
|
||||
"here", "there", "now", "too", "up", "out", "about", "like",
|
||||
"use", "get", "got", "make", "made", "take", "put", "come", "go", "see",
|
||||
"know", "think", "true", "false", "none", "null", "new", "old", "all", "any", "some",
|
||||
"return", "print", "def", "class", "import",
|
||||
"step", "usage", "run", "check", "find", "add", "set", "list",
|
||||
"args", "dict", "str", "int", "bool", "path", "file", "type", "name",
|
||||
"note", "example", "option", "result", "error", "warning", "info",
|
||||
"every", "each", "more", "less", "next", "last", "first", "second",
|
||||
"stack", "layer", "mode", "test", "stop", "start", "copy", "move",
|
||||
"source", "target", "output", "input", "data", "item", "key", "value",
|
||||
"returns", "raises", "yields", "self", "cls", "kwargs",
|
||||
"world", "well", "want", "topic", "choose", "social", "cars", "phones",
|
||||
"healthcare", "ex", "machina", "deus", "human", "humans", "people",
|
||||
"things", "something", "nothing", "everything", "anything", "someone",
|
||||
"everyone", "anyone", "way", "time", "day", "life", "place", "thing",
|
||||
"part", "kind", "sort", "case", "point", "idea", "fact", "sense",
|
||||
"question", "answer", "reason", "number", "version", "system",
|
||||
"hey", "hi", "hello", "thanks", "thank", "right", "let",
|
||||
"click", "hit", "press", "tap", "drag", "drop", "open", "close",
|
||||
"save", "load", "launch", "install", "download", "upload", "scroll",
|
||||
"select", "enter", "submit", "cancel", "confirm", "delete", "paste",
|
||||
"write", "read", "search", "show", "hide",
|
||||
"desktop", "documents", "downloads", "users", "home", "library",
|
||||
"applications", "preferences", "settings", "terminal",
|
||||
"actor", "vector", "remote", "control", "duration", "fetch",
|
||||
"agents", "tools", "others", "guards", "ethics", "regulation",
|
||||
"learning", "thinking", "memory", "language", "intelligence",
|
||||
"technology", "society", "culture", "future", "history", "science",
|
||||
"model", "models", "network", "networks", "training", "inference"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
"status_palace": "궁전: {path}",
|
||||
"status_wings": "날개 {count}개",
|
||||
"status_closets": "벽장 {count}개",
|
||||
"status_drawers": "서랍 {drawers}개",
|
||||
"status_drawers": "서랍 {count}개",
|
||||
"init_complete": "{path}에 궁전 초기화 완료",
|
||||
"init_exists": "{path}에 궁전이 이미 존재합니다",
|
||||
"repair_complete": "수리 완료. {fixed}개 문제 해결.",
|
||||
|
||||
@@ -0,0 +1,161 @@
|
||||
{
|
||||
"lang": "ru",
|
||||
"label": "Русский",
|
||||
"terms": {
|
||||
"palace": "дворец",
|
||||
"wing": "крыло",
|
||||
"hall": "зал",
|
||||
"closet": "шкаф",
|
||||
"drawer": "ящик",
|
||||
"mine": "раскопка",
|
||||
"search": "поиск",
|
||||
"status": "статус",
|
||||
"init": "создание",
|
||||
"repair": "починка",
|
||||
"migrate": "миграция",
|
||||
"entity": "сущность",
|
||||
"topic": "тема"
|
||||
},
|
||||
"cli": {
|
||||
"mine_start": "Раскопка {path}...",
|
||||
"mine_complete": "Готово. Шкафов: {closets}, ящиков: {drawers}.",
|
||||
"mine_skip": "Уже обработано. Используйте --force для повторной обработки.",
|
||||
"search_no_results": "Нет результатов по запросу: {query}",
|
||||
"search_results": "Найдено результатов: {count}",
|
||||
"status_palace": "Дворец: {path}",
|
||||
"status_wings": "Крыльев: {count}",
|
||||
"status_closets": "Шкафов: {count}",
|
||||
"status_drawers": "Ящиков: {count}",
|
||||
"init_complete": "Дворец создан в {path}",
|
||||
"init_exists": "Дворец уже существует в {path}",
|
||||
"repair_complete": "Починка завершена. Исправлено проблем: {fixed}.",
|
||||
"migrate_complete": "Миграция завершена.",
|
||||
"no_palace": "Дворец не найден. Выполните: mempalace init <директория>"
|
||||
},
|
||||
"aaak": {
|
||||
"instruction": "Сжать до индексного формата. Дефисы между словами, вертикальные черты между понятиями. Убрать предлоги и служебные слова. Имена и числа сохранять точно."
|
||||
},
|
||||
"regex": {
|
||||
"topic_pattern": "[А-ЯЁ][а-яё]{2,}|[A-Z][a-z]{2,}|[A-Za-z][A-Za-z0-9_]{2,}",
|
||||
"stop_words": "это этот эта эти тот та те тех некоторые много каждый другой только такой очень будет может должен надо хорошо также даже потом сейчас уже ещё обратно сделано делать брать давать знать думать хотеть нужно если когда просто правда ладно вообще конечно например значит кстати наверное видимо похоже получается собственно кажется",
|
||||
"quote_pattern": "«\\s*([^»]{10,200})\\s*»|\"([^\"]{10,200})\"",
|
||||
"action_pattern": "(?:построил|исправил|написал|добавил|запустил|протестировал|проверил|создал|удалил|обновил|настроил|развернул|перенёс|собрал)\\s+[\\wа-яёА-ЯЁ\\s]{3,30}"
|
||||
},
|
||||
"entity": {
|
||||
"candidate_pattern": "[А-ЯЁ][а-яё]{1,19}",
|
||||
"multi_word_pattern": "[А-ЯЁ][а-яё]+(?:\\s+[А-ЯЁ][а-яё]+)+",
|
||||
"person_verb_patterns": [
|
||||
"\\b{name}\\s+сказал[аи]?\\b",
|
||||
"\\b{name}\\s+спросил[аи]?\\b",
|
||||
"\\b{name}\\s+ответил[аи]?\\b",
|
||||
"\\b{name}\\s+рассказал[аи]?\\b",
|
||||
"\\b{name}\\s+засмеял(ся|ась|ись)\\b",
|
||||
"\\b{name}\\s+улыбнул(ся|ась|ись)\\b",
|
||||
"\\b{name}\\s+заплакал[аи]?\\b",
|
||||
"\\b{name}\\s+почувствовал[аи]?\\b",
|
||||
"\\b{name}\\s+думает\\b",
|
||||
"\\b{name}\\s+хочет\\b",
|
||||
"\\b{name}\\s+любит\\b",
|
||||
"\\b{name}\\s+ненавидит\\b",
|
||||
"\\b{name}\\s+знает\\b",
|
||||
"\\b{name}\\s+решил[аи]?\\b",
|
||||
"\\b{name}\\s+написал[аи]?\\b"
|
||||
],
|
||||
"pronoun_patterns": [
|
||||
"\\bона\\b",
|
||||
"\\bеё\\b",
|
||||
"\\bей\\b",
|
||||
"\\bон\\b",
|
||||
"\\bего\\b",
|
||||
"\\bему\\b",
|
||||
"\\bони\\b",
|
||||
"\\bих\\b",
|
||||
"\\bим\\b"
|
||||
],
|
||||
"dialogue_patterns": [
|
||||
"^>\\s*{name}[:\\s]",
|
||||
"^{name}:\\s",
|
||||
"^\\[{name}\\]",
|
||||
"\"{name}\\s+сказал"
|
||||
],
|
||||
"direct_address_pattern": "\\bпривет\\s+{name}\\b|\\bспасибо\\s+{name}\\b|\\bздравствуй(те)?\\s+{name}\\b|\\bуважаемый\\s+{name}\\b|\\bуважаемая\\s+{name}\\b|\\bдорогой\\s+{name}\\b|\\bдорогая\\s+{name}\\b",
|
||||
"project_verb_patterns": [
|
||||
"\\bсобираю\\s+{name}\\b",
|
||||
"\\bсобрал\\s+{name}\\b",
|
||||
"\\bзапускаю\\s+{name}\\b",
|
||||
"\\bзапустил\\s+{name}\\b",
|
||||
"\\bразвернул\\s+{name}\\b",
|
||||
"\\bустановил\\s+{name}\\b",
|
||||
"\\bсистема\\s+{name}\\b",
|
||||
"\\bпроект\\s+{name}\\b",
|
||||
"\\bimport\\s+{name}\\b",
|
||||
"\\bpip\\s+install\\s+{name}\\b"
|
||||
],
|
||||
"stopwords": [
|
||||
"привет",
|
||||
"здравствуйте",
|
||||
"спасибо",
|
||||
"пожалуйста",
|
||||
"да",
|
||||
"нет",
|
||||
"может",
|
||||
"наверное",
|
||||
"здесь",
|
||||
"там",
|
||||
"тут",
|
||||
"сейчас",
|
||||
"сегодня",
|
||||
"вчера",
|
||||
"завтра",
|
||||
"всегда",
|
||||
"никогда",
|
||||
"ещё",
|
||||
"тоже",
|
||||
"очень",
|
||||
"мало",
|
||||
"хорошо",
|
||||
"плохо",
|
||||
"так",
|
||||
"потом",
|
||||
"перед",
|
||||
"после",
|
||||
"между",
|
||||
"около",
|
||||
"вместе",
|
||||
"без",
|
||||
"для",
|
||||
"над",
|
||||
"под",
|
||||
"при",
|
||||
"про",
|
||||
"через",
|
||||
"против",
|
||||
"вместо",
|
||||
"кроме",
|
||||
"среди",
|
||||
"вокруг",
|
||||
"вдоль",
|
||||
"ради",
|
||||
"напротив",
|
||||
"благодаря",
|
||||
"согласно",
|
||||
"навстречу",
|
||||
"или",
|
||||
"либо",
|
||||
"но",
|
||||
"однако",
|
||||
"зато",
|
||||
"хотя",
|
||||
"если",
|
||||
"когда",
|
||||
"пока",
|
||||
"чтобы",
|
||||
"потому",
|
||||
"поэтому",
|
||||
"причём",
|
||||
"притом",
|
||||
"будто",
|
||||
"словно"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -1,84 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Quick smoke test for i18n dictionaries + Dialect integration."""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent to path so we can import mempalace
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
|
||||
|
||||
from mempalace.i18n import load_lang, t, available_languages
|
||||
from mempalace.dialect import Dialect
|
||||
|
||||
|
||||
def test_all_languages_load():
    """Check that each available language loads and exposes the mandatory keys.

    For every language: the ``terms``, ``cli`` and ``aaak`` sections exist,
    the four core palace terms are present and non-empty, and ``aaak`` carries
    an ``instruction`` entry.
    """
    mandatory_sections = ("terms", "cli", "aaak")
    mandatory_terms = ("palace", "wing", "closet", "drawer")

    langs = available_languages()
    assert len(langs) >= 7, f"Expected 7+ languages, got {len(langs)}"

    for lang in langs:
        strings = load_lang(lang)

        for section in mandatory_sections:
            assert section in strings, f"{lang}: missing section '{section}'"

        for term in mandatory_terms:
            assert term in strings["terms"], f"{lang}: missing term '{term}'"
            assert len(strings["terms"][term]) > 0, f"{lang}: empty term '{term}'"

        assert "instruction" in strings["aaak"], f"{lang}: missing aaak.instruction"

    print(f" PASS: {len(langs)} languages load correctly")
|
||||
|
||||
|
||||
def test_interpolation():
    """Verify ``cli.mine_complete`` renders both counts in every language."""
    for lang in available_languages():
        # The return value is discarded; presumably load_lang() selects the
        # language that the following t() call reads -- confirm in mempalace.i18n.
        load_lang(lang)

        rendered = t("cli.mine_complete", closets=5, drawers=100)
        assert "5" in rendered, f"{lang}: closets count missing from mine_complete"
        assert "100" in rendered, f"{lang}: drawers count missing from mine_complete"

    print(" PASS: interpolation works for all languages")
|
||||
|
||||
|
||||
def test_dialect_loads_lang():
    """Verify a ``Dialect`` built for each language keeps it and has an AAAK instruction."""
    for lang in available_languages():
        dialect = Dialect(lang=lang)
        assert dialect.lang == lang, f"Expected lang={lang}, got {dialect.lang}"

        instruction_length = len(dialect.aaak_instruction)
        assert instruction_length > 10, f"{lang}: AAAK instruction too short"

    print(" PASS: Dialect loads language instruction for all languages")
|
||||
|
||||
|
||||
def test_dialect_compress_samples():
    """Compress one sample sentence per language and sanity-check the output.

    The output must be non-empty and shorter than twice the input length.
    """
    samples = {
        "en": "We decided to migrate from SQLite to PostgreSQL for better concurrent writes. Ben approved the PR yesterday.",
        "fr": "Nous avons décidé de migrer de SQLite vers PostgreSQL pour une meilleure écriture concurrente. Ben a approuvé le PR hier.",
        "ko": "더 나은 동시 쓰기를 위해 SQLite에서 PostgreSQL로 마이그레이션하기로 했습니다. 벤이 어제 PR을 승인했습니다.",
        "ja": "同時書き込みの改善のため、SQLiteからPostgreSQLに移行することを決定しました。ベンが昨日PRを承認しました。",
        "es": "Decidimos migrar de SQLite a PostgreSQL para mejor escritura concurrente. Ben aprobó el PR ayer.",
        "de": "Wir haben beschlossen, von SQLite auf PostgreSQL zu migrieren für bessere gleichzeitige Schreibvorgänge. Ben hat den PR gestern genehmigt.",
        "zh-CN": "我们决定从SQLite迁移到PostgreSQL以获得更好的并发写入。Ben昨天批准了PR。",
    }

    for lang in samples:
        text = samples[lang]
        compressed = Dialect(lang=lang).compress(text)

        assert len(compressed) > 0, f"{lang}: compression returned empty"
        assert len(compressed) < len(text) * 2, f"{lang}: compression expanded text"

        # Show the size change and a short preview for manual inspection.
        print(f" {lang}: {len(text)} chars → {len(compressed)} chars")
        print(f" {compressed[:80]}")

    print(" PASS: compression works for all sample languages")
|
||||
|
||||
|
||||
# Script entry point: run every smoke test in order; an AssertionError aborts the run.
if __name__ == "__main__":
    print("i18n smoke tests:")
    for smoke_test in (
        test_all_languages_load,
        test_interpolation,
        test_dialect_loads_lang,
        test_dialect_compress_samples,
    ):
        smoke_test()
    print("\nAll tests passed.")
|
||||
@@ -50,7 +50,12 @@ DEFAULT_KG_PATH = os.path.expanduser("~/.mempalace/knowledge_graph.sqlite3")
|
||||
class KnowledgeGraph:
|
||||
def __init__(self, db_path: str = None):
|
||||
self.db_path = db_path or DEFAULT_KG_PATH
|
||||
Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
db_parent = Path(self.db_path).parent
|
||||
db_parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
db_parent.chmod(0o700)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
self._connection = None
|
||||
self._lock = threading.Lock()
|
||||
self._init_db()
|
||||
@@ -99,9 +104,10 @@ class KnowledgeGraph:
|
||||
|
||||
def close(self):
|
||||
"""Close the database connection."""
|
||||
if self._connection is not None:
|
||||
self._connection.close()
|
||||
self._connection = None
|
||||
with self._lock:
|
||||
if self._connection is not None:
|
||||
self._connection.close()
|
||||
self._connection = None
|
||||
|
||||
def _entity_id(self, name: str) -> str:
|
||||
return name.lower().replace(" ", "_").replace("'", "")
|
||||
@@ -260,7 +266,6 @@ class KnowledgeGraph:
|
||||
def query_relationship(self, predicate: str, as_of: str = None):
|
||||
"""Get all triples with a given relationship type."""
|
||||
pred = predicate.lower().replace(" ", "_")
|
||||
conn = self._conn()
|
||||
query = """
|
||||
SELECT t.*, s.name as sub_name, o.name as obj_name
|
||||
FROM triples t
|
||||
@@ -274,45 +279,48 @@ class KnowledgeGraph:
|
||||
params.extend([as_of, as_of])
|
||||
|
||||
results = []
|
||||
for row in conn.execute(query, params).fetchall():
|
||||
results.append(
|
||||
{
|
||||
"subject": row["sub_name"],
|
||||
"predicate": pred,
|
||||
"object": row["obj_name"],
|
||||
"valid_from": row["valid_from"],
|
||||
"valid_to": row["valid_to"],
|
||||
"current": row["valid_to"] is None,
|
||||
}
|
||||
)
|
||||
with self._lock:
|
||||
conn = self._conn()
|
||||
for row in conn.execute(query, params).fetchall():
|
||||
results.append(
|
||||
{
|
||||
"subject": row["sub_name"],
|
||||
"predicate": pred,
|
||||
"object": row["obj_name"],
|
||||
"valid_from": row["valid_from"],
|
||||
"valid_to": row["valid_to"],
|
||||
"current": row["valid_to"] is None,
|
||||
}
|
||||
)
|
||||
return results
|
||||
|
||||
def timeline(self, entity_name: str = None):
|
||||
"""Get all facts in chronological order, optionally filtered by entity."""
|
||||
conn = self._conn()
|
||||
if entity_name:
|
||||
eid = self._entity_id(entity_name)
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT t.*, s.name as sub_name, o.name as obj_name
|
||||
FROM triples t
|
||||
JOIN entities s ON t.subject = s.id
|
||||
JOIN entities o ON t.object = o.id
|
||||
WHERE (t.subject = ? OR t.object = ?)
|
||||
ORDER BY t.valid_from ASC NULLS LAST
|
||||
LIMIT 100
|
||||
""",
|
||||
(eid, eid),
|
||||
).fetchall()
|
||||
else:
|
||||
rows = conn.execute("""
|
||||
SELECT t.*, s.name as sub_name, o.name as obj_name
|
||||
FROM triples t
|
||||
JOIN entities s ON t.subject = s.id
|
||||
JOIN entities o ON t.object = o.id
|
||||
ORDER BY t.valid_from ASC NULLS LAST
|
||||
LIMIT 100
|
||||
""").fetchall()
|
||||
with self._lock:
|
||||
conn = self._conn()
|
||||
if entity_name:
|
||||
eid = self._entity_id(entity_name)
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT t.*, s.name as sub_name, o.name as obj_name
|
||||
FROM triples t
|
||||
JOIN entities s ON t.subject = s.id
|
||||
JOIN entities o ON t.object = o.id
|
||||
WHERE (t.subject = ? OR t.object = ?)
|
||||
ORDER BY t.valid_from ASC NULLS LAST
|
||||
LIMIT 100
|
||||
""",
|
||||
(eid, eid),
|
||||
).fetchall()
|
||||
else:
|
||||
rows = conn.execute("""
|
||||
SELECT t.*, s.name as sub_name, o.name as obj_name
|
||||
FROM triples t
|
||||
JOIN entities s ON t.subject = s.id
|
||||
JOIN entities o ON t.object = o.id
|
||||
ORDER BY t.valid_from ASC NULLS LAST
|
||||
LIMIT 100
|
||||
""").fetchall()
|
||||
|
||||
return [
|
||||
{
|
||||
@@ -329,19 +337,20 @@ class KnowledgeGraph:
|
||||
# ── Stats ─────────────────────────────────────────────────────────────
|
||||
|
||||
def stats(self):
|
||||
conn = self._conn()
|
||||
entities = conn.execute("SELECT COUNT(*) as cnt FROM entities").fetchone()["cnt"]
|
||||
triples = conn.execute("SELECT COUNT(*) as cnt FROM triples").fetchone()["cnt"]
|
||||
current = conn.execute(
|
||||
"SELECT COUNT(*) as cnt FROM triples WHERE valid_to IS NULL"
|
||||
).fetchone()["cnt"]
|
||||
expired = triples - current
|
||||
predicates = [
|
||||
r["predicate"]
|
||||
for r in conn.execute(
|
||||
"SELECT DISTINCT predicate FROM triples ORDER BY predicate"
|
||||
).fetchall()
|
||||
]
|
||||
with self._lock:
|
||||
conn = self._conn()
|
||||
entities = conn.execute("SELECT COUNT(*) as cnt FROM entities").fetchone()["cnt"]
|
||||
triples = conn.execute("SELECT COUNT(*) as cnt FROM triples").fetchone()["cnt"]
|
||||
current = conn.execute(
|
||||
"SELECT COUNT(*) as cnt FROM triples WHERE valid_to IS NULL"
|
||||
).fetchone()["cnt"]
|
||||
expired = triples - current
|
||||
predicates = [
|
||||
r["predicate"]
|
||||
for r in conn.execute(
|
||||
"SELECT DISTINCT predicate FROM triples ORDER BY predicate"
|
||||
).fetchall()
|
||||
]
|
||||
return {
|
||||
"entities": entities,
|
||||
"triples": triples,
|
||||
|
||||
+7
-7
@@ -23,7 +23,7 @@ from collections import defaultdict
|
||||
|
||||
from .config import MempalaceConfig
|
||||
from .palace import get_collection as _get_collection
|
||||
from .searcher import build_where_filter
|
||||
from .searcher import _first_or_empty, build_where_filter
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -272,9 +272,9 @@ class Layer3:
|
||||
except Exception as e:
|
||||
return f"Search error: {e}"
|
||||
|
||||
docs = results["documents"][0]
|
||||
metas = results["metadatas"][0]
|
||||
dists = results["distances"][0]
|
||||
docs = _first_or_empty(results, "documents")
|
||||
metas = _first_or_empty(results, "metadatas")
|
||||
dists = _first_or_empty(results, "distances")
|
||||
|
||||
if not docs:
|
||||
return "No results found."
|
||||
@@ -323,9 +323,9 @@ class Layer3:
|
||||
|
||||
hits = []
|
||||
for doc, meta, dist in zip(
|
||||
results["documents"][0],
|
||||
results["metadatas"][0],
|
||||
results["distances"][0],
|
||||
_first_or_empty(results, "documents"),
|
||||
_first_or_empty(results, "metadatas"),
|
||||
_first_or_empty(results, "distances"),
|
||||
):
|
||||
hits.append(
|
||||
{
|
||||
|
||||
+209
-34
@@ -20,24 +20,57 @@ Tools (maintenance):
|
||||
mempalace_reconnect — force cache invalidation and reconnect after external writes
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import logging
|
||||
import hashlib
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from .config import MempalaceConfig, sanitize_name, sanitize_content
|
||||
from .version import __version__
|
||||
import chromadb
|
||||
from .query_sanitizer import sanitize_query
|
||||
from .searcher import search_memories
|
||||
from .palace_graph import traverse, find_tunnels, graph_stats
|
||||
# --- MCP stdio protection (issue #225) -----------------------------------
|
||||
# The MCP protocol multiplexes JSON-RPC over stdio: stdout MUST carry only
|
||||
# valid JSON-RPC messages, stderr is for human-readable logs. Some
|
||||
# transitive dependencies (chromadb → onnxruntime, posthog telemetry) print
|
||||
# banners and error messages directly to stdout — sometimes at C level —
|
||||
# which breaks Claude Desktop's JSON parser. Redirect stdout → stderr at
|
||||
# both the Python and file-descriptor level before heavy imports, then
|
||||
# restore the real stdout in main() before entering the protocol loop.
|
||||
_REAL_STDOUT = sys.stdout
|
||||
_REAL_STDOUT_FD = None
|
||||
try:
|
||||
_REAL_STDOUT_FD = os.dup(1)
|
||||
os.dup2(2, 1)
|
||||
except (OSError, AttributeError):
|
||||
# Environments without fd-level stdio (embedded interpreters, some test
|
||||
# harnesses). The Python-level redirect below still applies.
|
||||
pass
|
||||
sys.stdout = sys.stderr
|
||||
|
||||
from .knowledge_graph import KnowledgeGraph
|
||||
import argparse # noqa: E402 (deferred until after stdio protection above)
|
||||
import json # noqa: E402
|
||||
import logging # noqa: E402
|
||||
import hashlib # noqa: E402
|
||||
import time # noqa: E402
|
||||
from datetime import datetime # noqa: E402
|
||||
from pathlib import Path # noqa: E402
|
||||
|
||||
from .config import ( # noqa: E402
|
||||
MempalaceConfig,
|
||||
sanitize_kg_value,
|
||||
sanitize_name,
|
||||
sanitize_content,
|
||||
)
|
||||
from .version import __version__ # noqa: E402
|
||||
from .backends.chroma import ChromaBackend, ChromaCollection # noqa: E402
|
||||
from .query_sanitizer import sanitize_query # noqa: E402
|
||||
from .searcher import search_memories # noqa: E402
|
||||
from .palace_graph import ( # noqa: E402
|
||||
traverse,
|
||||
find_tunnels,
|
||||
graph_stats,
|
||||
create_tunnel,
|
||||
list_tunnels,
|
||||
delete_tunnel,
|
||||
follow_tunnels,
|
||||
)
|
||||
|
||||
from .knowledge_graph import KnowledgeGraph # noqa: E402
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(message)s", stream=sys.stderr)
|
||||
logger = logging.getLogger("mempalace_mcp")
|
||||
@@ -88,14 +121,14 @@ try:
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
_WAL_FILE = _WAL_DIR / "write_log.jsonl"
|
||||
# Pre-create WAL file with restricted permissions to avoid race condition
|
||||
if not _WAL_FILE.exists():
|
||||
_WAL_FILE.touch(mode=0o600)
|
||||
else:
|
||||
try:
|
||||
_WAL_FILE.chmod(0o600)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
# Atomically create WAL file with restricted permissions (no TOCTOU race).
|
||||
# os.open with O_CREAT|O_WRONLY and mode 0o600 creates the file if absent
|
||||
# or opens it if present, both in a single syscall.
|
||||
try:
|
||||
_fd = os.open(str(_WAL_FILE), os.O_CREAT | os.O_WRONLY, 0o600)
|
||||
os.close(_fd)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
|
||||
# Keys whose values should be redacted in WAL entries to avoid logging sensitive content
|
||||
_WAL_REDACT_KEYS = frozenset(
|
||||
@@ -169,7 +202,7 @@ def _get_client():
|
||||
mtime_changed = current_mtime != 0.0 and abs(current_mtime - _palace_db_mtime) > 0.01
|
||||
|
||||
if _client_cache is None or inode_changed or mtime_changed:
|
||||
_client_cache = chromadb.PersistentClient(path=_config.palace_path)
|
||||
_client_cache = ChromaBackend.make_client(_config.palace_path)
|
||||
_collection_cache = None
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
@@ -184,13 +217,15 @@ def _get_collection(create=False):
|
||||
try:
|
||||
client = _get_client()
|
||||
if create:
|
||||
_collection_cache = client.get_or_create_collection(
|
||||
_config.collection_name, metadata={"hnsw:space": "cosine"}
|
||||
_collection_cache = ChromaCollection(
|
||||
client.get_or_create_collection(
|
||||
_config.collection_name, metadata={"hnsw:space": "cosine"}
|
||||
)
|
||||
)
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
elif _collection_cache is None:
|
||||
_collection_cache = client.get_collection(_config.collection_name)
|
||||
_collection_cache = ChromaCollection(client.get_collection(_config.collection_name))
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
return _collection_cache
|
||||
@@ -259,7 +294,11 @@ def _sanitize_optional_name(value: str = None, field_name: str = "name") -> str:
|
||||
|
||||
|
||||
def tool_status():
|
||||
col = _get_collection()
|
||||
# Use create=True only when a palace DB already exists on disk -- this
|
||||
# bootstraps the ChromaDB collection on a valid-but-empty palace without
|
||||
# accidentally creating a palace in a non-existent directory (#830).
|
||||
db_exists = os.path.isfile(os.path.join(_config.palace_path, "chroma.sqlite3"))
|
||||
col = _get_collection(create=db_exists)
|
||||
if not col:
|
||||
return _no_palace()
|
||||
count = col.count()
|
||||
@@ -496,6 +535,66 @@ def tool_graph_stats():
|
||||
return graph_stats(col=col)
|
||||
|
||||
|
||||
def tool_create_tunnel(
    source_wing: str,
    source_room: str,
    target_wing: str,
    target_room: str,
    label: str = "",
    source_drawer_id: str = None,
    target_drawer_id: str = None,
):
    """Create an explicit tunnel linking a room in one wing to a room in another.

    Intended for recording that content in one project relates to content in
    a different project (for instance an API design discussion that connects
    to a database schema).

    The four wing/room names are passed through ``sanitize_name`` in order;
    the first one rejected yields ``{"error": <message>}``. ``label`` and the
    optional drawer IDs are forwarded to ``create_tunnel`` unchanged.

    Returns:
        The value returned by ``create_tunnel``, or an error dict as above.
    """
    raw_endpoints = (
        (source_wing, "source_wing"),
        (source_room, "source_room"),
        (target_wing, "target_wing"),
        (target_room, "target_room"),
    )
    try:
        endpoints = [sanitize_name(value, field) for value, field in raw_endpoints]
    except ValueError as e:
        return {"error": str(e)}

    return create_tunnel(
        *endpoints,
        label=label,
        source_drawer_id=source_drawer_id,
        target_drawer_id=target_drawer_id,
    )
|
||||
|
||||
|
||||
def tool_list_tunnels(wing: str = None):
    """Return the explicit cross-wing tunnels, optionally restricted to one wing.

    ``wing`` goes through ``_sanitize_optional_name`` first; if that raises
    ``ValueError`` the result is ``{"error": <message>}`` instead.
    """
    try:
        wing_filter = _sanitize_optional_name(wing, "wing")
    except ValueError as e:
        return {"error": str(e)}
    else:
        return list_tunnels(wing_filter)
|
||||
|
||||
|
||||
def tool_delete_tunnel(tunnel_id: str):
    """Remove the explicit tunnel identified by *tunnel_id*.

    A falsy or non-string ``tunnel_id`` is rejected with
    ``{"error": "tunnel_id is required"}``; otherwise the result of
    ``delete_tunnel`` is returned.
    """
    if tunnel_id and isinstance(tunnel_id, str):
        return delete_tunnel(tunnel_id)
    return {"error": "tunnel_id is required"}
|
||||
|
||||
|
||||
def tool_follow_tunnels(wing: str, room: str):
    """Follow the explicit tunnels leaving a room to the drawers they connect to.

    Both names are validated with ``sanitize_name``; a rejected name yields
    ``{"error": <message>}``. Otherwise the lookup is delegated to
    ``follow_tunnels`` with the collection from ``_get_collection()``.
    """
    try:
        wing, room = sanitize_name(wing, "wing"), sanitize_name(room, "room")
    except ValueError as e:
        return {"error": str(e)}
    return follow_tunnels(wing, room, col=_get_collection())
|
||||
|
||||
|
||||
# ==================== WRITE TOOLS ====================
|
||||
|
||||
|
||||
@@ -740,7 +839,7 @@ def tool_update_drawer(drawer_id: str, content: str = None, wing: str = None, ro
|
||||
def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
|
||||
"""Query the knowledge graph for an entity's relationships."""
|
||||
try:
|
||||
entity = sanitize_name(entity, "entity")
|
||||
entity = sanitize_kg_value(entity, "entity")
|
||||
except ValueError as e:
|
||||
return {"error": str(e)}
|
||||
if direction not in ("outgoing", "incoming", "both"):
|
||||
@@ -754,9 +853,9 @@ def tool_kg_add(
|
||||
):
|
||||
"""Add a relationship to the knowledge graph."""
|
||||
try:
|
||||
subject = sanitize_name(subject, "subject")
|
||||
subject = sanitize_kg_value(subject, "subject")
|
||||
predicate = sanitize_name(predicate, "predicate")
|
||||
object = sanitize_name(object, "object")
|
||||
object = sanitize_kg_value(object, "object")
|
||||
except ValueError as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
@@ -779,9 +878,9 @@ def tool_kg_add(
|
||||
def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = None):
|
||||
"""Mark a fact as no longer true (set end date)."""
|
||||
try:
|
||||
subject = sanitize_name(subject, "subject")
|
||||
subject = sanitize_kg_value(subject, "subject")
|
||||
predicate = sanitize_name(predicate, "predicate")
|
||||
object = sanitize_name(object, "object")
|
||||
object = sanitize_kg_value(object, "object")
|
||||
except ValueError as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
_wal_log(
|
||||
@@ -800,7 +899,7 @@ def tool_kg_timeline(entity: str = None):
|
||||
"""Get chronological timeline of facts, optionally for one entity."""
|
||||
if entity is not None:
|
||||
try:
|
||||
entity = sanitize_name(entity, "entity")
|
||||
entity = sanitize_kg_value(entity, "entity")
|
||||
except ValueError as e:
|
||||
return {"error": str(e)}
|
||||
results = _kg.timeline(entity)
|
||||
@@ -836,7 +935,10 @@ def tool_diary_write(agent_name: str, entry: str, topic: str = "general"):
|
||||
return _no_palace()
|
||||
|
||||
now = datetime.now()
|
||||
entry_id = f"diary_{wing}_{now.strftime('%Y%m%d_%H%M%S')}_{hashlib.sha256(entry[:50].encode()).hexdigest()[:12]}"
|
||||
entry_id = (
|
||||
f"diary_{wing}_{now.strftime('%Y%m%d_%H%M%S%f')}_"
|
||||
f"{hashlib.sha256(entry.encode()).hexdigest()[:12]}"
|
||||
)
|
||||
|
||||
_wal_log(
|
||||
"diary_write",
|
||||
@@ -1181,6 +1283,65 @@ TOOLS = {
|
||||
"input_schema": {"type": "object", "properties": {}},
|
||||
"handler": tool_graph_stats,
|
||||
},
|
||||
"mempalace_create_tunnel": {
|
||||
"description": "Create a cross-wing tunnel linking two palace locations. Use when content in one project relates to another — e.g., an API design in project_api connects to a database schema in project_database.",
|
||||
"input_schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"source_wing": {"type": "string", "description": "Wing of the source"},
|
||||
"source_room": {"type": "string", "description": "Room in the source wing"},
|
||||
"target_wing": {"type": "string", "description": "Wing of the target"},
|
||||
"target_room": {"type": "string", "description": "Room in the target wing"},
|
||||
"label": {"type": "string", "description": "Description of the connection"},
|
||||
"source_drawer_id": {
|
||||
"type": "string",
|
||||
"description": "Optional specific drawer ID",
|
||||
},
|
||||
"target_drawer_id": {
|
||||
"type": "string",
|
||||
"description": "Optional specific drawer ID",
|
||||
},
|
||||
},
|
||||
"required": ["source_wing", "source_room", "target_wing", "target_room"],
|
||||
},
|
||||
"handler": tool_create_tunnel,
|
||||
},
|
||||
"mempalace_list_tunnels": {
|
||||
"description": "List all explicit cross-wing tunnels. Optionally filter by wing.",
|
||||
"input_schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"wing": {
|
||||
"type": "string",
|
||||
"description": "Filter tunnels by wing (shows tunnels where wing is source or target)",
|
||||
},
|
||||
},
|
||||
},
|
||||
"handler": tool_list_tunnels,
|
||||
},
|
||||
"mempalace_delete_tunnel": {
|
||||
"description": "Delete an explicit tunnel by its ID.",
|
||||
"input_schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"tunnel_id": {"type": "string", "description": "Tunnel ID to delete"},
|
||||
},
|
||||
"required": ["tunnel_id"],
|
||||
},
|
||||
"handler": tool_delete_tunnel,
|
||||
},
|
||||
"mempalace_follow_tunnels": {
|
||||
"description": "Follow tunnels from a room to see what it connects to in other wings. Returns connected rooms with drawer previews.",
|
||||
"input_schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"wing": {"type": "string", "description": "Wing to start from"},
|
||||
"room": {"type": "string", "description": "Room to follow tunnels from"},
|
||||
},
|
||||
"required": ["wing", "room"],
|
||||
},
|
||||
"handler": tool_follow_tunnels,
|
||||
},
|
||||
"mempalace_search": {
|
||||
"description": "Semantic search. Returns verbatim drawer content with similarity scores. IMPORTANT: 'query' must contain ONLY search keywords. Use 'context' for background. Results with cosine distance > max_distance are filtered out.",
|
||||
"input_schema": {
|
||||
@@ -1509,7 +1670,21 @@ def handle_request(request):
|
||||
}
|
||||
|
||||
|
||||
def _restore_stdout():
    """Restore real stdout for MCP JSON-RPC output (see issue #225).

    When a saved descriptor is held in ``_REAL_STDOUT_FD`` it is duplicated
    back onto fd 1 and then closed, and the module-level slot is cleared.
    ``sys.stdout`` is always rebound to ``_REAL_STDOUT``. ``OSError`` from
    either descriptor operation is ignored (best effort).
    """
    global _REAL_STDOUT_FD
    saved_fd = _REAL_STDOUT_FD
    if saved_fd is not None:
        # Clear the slot first so a repeated call never reuses a stale fd.
        _REAL_STDOUT_FD = None
        try:
            os.dup2(saved_fd, 1)
        except OSError:
            pass
        finally:
            # Close even when dup2 failed; otherwise the saved fd would leak.
            try:
                os.close(saved_fd)
            except OSError:
                pass
    sys.stdout = _REAL_STDOUT
|
||||
|
||||
|
||||
def main():
|
||||
_restore_stdout()
|
||||
logger.info("MemPalace MCP Server starting...")
|
||||
while True:
|
||||
try:
|
||||
|
||||
+13
-11
@@ -33,13 +33,15 @@ def extract_drawers_from_sqlite(db_path: str) -> list:
|
||||
conn.row_factory = sqlite3.Row
|
||||
|
||||
# Get all embedding IDs and their documents
|
||||
rows = conn.execute("""
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT e.embedding_id,
|
||||
MAX(CASE WHEN em.key = 'chroma:document' THEN em.string_value END) as document
|
||||
FROM embeddings e
|
||||
JOIN embedding_metadata em ON em.id = e.id
|
||||
GROUP BY e.embedding_id
|
||||
""").fetchall()
|
||||
"""
|
||||
).fetchall()
|
||||
|
||||
drawers = []
|
||||
for row in rows:
|
||||
@@ -132,7 +134,7 @@ def confirm_destructive_action(
|
||||
|
||||
def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
|
||||
"""Migrate a palace to the currently installed ChromaDB version."""
|
||||
import chromadb
|
||||
from .backends.chroma import ChromaBackend
|
||||
|
||||
palace_path = os.path.abspath(os.path.expanduser(palace_path))
|
||||
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||
@@ -150,19 +152,19 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
|
||||
|
||||
# Detect version
|
||||
source_version = detect_chromadb_version(db_path)
|
||||
target_version = ChromaBackend.backend_version()
|
||||
print(f" Source: ChromaDB {source_version}")
|
||||
print(f" Target: ChromaDB {chromadb.__version__}")
|
||||
print(f" Target: ChromaDB {target_version}")
|
||||
|
||||
# Try reading with current chromadb first
|
||||
try:
|
||||
client = chromadb.PersistentClient(path=palace_path)
|
||||
col = client.get_collection("mempalace_drawers")
|
||||
col = ChromaBackend().get_collection(palace_path, "mempalace_drawers")
|
||||
count = col.count()
|
||||
print(f"\n Palace is already readable by chromadb {chromadb.__version__}.")
|
||||
print(f"\n Palace is already readable by chromadb {target_version}.")
|
||||
print(f" {count} drawers found. No migration needed.")
|
||||
return True
|
||||
except Exception:
|
||||
print(f"\n Palace is NOT readable by chromadb {chromadb.__version__}.")
|
||||
print(f"\n Palace is NOT readable by chromadb {target_version}.")
|
||||
print(" Extracting from SQLite directly...")
|
||||
|
||||
# Extract all drawers via raw SQL
|
||||
@@ -206,8 +208,8 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
|
||||
|
||||
temp_palace = tempfile.mkdtemp(prefix="mempalace_migrate_")
|
||||
print(f" Creating fresh palace in {temp_palace}...")
|
||||
client = chromadb.PersistentClient(path=temp_palace)
|
||||
col = client.get_or_create_collection("mempalace_drawers")
|
||||
fresh_backend = ChromaBackend()
|
||||
col = fresh_backend.get_or_create_collection(temp_palace, "mempalace_drawers")
|
||||
|
||||
# Re-import in batches
|
||||
batch_size = 500
|
||||
@@ -225,7 +227,7 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
|
||||
# Verify before swapping
|
||||
final_count = col.count()
|
||||
del col
|
||||
del client
|
||||
del fresh_backend
|
||||
|
||||
# Swap: remove old palace, move new one into place
|
||||
print(" Swapping old palace for migrated version...")
|
||||
|
||||
+236
-28
@@ -15,7 +15,17 @@ from pathlib import Path
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
|
||||
from .palace import SKIP_DIRS, get_collection, file_already_mined
|
||||
from .palace import (
|
||||
NORMALIZE_VERSION,
|
||||
SKIP_DIRS,
|
||||
build_closet_lines,
|
||||
file_already_mined,
|
||||
get_closets_collection,
|
||||
get_collection,
|
||||
mine_lock,
|
||||
purge_file_closets,
|
||||
upsert_closet_lines,
|
||||
)
|
||||
|
||||
READABLE_EXTENSIONS = {
|
||||
".txt",
|
||||
@@ -254,16 +264,32 @@ def load_config(project_dir: str) -> dict:
|
||||
"""Load mempalace.yaml from project directory (falls back to mempal.yaml)."""
|
||||
import yaml
|
||||
|
||||
config_path = Path(project_dir).expanduser().resolve() / "mempalace.yaml"
|
||||
resolved_project_dir = Path(project_dir).expanduser().resolve()
|
||||
config_path = resolved_project_dir / "mempalace.yaml"
|
||||
if not config_path.exists():
|
||||
# Fallback to legacy name
|
||||
legacy_path = Path(project_dir).expanduser().resolve() / "mempal.yaml"
|
||||
legacy_path = resolved_project_dir / "mempal.yaml"
|
||||
if legacy_path.exists():
|
||||
config_path = legacy_path
|
||||
else:
|
||||
print(f"ERROR: No mempalace.yaml found in {project_dir}")
|
||||
print(f"Run: mempalace init {project_dir}")
|
||||
sys.exit(1)
|
||||
wing_name = resolved_project_dir.name
|
||||
print(
|
||||
f" No mempalace.yaml found in {resolved_project_dir} "
|
||||
f"— using auto-detected defaults (wing='{wing_name}'). "
|
||||
"Directories with the same basename will share a wing; "
|
||||
"add mempalace.yaml to disambiguate.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return {
|
||||
"wing": wing_name,
|
||||
"rooms": [
|
||||
{
|
||||
"name": "general",
|
||||
"description": "All project files",
|
||||
"keywords": ["general"],
|
||||
}
|
||||
],
|
||||
}
|
||||
with open(config_path) as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
@@ -368,6 +394,143 @@ def chunk_text(content: str, source_file: str) -> list:
|
||||
# =============================================================================
|
||||
|
||||
|
||||
_ENTITY_REGISTRY_PATH = os.path.join(os.path.expanduser("~"), ".mempalace", "known_entities.json")
|
||||
_ENTITY_REGISTRY_CACHE: dict = {"mtime": None, "names": frozenset(), "raw": {}}
|
||||
_ENTITY_EXTRACT_WINDOW = 5000 # chars of content scanned for capitalized words
|
||||
_ENTITY_METADATA_LIMIT = 25 # max entities packed into the metadata field
|
||||
|
||||
|
||||
def _refresh_known_entities_cache() -> None:
    """Bring the module-level entity cache in line with the registry file.

    The file at ``_ENTITY_REGISTRY_PATH`` is re-read only when its mtime
    differs from the one recorded in ``_ENTITY_REGISTRY_CACHE``. Both the
    flat name set (``"names"``) and the category dict (``"raw"``) are
    rebuilt in the same pass. If the file cannot be stat'ed the cache is
    emptied; if it cannot be read or parsed, empty values are cached for
    the current mtime.
    """
    cache = _ENTITY_REGISTRY_CACHE

    try:
        current_mtime = os.path.getmtime(_ENTITY_REGISTRY_PATH)
    except OSError:
        # No registry on disk: drop whatever was cached from an earlier read.
        if cache["mtime"] is not None:
            cache["mtime"] = None
            cache["names"] = frozenset()
            cache["raw"] = {}
        return

    if current_mtime == cache["mtime"]:
        # File unchanged since the last read — nothing to do.
        return

    collected: set = set()
    categories: dict = {}
    try:
        import json

        with open(_ENTITY_REGISTRY_PATH, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        if isinstance(payload, dict):
            categories = payload
            for members in payload.values():
                if isinstance(members, list):
                    # List category: keep every truthy entry as a string.
                    collected.update(str(item) for item in members if item)
                elif isinstance(members, dict):
                    # Dict category: the keys are the entity names.
                    collected.update(str(key) for key in members if key)
    except Exception:
        # Unreadable or malformed registry: cache empty values for this mtime.
        collected, categories = set(), {}

    cache["mtime"] = current_mtime
    cache["names"] = frozenset(collected)
    cache["raw"] = categories
|
||||
|
||||
|
||||
def _load_known_entities() -> frozenset:
    """Return every known entity name, flattened across all categories.

    The registry cache is refreshed first (an mtime-gated reload), so the
    returned set follows changes to the registry file.
    """
    _refresh_known_entities_cache()
    flat_names = _ENTITY_REGISTRY_CACHE["names"]
    return flat_names
|
||||
|
||||
|
||||
def _load_known_entities_raw() -> dict:
    """Full category-dict view of the registry, shape
    ``{"category": ["Name1", ...], ...}``. Cached by mtime.

    Consumed by modules (e.g., fact_checker) that need to reason about
    categories rather than a flat name set. Callers receive a deep copy,
    so mutating the result — including the nested category lists — can
    never alter the module-level cache.
    """
    import copy

    _refresh_known_entities_cache()
    # A shallow dict() copy would still share the inner lists with the cache.
    return copy.deepcopy(_ENTITY_REGISTRY_CACHE["raw"])
|
||||
|
||||
|
||||
_HALL_KEYWORDS_CACHE = None
|
||||
|
||||
|
||||
def detect_hall(content: str) -> str:
    """Route content to a hall based on keyword scoring.

    Halls connect rooms within a wing — they categorize the TYPE of content
    (emotional, technical, family, etc.) while rooms categorize the TOPIC.

    Only the first 3000 characters are scanned, case-insensitively. A hall's
    score is the number of its keywords that occur as substrings of that
    window. The highest-scoring hall wins (the first one listed on a tie);
    ``"general"`` is returned when no keyword matches.
    """
    global _HALL_KEYWORDS_CACHE
    if _HALL_KEYWORDS_CACHE is None:
        # Loaded lazily, once per process.
        from .config import MempalaceConfig

        _HALL_KEYWORDS_CACHE = MempalaceConfig().hall_keywords
    content_lower = content[:3000].lower()

    scores = {}
    for hall, keywords in _HALL_KEYWORDS_CACHE.items():
        # Lower-case the keyword too: the haystack is lower-cased, so a
        # keyword containing an uppercase letter could otherwise never match.
        score = sum(1 for kw in keywords if kw.lower() in content_lower)
        if score > 0:
            scores[hall] = score

    if scores:
        return max(scores, key=scores.get)
    return "general"
|
||||
|
||||
|
||||
def _extract_entities_for_metadata(content: str) -> str:
    """Build the ``entities`` metadata string for a piece of content.

    Two sources are merged: names from the known-entity registry that occur
    as whole tokens anywhere in ``content``, and capitalized words seen at
    least twice within the first ``_ENTITY_EXTRACT_WINDOW`` characters
    (stoplisted sentence-starters excluded).

    Returns the names sorted and joined with ``;``, or ``""`` when nothing
    matched. At most ``_ENTITY_METADATA_LIMIT`` names are kept, and the cut
    is applied to the list before joining so no name is ever truncated.
    """
    import re

    from .palace import _ENTITY_STOPLIST

    # Registry names: whole-token matches over the entire content.
    found: set = {
        known_name
        for known_name in _load_known_entities()
        if re.search(r"(?<!\w)" + re.escape(known_name) + r"(?!\w)", content)
    }

    # Capitalized words: counted only inside the leading window.
    counts: dict = {}
    for word in re.findall(r"\b[A-Z][a-z]{2,}\b", content[:_ENTITY_EXTRACT_WINDOW]):
        if word not in _ENTITY_STOPLIST:
            counts[word] = counts.get(word, 0) + 1
    # The regex already guarantees three or more letters per word.
    found.update(word for word, seen in counts.items() if seen >= 2)

    if not found:
        return ""
    # Cap the list, not the joined string.
    return ";".join(sorted(found)[:_ENTITY_METADATA_LIMIT])
|
||||
|
||||
|
||||
def add_drawer(
|
||||
collection, wing: str, room: str, content: str, source_file: str, chunk_index: int, agent: str
|
||||
):
|
||||
@@ -381,12 +544,19 @@ def add_drawer(
|
||||
"chunk_index": chunk_index,
|
||||
"added_by": agent,
|
||||
"filed_at": datetime.now().isoformat(),
|
||||
"normalize_version": NORMALIZE_VERSION,
|
||||
}
|
||||
# Store file mtime so we can detect modifications later.
|
||||
try:
|
||||
metadata["source_mtime"] = os.path.getmtime(source_file)
|
||||
except OSError:
|
||||
pass
|
||||
# Tag with hall for graph connectivity within wings
|
||||
metadata["hall"] = detect_hall(content)
|
||||
# Tag with entity names for filterable search
|
||||
entities = _extract_entities_for_metadata(content)
|
||||
if entities:
|
||||
metadata["entities"] = entities
|
||||
collection.upsert(
|
||||
documents=[content],
|
||||
ids=[drawer_id],
|
||||
@@ -410,6 +580,7 @@ def process_file(
|
||||
rooms: list,
|
||||
agent: str,
|
||||
dry_run: bool,
|
||||
closets_col=None,
|
||||
) -> tuple:
|
||||
"""Read, chunk, route, and file one file. Returns (drawer_count, room_name)."""
|
||||
|
||||
@@ -434,29 +605,63 @@ def process_file(
|
||||
print(f" [DRY RUN] {filepath.name} → room:{room} ({len(chunks)} drawers)")
|
||||
return len(chunks), room
|
||||
|
||||
# Purge stale drawers for this file before re-inserting the fresh chunks.
|
||||
# Converts modified-file re-mines from upsert-over-existing-IDs (which hits
|
||||
# hnswlib's thread-unsafe updatePoint path and can segfault on macOS ARM
|
||||
# with chromadb 0.6.3) into a clean delete+insert, bypassing the update
|
||||
# path entirely.
|
||||
try:
|
||||
collection.delete(where={"source_file": source_file})
|
||||
except Exception:
|
||||
pass
|
||||
# Lock this file so concurrent agents don't interleave delete+insert.
|
||||
# Without the lock, two agents can both pass file_already_mined(),
|
||||
# both delete, and both insert — creating duplicates or losing data.
|
||||
with mine_lock(source_file):
|
||||
# Re-check after acquiring lock — another agent may have just finished
|
||||
if file_already_mined(collection, source_file, check_mtime=True):
|
||||
return 0, room
|
||||
|
||||
drawers_added = 0
|
||||
for chunk in chunks:
|
||||
added = add_drawer(
|
||||
collection=collection,
|
||||
wing=wing,
|
||||
room=room,
|
||||
content=chunk["content"],
|
||||
source_file=source_file,
|
||||
chunk_index=chunk["chunk_index"],
|
||||
agent=agent,
|
||||
)
|
||||
if added:
|
||||
drawers_added += 1
|
||||
# Purge stale drawers for this file before re-inserting the fresh chunks.
|
||||
# Converts modified-file re-mines from upsert-over-existing-IDs (which hits
|
||||
# hnswlib's thread-unsafe updatePoint path and can segfault on macOS ARM
|
||||
# with chromadb 0.6.3) into a clean delete+insert, bypassing the update
|
||||
# path entirely.
|
||||
try:
|
||||
collection.delete(where={"source_file": source_file})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
drawers_added = 0
|
||||
for chunk in chunks:
|
||||
added = add_drawer(
|
||||
collection=collection,
|
||||
wing=wing,
|
||||
room=room,
|
||||
content=chunk["content"],
|
||||
source_file=source_file,
|
||||
chunk_index=chunk["chunk_index"],
|
||||
agent=agent,
|
||||
)
|
||||
if added:
|
||||
drawers_added += 1
|
||||
|
||||
# Build closet — the searchable index pointing to these drawers.
|
||||
# Purge first: a re-mine (mtime change or normalize_version bump) must
|
||||
# fully replace the prior closets, not append to them.
|
||||
if closets_col and drawers_added > 0:
|
||||
drawer_ids = [
|
||||
f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}"
|
||||
for c in chunks
|
||||
]
|
||||
closet_lines = build_closet_lines(source_file, drawer_ids, content, wing, room)
|
||||
closet_id_base = (
|
||||
f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}"
|
||||
)
|
||||
entities = _extract_entities_for_metadata(content)
|
||||
closet_meta = {
|
||||
"wing": wing,
|
||||
"room": room,
|
||||
"source_file": source_file,
|
||||
"drawer_count": drawers_added,
|
||||
"filed_at": datetime.now().isoformat(),
|
||||
"normalize_version": NORMALIZE_VERSION,
|
||||
}
|
||||
if entities:
|
||||
closet_meta["entities"] = entities
|
||||
purge_file_closets(closets_col, source_file)
|
||||
upsert_closet_lines(closets_col, closet_id_base, closet_lines, closet_meta)
|
||||
|
||||
return drawers_added, room
|
||||
|
||||
@@ -578,8 +783,10 @@ def mine(
|
||||
|
||||
if not dry_run:
|
||||
collection = get_collection(palace_path)
|
||||
closets_col = get_closets_collection(palace_path)
|
||||
else:
|
||||
collection = None
|
||||
closets_col = None
|
||||
|
||||
total_drawers = 0
|
||||
files_skipped = 0
|
||||
@@ -594,6 +801,7 @@ def mine(
|
||||
rooms=rooms,
|
||||
agent=agent,
|
||||
dry_run=dry_run,
|
||||
closets_col=closets_col,
|
||||
)
|
||||
if drawers == 0 and not dry_run:
|
||||
files_skipped += 1
|
||||
|
||||
+113
-7
@@ -16,9 +16,98 @@ No API key. No internet. Everything local.
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Provenance footer appended to Slack transcript output so downstream consumers
|
||||
# know the speaker roles are positionally assigned, not verified.
|
||||
_SLACK_PROVENANCE_FOOTER = (
|
||||
"\n[source: slack-export | multi-party chat — speaker roles are positional, not verified]"
|
||||
)
|
||||
|
||||
|
||||
# ─── Noise stripping ─────────────────────────────────────────────────────
|
||||
# Claude Code and other tools inject system tags, hook output, and UI chrome
|
||||
# into transcripts. These waste drawer space and pollute search results.
|
||||
#
|
||||
# Verbatim is sacred — every pattern here is anchored to line boundaries and
|
||||
# refuses to cross blank lines, so a stray unclosed tag in one message can
|
||||
# never eat content from neighboring messages. When in doubt, leave text
|
||||
# alone.
|
||||
|
||||
_NOISE_TAGS = (
|
||||
"system-reminder",
|
||||
"command-message",
|
||||
"command-name",
|
||||
"task-notification",
|
||||
"user-prompt-submit-hook",
|
||||
"hook_output",
|
||||
)
|
||||
|
||||
|
||||
def _tag_pattern(name: str) -> "re.Pattern[str]":
    """Compile the regex that removes one ``<name>…</name>`` noise block.

    The opening tag must begin a line (optionally after a ``> `` blockquote
    marker, since _messages_to_transcript prefixes lines with ``> ``). The
    body is lazy but forbidden from crossing a blank line, so a dangling
    open tag can't span multiple messages. The closing tag eats optional
    trailing spaces/tabs plus one newline.

    ``name`` is matched literally: it is escaped before interpolation, the
    same way the noise line prefixes are.
    """
    tag = re.escape(name)
    return re.compile(
        rf"(?m)^(?:> )?<{tag}(?:\s[^>]*)?>"
        rf"(?:(?!\n\s*\n)[\s\S])*?"
        rf"</{tag}>[ \t]*\n?"
    )
|
||||
|
||||
|
||||
_NOISE_TAG_PATTERNS = [_tag_pattern(t) for t in _NOISE_TAGS]
|
||||
|
||||
# Strings that identify an entire noise line when found at its start.
|
||||
# Matched case-sensitively and anchored to line-start so user prose mentioning
|
||||
# e.g. "current time:" in a sentence is untouched.
|
||||
_NOISE_LINE_PREFIXES = (
|
||||
"CURRENT TIME:",
|
||||
"VERIFIED FACTS (do not contradict)",
|
||||
"AGENT SPECIALIZATION:",
|
||||
"Checking verified facts...",
|
||||
"Injecting timestamp...",
|
||||
"Starting background pipeline...",
|
||||
"Checking emotional weights...",
|
||||
"Auto-save reminder...",
|
||||
"Checking pipeline...",
|
||||
"MemPalace auto-save checkpoint.",
|
||||
)
|
||||
|
||||
_NOISE_LINE_PATTERNS = [
|
||||
re.compile(rf"(?m)^(?:> )?{re.escape(p)}.*\n?") for p in _NOISE_LINE_PREFIXES
|
||||
]
|
||||
|
||||
# Claude Code TUI hook-run chrome, e.g. "Ran 2 Stop hook", "Ran 1 PreCompact hook".
|
||||
# Line-anchored, case-sensitive, explicit hook names — prose like
|
||||
# "our CI has a stop hook" stays intact.
|
||||
_HOOK_LINE_RE = re.compile(
|
||||
r"(?m)^(?:> )?Ran \d+ (?:Stop|PreCompact|PreToolUse|PostToolUse|UserPromptSubmit|Notification|SessionStart|SessionEnd) hook[s]?.*\n?"
|
||||
)
|
||||
|
||||
# "… +N lines" collapsed-output marker, line-anchored.
|
||||
_COLLAPSED_LINES_RE = re.compile(r"(?m)^(?:> )?…\s*\+\d+ lines.*\n?")
|
||||
|
||||
|
||||
def strip_noise(text: str) -> str:
    """Strip system tags, hook output, and Claude Code UI chrome from text.

    The tag, line-prefix, hook-line and collapsed-lines patterns are applied
    in that order. They are line-anchored, so user prose that merely mentions
    these strings inline (e.g., documenting them) is preserved verbatim.
    The result is returned with surrounding whitespace stripped.
    """
    line_anchored = (
        *_NOISE_TAG_PATTERNS,
        *_NOISE_LINE_PATTERNS,
        _HOOK_LINE_RE,
        _COLLAPSED_LINES_RE,
    )
    cleaned = text
    for noise_re in line_anchored:
        cleaned = noise_re.sub("", cleaned)

    # Collapsed-output chrome "[N tokens] (ctrl+o to expand)". Narrow shape:
    # a bare "(ctrl+o to expand)" in user prose is left alone.
    cleaned = re.sub(r"\s*\[\d+\s+tokens?\]\s*\(ctrl\+o to expand\)", "", cleaned)

    # Removals can leave long runs of blank lines; cap them at three newlines.
    cleaned = re.sub(r"\n{4,}", "\n\n\n", cleaned)
    return cleaned.strip()
|
||||
|
||||
|
||||
def normalize(filepath: str) -> str:
|
||||
"""
|
||||
@@ -40,12 +129,14 @@ def normalize(filepath: str) -> str:
|
||||
if not content.strip():
|
||||
return content
|
||||
|
||||
# Already has > markers — pass through
|
||||
# Already has > markers — pass through unchanged.
|
||||
lines = content.split("\n")
|
||||
if sum(1 for line in lines if line.strip().startswith(">")) >= 3:
|
||||
return content
|
||||
|
||||
# Try JSON normalization
|
||||
# Try JSON normalization. strip_noise is applied inside the Claude Code
|
||||
# JSONL parser (the only format that injects system tags/hook chrome);
|
||||
# other formats pass through verbatim.
|
||||
ext = Path(filepath).suffix.lower()
|
||||
if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["):
|
||||
normalized = _try_normalize_json(content)
|
||||
@@ -112,6 +203,10 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]:
|
||||
isinstance(b, dict) and b.get("type") == "tool_result" for b in msg_content
|
||||
)
|
||||
text = _extract_content(msg_content, tool_use_map=tool_use_map)
|
||||
# Strip Claude Code system-injected noise per message, never across
|
||||
# message boundaries — prevents span-eating.
|
||||
if text:
|
||||
text = strip_noise(text)
|
||||
if text:
|
||||
if is_tool_only and messages and messages[-1][0] == "assistant":
|
||||
# Append tool results to the previous assistant message
|
||||
@@ -121,6 +216,8 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]:
|
||||
messages.append(("user", text))
|
||||
elif msg_type == "assistant":
|
||||
text = _extract_content(msg_content, tool_use_map=tool_use_map)
|
||||
if text:
|
||||
text = strip_noise(text)
|
||||
if text:
|
||||
# If previous message is also assistant (multi-turn tool loop),
|
||||
# merge into the same assistant turn
|
||||
@@ -276,8 +373,13 @@ def _try_chatgpt_json(data) -> Optional[str]:
|
||||
def _try_slack_json(data) -> Optional[str]:
|
||||
"""
|
||||
Slack channel export: [{"type": "message", "user": "...", "text": "..."}]
|
||||
Optimized for 2-person DMs. In channels with 3+ people, alternating
|
||||
speakers are labeled user/assistant to preserve the exchange structure.
|
||||
|
||||
Slack exports are multi-party chats where no speaker is inherently the
|
||||
"user" or "assistant". To preserve exchange-pair chunking (which relies
|
||||
on ``>`` markers from the ``user`` role), we still alternate roles, but
|
||||
prefix each message with the speaker ID so downstream consumers can
|
||||
distinguish the original author. A provenance footer marks the
|
||||
transcript as a Slack import.
|
||||
"""
|
||||
if not isinstance(data, list):
|
||||
return None
|
||||
@@ -287,7 +389,10 @@ def _try_slack_json(data) -> Optional[str]:
|
||||
for item in data:
|
||||
if not isinstance(item, dict) or item.get("type") != "message":
|
||||
continue
|
||||
user_id = item.get("user", item.get("username", ""))
|
||||
raw_user_id = item.get("user", item.get("username", ""))
|
||||
# Sanitize speaker ID: strip brackets, newlines, and control chars
|
||||
# to prevent chunk-boundary injection via crafted exports
|
||||
user_id = re.sub(r"[\[\]\n\r\x00-\x1f]", "_", raw_user_id).strip()
|
||||
text = item.get("text", "").strip()
|
||||
if not text or not user_id:
|
||||
continue
|
||||
@@ -300,9 +405,10 @@ def _try_slack_json(data) -> Optional[str]:
|
||||
else:
|
||||
seen_users[user_id] = "user"
|
||||
last_role = seen_users[user_id]
|
||||
messages.append((seen_users[user_id], text))
|
||||
# Prefix with speaker ID so the original author is preserved
|
||||
messages.append((seen_users[user_id], f"[{user_id}] {text}"))
|
||||
if len(messages) >= 2:
|
||||
return _messages_to_transcript(messages)
|
||||
return _messages_to_transcript(messages) + _SLACK_PROVENANCE_FOOTER
|
||||
return None
|
||||
|
||||
|
||||
|
||||
+244
-4
@@ -4,6 +4,8 @@ palace.py — Shared palace operations.
|
||||
Consolidates collection access patterns used by both miners and the MCP server.
|
||||
"""
|
||||
|
||||
import contextlib
|
||||
import hashlib
|
||||
import os
|
||||
|
||||
from .backends.chroma import ChromaBackend
|
||||
@@ -36,6 +38,16 @@ SKIP_DIRS = {
|
||||
|
||||
_DEFAULT_BACKEND = ChromaBackend()
|
||||
|
||||
# Schema version for drawer normalization. Bump when the normalization
|
||||
# pipeline changes in a way that existing drawers should be rebuilt to pick up
|
||||
# (e.g., new noise-stripping rules). `file_already_mined` treats drawers with
|
||||
# a missing or stale `normalize_version` as "not mined", so the next mine pass
|
||||
# silently rebuilds them — users don't need to manually erase + re-mine.
|
||||
#
|
||||
# v2 (2026-04): introduced strip_noise() for Claude Code JSONL; previous
|
||||
# drawers stored system tags / hook chrome verbatim.
|
||||
NORMALIZE_VERSION = 2
|
||||
|
||||
|
||||
def get_collection(
|
||||
palace_path: str,
|
||||
@@ -50,19 +62,247 @@ def get_collection(
|
||||
)
|
||||
|
||||
|
||||
def get_closets_collection(palace_path: str, create: bool = True):
    """Return the closets collection (the searchable index layer).

    Thin wrapper over ``get_collection`` that fixes the collection name to
    ``mempalace_closets`` and forwards ``create`` unchanged.
    """
    closets_name = "mempalace_closets"
    return get_collection(palace_path, collection_name=closets_name, create=create)
|
||||
|
||||
|
||||
CLOSET_CHAR_LIMIT = 1500 # fill closet until ~1500 chars, then start a new one
|
||||
CLOSET_EXTRACT_WINDOW = 5000 # how many chars of source content to scan for entities/topics
|
||||
|
||||
# Common capitalized words that look like proper nouns but are usually
|
||||
# sentence-starters or filler. Filtered out of entity extraction.
|
||||
_ENTITY_STOPLIST = frozenset(
|
||||
{
|
||||
"The",
|
||||
"This",
|
||||
"That",
|
||||
"These",
|
||||
"Those",
|
||||
"When",
|
||||
"Where",
|
||||
"What",
|
||||
"Why",
|
||||
"Who",
|
||||
"Which",
|
||||
"How",
|
||||
"After",
|
||||
"Before",
|
||||
"Then",
|
||||
"Now",
|
||||
"Here",
|
||||
"There",
|
||||
"And",
|
||||
"But",
|
||||
"Or",
|
||||
"Yet",
|
||||
"So",
|
||||
"If",
|
||||
"Else",
|
||||
"Yes",
|
||||
"No",
|
||||
"Maybe",
|
||||
"Okay",
|
||||
"User",
|
||||
"Assistant",
|
||||
"System",
|
||||
"Tool",
|
||||
"Monday",
|
||||
"Tuesday",
|
||||
"Wednesday",
|
||||
"Thursday",
|
||||
"Friday",
|
||||
"Saturday",
|
||||
"Sunday",
|
||||
"January",
|
||||
"February",
|
||||
"March",
|
||||
"April",
|
||||
"May",
|
||||
"June",
|
||||
"July",
|
||||
"August",
|
||||
"September",
|
||||
"October",
|
||||
"November",
|
||||
"December",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def build_closet_lines(source_file, drawer_ids, content, wing, room):
    """Build compact closet pointer lines from drawer content.

    Returns a LIST of lines (not joined). Each line is one complete topic
    pointer — never split across closets — and is guaranteed to contain no
    embedded newline.

    Format: topic|entities|→drawer_ids

    Only the first ``CLOSET_EXTRACT_WINDOW`` characters of ``content`` are
    scanned, and every line points at the first three ``drawer_ids``. Up to
    12 topics (action-verb phrases and markdown headers, lower-cased and
    de-duplicated) and up to 3 double-quoted passages are emitted. When none
    are found, a single ``wing/room/<file stem>`` line is returned instead.
    """
    import re
    from pathlib import Path

    def _one_line(text):
        """Collapse every whitespace run (newlines included) to one space."""
        return " ".join(text.split())

    drawer_ref = ",".join(drawer_ids[:3])
    window = content[:CLOSET_EXTRACT_WINDOW]

    # Extract proper nouns (capitalized words, 2+ occurrences). Filter out
    # common sentence-starters that aren't real entities.
    word_freq = {}
    for w in re.findall(r"\b[A-Z][a-z]{2,}\b", window):
        if w in _ENTITY_STOPLIST:
            continue
        word_freq[w] = word_freq.get(w, 0) + 1
    entities = sorted(
        [w for w, c in word_freq.items() if c >= 2],
        key=lambda w: -word_freq[w],
    )[:5]
    entity_str = ";".join(entities)

    # Extract key phrases — action verbs + context
    topics = re.findall(
        r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated|reviewed|deployed|configured|removed|updated)\s+[\w\s]{3,40}",
        window,
        re.IGNORECASE,
    )
    # Also grab section headers if present
    topics.extend(re.findall(r"^#{1,3}\s+(.{5,60})$", window, re.MULTILINE))
    # The phrase regex allows \s, so a match may span a line break. Flatten
    # whitespace so each pointer stays on one physical line, drop anything
    # that ends up empty, then dedupe preserving order.
    flattened = (_one_line(t).lower() for t in topics)
    topics = list(dict.fromkeys(t for t in flattened if t))[:12]

    # Extract quotes (these may also contain newlines; flatten them too)
    quotes = [
        q for q in (_one_line(raw) for raw in re.findall(r'"([^"]{15,150})"', window)) if q
    ]

    # Build pointer lines — each one is atomic, never split
    lines = [f"{topic}|{entity_str}|→{drawer_ref}" for topic in topics]
    lines.extend(f'"{quote}"|{entity_str}|→{drawer_ref}' for quote in quotes[:3])

    # Always have at least one line
    if not lines:
        name = Path(source_file).stem[:40]
        lines.append(f"{wing}/{room}/{name}|{entity_str}|→{drawer_ref}")

    return lines
|
||||
|
||||
|
||||
def purge_file_closets(closets_col, source_file: str) -> None:
    """Remove all closets whose ``source_file`` metadata equals ``source_file``.

    Intended to run before ``upsert_closet_lines`` on a re-mine, so closets
    left over from an earlier pass don't survive in the collection. The
    delete is best-effort: any ``Exception`` it raises is suppressed.
    """
    with contextlib.suppress(Exception):
        closets_col.delete(where={"source_file": source_file})
|
||||
|
||||
|
||||
def upsert_closet_lines(closets_col, closet_id_base, lines, metadata):
    """Pack ``lines`` into numbered closets and upsert each one.

    Lines are never split: whole lines are appended to the current closet
    until the next one would push its size (each line counted as
    ``len(line) + 1`` for the joining newline) past ``CLOSET_CHAR_LIMIT``,
    at which point a new closet is started. A single line longer than the
    limit still gets a closet of its own.

    Closet IDs are ``f"{closet_id_base}_{n:02d}"`` with ``n`` starting at 1,
    and every closet is written with the same ``metadata``. Because only the
    IDs produced by this call are overwritten, callers re-mining a file
    should run ``purge_file_closets`` first so higher-numbered closets from
    a larger earlier run do not linger.

    Returns:
        The number of closets written (0 when ``lines`` is empty).
    """

    def _packed_groups():
        # Lazily yield each full group so the upsert for closet N happens
        # before any later line is inspected.
        group: list = []
        used = 0
        for entry in lines:
            cost = len(entry) + 1  # +1 for newline
            if used > 0 and used + cost > CLOSET_CHAR_LIMIT:
                yield group
                group, used = [], 0
            group.append(entry)
            used += cost
        if group:
            yield group

    written = 0
    for number, group in enumerate(_packed_groups(), start=1):
        closets_col.upsert(
            documents=["\n".join(group)],
            ids=[f"{closet_id_base}_{number:02d}"],
            metadatas=[metadata],
        )
        written += 1
    return written
|
||||
|
||||
|
||||
@contextlib.contextmanager
def mine_lock(source_file: str):
    """Hold an exclusive lock file keyed on ``source_file`` for the ``with`` body.

    The lock file lives in ``~/.mempalace/locks`` and is named after the
    first 16 hex characters of the SHA-256 of ``source_file``. On Windows
    (``os.name == "nt"``) one byte is locked with ``msvcrt.locking``;
    elsewhere ``fcntl.flock`` takes an exclusive lock.

    Serializing on this lock keeps concurrent miners from interleaving
    their delete+insert cycles for the same file.
    """
    locks_root = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
    os.makedirs(locks_root, exist_ok=True)
    digest = hashlib.sha256(source_file.encode()).hexdigest()[:16]
    lock_path = os.path.join(locks_root, digest + ".lock")

    handle = open(lock_path, "w")
    try:
        if os.name == "nt":
            import msvcrt

            msvcrt.locking(handle.fileno(), msvcrt.LK_LOCK, 1)
        else:
            import fcntl

            fcntl.flock(handle, fcntl.LOCK_EX)
        yield
    finally:
        # Unlocking is best effort; the handle is closed afterwards either way.
        try:
            if os.name == "nt":
                import msvcrt

                msvcrt.locking(handle.fileno(), msvcrt.LK_UNLCK, 1)
            else:
                import fcntl

                fcntl.flock(handle, fcntl.LOCK_UN)
        except Exception:
            pass
        handle.close()
|
||||
|
||||
|
||||
def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool:
|
||||
"""Check if a file has already been filed in the palace.
|
||||
|
||||
When check_mtime=True (used by project miner), returns False if the file
|
||||
has been modified since it was last mined, so it gets re-mined.
|
||||
When check_mtime=False (used by convo miner), just checks existence.
|
||||
Returns False (so the file gets re-mined) when:
|
||||
- no drawers exist for this source_file
|
||||
- the stored `normalize_version` is missing or older than the current
|
||||
schema (triggers silent rebuild after a normalization upgrade)
|
||||
- `check_mtime=True` and the file's mtime differs from the stored one
|
||||
|
||||
When check_mtime=True (used by project miner), also re-mines on content
|
||||
change. When check_mtime=False (used by convo miner), transcripts are
|
||||
assumed immutable, so only the version gate triggers a rebuild.
|
||||
"""
|
||||
try:
|
||||
results = collection.get(where={"source_file": source_file}, limit=1)
|
||||
if not results.get("ids"):
|
||||
return False
|
||||
stored_meta = results.get("metadatas", [{}])[0] or {}
|
||||
# Pre-v2 drawers have no version field — treat them as stale.
|
||||
stored_version = stored_meta.get("normalize_version", 1)
|
||||
if stored_version < NORMALIZE_VERSION:
|
||||
return False
|
||||
if check_mtime:
|
||||
stored_meta = results.get("metadatas", [{}])[0]
|
||||
stored_mtime = stored_meta.get("source_mtime")
|
||||
if stored_mtime is None:
|
||||
return False
|
||||
|
||||
+230
-1
@@ -15,10 +15,15 @@ Enables queries like:
|
||||
No external graph DB needed — built from ChromaDB metadata.
|
||||
"""
|
||||
|
||||
from collections import defaultdict, Counter
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from .config import MempalaceConfig
|
||||
from .palace import get_collection as _get_palace_collection
|
||||
from .palace import mine_lock
|
||||
|
||||
|
||||
def _get_collection(config=None):
|
||||
@@ -228,3 +233,227 @@ def _fuzzy_match(query: str, nodes: dict, n: int = 5):
|
||||
scored.append((room, 0.5))
|
||||
scored.sort(key=lambda x: -x[1])
|
||||
return [r for r, _ in scored[:n]]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# EXPLICIT TUNNELS — agent-created cross-wing links
|
||||
# =============================================================================
|
||||
# Passive tunnels are discovered from shared room names across wings.
|
||||
# Explicit tunnels are created by agents when they notice a connection
|
||||
# between two specific drawers or rooms in different wings/projects.
|
||||
#
|
||||
# Stored as a JSON file at ~/.mempalace/tunnels.json so they persist
|
||||
# across palace rebuilds (not in ChromaDB which can be recreated).
|
||||
|
||||
|
||||
_TUNNEL_FILE = os.path.join(os.path.expanduser("~"), ".mempalace", "tunnels.json")
|
||||
|
||||
|
||||
def _load_tunnels():
    """Read the explicit-tunnel list from ``_TUNNEL_FILE``.

    Returns an empty list when the file is missing, cannot be read or
    parsed as JSON, or parses to something other than a list.
    """
    try:
        with open(_TUNNEL_FILE, "r", encoding="utf-8") as handle:
            loaded = json.load(handle)
    except Exception:
        # Covers a missing file as well as unreadable / corrupt content.
        return []
    if isinstance(loaded, list):
        return loaded
    return []
|
||||
|
||||
|
||||
def _save_tunnels(tunnels):
    """Persist explicit tunnels atomically.

    Serializes ``tunnels`` to ``<_TUNNEL_FILE>.tmp`` and then moves it into
    place with ``os.replace``, so a crash mid-write cannot leave a partial
    ``tunnels.json`` behind. The parent directory is created if needed.

    If serialization or the final replace fails, the temporary file is
    removed before the exception propagates, so no partial ``.tmp`` file is
    left on disk and the previous ``tunnels.json`` stays untouched.

    Args:
        tunnels: JSON-serializable list of tunnel records.

    Raises:
        TypeError / ValueError: if ``tunnels`` is not JSON-serializable.
        OSError: if the file cannot be written or replaced.
    """
    os.makedirs(os.path.dirname(_TUNNEL_FILE), exist_ok=True)
    tmp_path = _TUNNEL_FILE + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(tunnels, f, indent=2)
            f.flush()
            try:
                os.fsync(f.fileno())
            except OSError:
                # Not all filesystems (or Windows file handles) support fsync — tolerate.
                pass
        os.replace(tmp_path, _TUNNEL_FILE)
    except BaseException:
        # Clean up the partial temp file, then let the original error surface.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
|
||||
|
||||
|
||||
def _endpoint_key(wing: str, room: str) -> str:
    """Return the ``"<wing>/<room>"`` string identifying one tunnel endpoint."""
    return "{}/{}".format(wing, room)
|
||||
|
||||
|
||||
def _canonical_tunnel_id(
    source_wing: str, source_room: str, target_wing: str, target_room: str
) -> str:
    """Return an order-independent 16-hex-character ID for a tunnel.

    The two endpoint keys are sorted before hashing, so swapping source and
    target yields the same ID. That is what lets ``create_tunnel(A, B)`` and
    ``create_tunnel(B, A)`` resolve to a single record.
    """
    endpoints = [
        _endpoint_key(source_wing, source_room),
        _endpoint_key(target_wing, target_room),
    ]
    endpoints.sort()
    canonical = "↔".join(endpoints)
    return hashlib.sha256(canonical.encode()).hexdigest()[:16]
|
||||
|
||||
|
||||
def _require_name(value: str, field: str) -> str:
    """Return ``value`` stripped of surrounding whitespace.

    Raises:
        ValueError: if ``value`` is not a string or is empty / whitespace
            only; ``field`` names the offending argument in the message.
    """
    cleaned = value.strip() if isinstance(value, str) else ""
    if not cleaned:
        raise ValueError(f"{field} must be a non-empty string")
    return cleaned
|
||||
|
||||
|
||||
def create_tunnel(
    source_wing: str,
    source_room: str,
    target_wing: str,
    target_room: str,
    label: str = "",
    source_drawer_id: str = None,
    target_drawer_id: str = None,
):
    """Create or refresh an explicit (symmetric) tunnel between two rooms.

    The tunnel ID is order-independent, so ``create_tunnel(A, B)`` and
    ``create_tunnel(B, A)`` address the same record. When a record with that
    ID already exists it is replaced in place with the values from this
    call (endpoint order, label, and whichever drawer IDs were passed); its
    original ``created_at`` is kept and an ``updated_at`` timestamp is added.
    Otherwise a new record is appended.

    The whole load → mutate → save cycle runs under ``mine_lock`` so that
    concurrent callers cannot overwrite each other's changes.

    Args:
        source_wing: Wing of the source (e.g., "project_api").
        source_room: Room in the source wing.
        target_wing: Wing of the target (e.g., "project_database").
        target_room: Room in the target wing.
        label: Description of the connection.
        source_drawer_id: Optional specific drawer ID; stored only if truthy.
        target_drawer_id: Optional specific drawer ID; stored only if truthy.

    Returns:
        The stored tunnel dict, with ``source`` / ``target`` in the order
        the caller passed them.

    Raises:
        ValueError: if any wing or room is empty or non-string.
    """
    source_wing = _require_name(source_wing, "source_wing")
    source_room = _require_name(source_room, "source_room")
    target_wing = _require_name(target_wing, "target_wing")
    target_room = _require_name(target_room, "target_room")

    tunnel_id = _canonical_tunnel_id(source_wing, source_room, target_wing, target_room)

    source_end = {"wing": source_wing, "room": source_room}
    target_end = {"wing": target_wing, "room": target_room}
    if source_drawer_id:
        source_end["drawer_id"] = source_drawer_id
    if target_drawer_id:
        target_end["drawer_id"] = target_drawer_id

    tunnel = {
        "id": tunnel_id,
        "source": source_end,
        "target": target_end,
        "label": label,
        "created_at": datetime.now(timezone.utc).isoformat(),
    }

    with mine_lock(_TUNNEL_FILE):
        tunnels = _load_tunnels()
        existing = next((t for t in tunnels if t.get("id") == tunnel_id), None)

        if existing is None:
            tunnels.append(tunnel)
            _save_tunnels(tunnels)
            return tunnel

        # Same endpoints seen before: keep the first creation time, stamp the
        # update, and overwrite the stored record's contents in place.
        tunnel["created_at"] = existing.get("created_at", tunnel["created_at"])
        tunnel["updated_at"] = datetime.now(timezone.utc).isoformat()
        existing.clear()
        existing.update(tunnel)
        _save_tunnels(tunnels)
        return existing
|
||||
|
||||
|
||||
def list_tunnels(wing: str = None):
    """Return the stored explicit tunnels, optionally restricted to one wing.

    With a truthy ``wing``, only tunnels whose source *or* target wing
    equals it are returned; otherwise every stored tunnel is returned.
    """
    tunnels = _load_tunnels()
    if not wing:
        return tunnels

    matching = []
    for record in tunnels:
        if record["source"]["wing"] == wing or record["target"]["wing"] == wing:
            matching.append(record)
    return matching
|
||||
|
||||
|
||||
def delete_tunnel(tunnel_id: str):
    """Remove the tunnel with ``tunnel_id`` from the stored list.

    The list is rewritten under ``mine_lock`` whether or not a matching
    record was found. Always returns ``{"deleted": tunnel_id}``.
    """
    with mine_lock(_TUNNEL_FILE):
        remaining = []
        for record in _load_tunnels():
            if record.get("id") != tunnel_id:
                remaining.append(record)
        _save_tunnels(remaining)
    return {"deleted": tunnel_id}
|
||||
|
||||
|
||||
def follow_tunnels(wing: str, room: str, col=None, config=None):
    """List the explicit tunnels touching ``wing``/``room``.

    Each stored tunnel whose source matches the location is reported as
    ``"outgoing"`` (pointing at its target); otherwise, if its target
    matches, as ``"incoming"`` (pointing at its source).

    When ``col`` is given and any connection carries a ``drawer_id``, the
    drawers are fetched in one ``col.get`` call and the first 300 characters
    of each found document are attached as ``drawer_preview``. A failure
    during that enrichment is ignored and the connections are returned as
    built so far. ``config`` is accepted but not used here.

    Returns:
        A list of connection dicts with keys ``direction``,
        ``connected_wing``, ``connected_room``, ``label``, ``drawer_id``,
        ``tunnel_id`` and, where available, ``drawer_preview``.
    """

    def _connection(direction, far_end, tunnel):
        # Describe the endpoint on the *other* side of the tunnel.
        return {
            "direction": direction,
            "connected_wing": far_end["wing"],
            "connected_room": far_end["room"],
            "label": tunnel.get("label", ""),
            "drawer_id": far_end.get("drawer_id"),
            "tunnel_id": tunnel["id"],
        }

    connections = []
    for tunnel in _load_tunnels():
        source_end = tunnel["source"]
        target_end = tunnel["target"]
        if source_end["wing"] == wing and source_end["room"] == room:
            connections.append(_connection("outgoing", target_end, tunnel))
        elif target_end["wing"] == wing and target_end["room"] == room:
            connections.append(_connection("incoming", source_end, tunnel))

    if not (col and connections):
        return connections

    wanted_ids = [c["drawer_id"] for c in connections if c.get("drawer_id")]
    if not wanted_ids:
        return connections

    try:
        fetched = col.get(ids=wanted_ids, include=["documents", "metadatas"])
        previews = dict(zip(fetched["ids"], fetched["documents"]))
        for connection in connections:
            drawer_id = connection.get("drawer_id")
            if drawer_id and drawer_id in previews:
                connection["drawer_preview"] = previews[drawer_id][:300]
    except Exception:
        # Preview enrichment is optional; keep the bare connections.
        pass

    return connections
|
||||
|
||||
+7
-9
@@ -32,7 +32,7 @@ import os
|
||||
import shutil
|
||||
import time
|
||||
|
||||
import chromadb
|
||||
from .backends.chroma import ChromaBackend
|
||||
|
||||
|
||||
COLLECTION_NAME = "mempalace_drawers"
|
||||
@@ -90,8 +90,7 @@ def scan_palace(palace_path=None, only_wing=None):
|
||||
print(f"\n Palace: {palace_path}")
|
||||
print(" Loading...")
|
||||
|
||||
client = chromadb.PersistentClient(path=palace_path)
|
||||
col = client.get_collection(COLLECTION_NAME)
|
||||
col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
|
||||
|
||||
where = {"wing": only_wing} if only_wing else None
|
||||
total = col.count()
|
||||
@@ -174,8 +173,7 @@ def prune_corrupt(palace_path=None, confirm=False):
|
||||
print(" Re-run with --confirm to actually delete.")
|
||||
return
|
||||
|
||||
client = chromadb.PersistentClient(path=palace_path)
|
||||
col = client.get_collection(COLLECTION_NAME)
|
||||
col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
|
||||
before = col.count()
|
||||
print(f" Collection size before: {before:,}")
|
||||
|
||||
@@ -222,9 +220,9 @@ def rebuild_index(palace_path=None):
|
||||
print(f"{'=' * 55}\n")
|
||||
print(f" Palace: {palace_path}")
|
||||
|
||||
client = chromadb.PersistentClient(path=palace_path)
|
||||
backend = ChromaBackend()
|
||||
try:
|
||||
col = client.get_collection(COLLECTION_NAME)
|
||||
col = backend.get_collection(palace_path, COLLECTION_NAME)
|
||||
total = col.count()
|
||||
except Exception as e:
|
||||
print(f" Error reading palace: {e}")
|
||||
@@ -264,8 +262,8 @@ def rebuild_index(palace_path=None):
|
||||
|
||||
# Rebuild with correct HNSW settings
|
||||
print(" Rebuilding collection with hnsw:space=cosine...")
|
||||
client.delete_collection(COLLECTION_NAME)
|
||||
new_col = client.create_collection(COLLECTION_NAME, metadata={"hnsw:space": "cosine"})
|
||||
backend.delete_collection(palace_path, COLLECTION_NAME)
|
||||
new_col = backend.create_collection(palace_path, COLLECTION_NAME)
|
||||
|
||||
filed = 0
|
||||
for i in range(0, len(all_ids), batch_size):
|
||||
|
||||
+363
-29
@@ -2,14 +2,23 @@
|
||||
"""
|
||||
searcher.py — Find anything. Exact words.
|
||||
|
||||
Semantic search against the palace.
|
||||
Returns verbatim text — the actual words, never summaries.
|
||||
Hybrid search: BM25 keyword matching + vector semantic similarity. The
|
||||
drawer query is the floor — always runs — and closet hits add a rank-based
|
||||
boost when they agree. Closets are a ranking *signal*, never a gate, so
|
||||
weak closets (regex extraction on narrative content) can only help, never
|
||||
hide drawers the direct path would have found.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
from .palace import get_collection
|
||||
from .palace import get_closets_collection, get_collection
|
||||
|
||||
# Closet pointer line format: "topic|entities|→drawer_id_a,drawer_id_b"
|
||||
# Multiple lines may join with newlines inside one closet document.
|
||||
_CLOSET_DRAWER_REF_RE = re.compile(r"→([\w,]+)")
|
||||
|
||||
logger = logging.getLogger("mempalace_mcp")
|
||||
|
||||
@@ -18,6 +27,125 @@ class SearchError(Exception):
|
||||
"""Raised when search cannot proceed (e.g. no palace found)."""
|
||||
|
||||
|
||||
_TOKEN_RE = re.compile(r"\w{2,}", re.UNICODE)
|
||||
|
||||
|
||||
def _first_or_empty(results: dict, key: str) -> list:
    """Return ``results[key][0]`` when it exists and is non-empty, else ``[]``.

    Query results look like ``{"documents": [["a", "b"]], ...}``, but the
    outer list can be empty (or the key absent) when nothing matched, and
    indexing ``[0]`` directly would then raise ``IndexError`` (issue #195).
    """
    batches = results.get(key)
    if batches:
        first = batches[0]
        if first:
            return first
    return []
|
||||
|
||||
|
||||
def _tokenize(text: str) -> list:
    """Lowercase ``text`` and return every ``_TOKEN_RE`` match, in order.

    ``_TOKEN_RE`` matches runs of two or more word characters.
    """
    lowered = text.lower()
    return _TOKEN_RE.findall(lowered)
|
||||
|
||||
|
||||
def _bm25_scores(
    query: str,
    documents: list,
    k1: float = 1.5,
    b: float = 0.75,
) -> list:
    """Score each document against ``query`` with Okapi BM25.

    IDF is computed over the *given* documents only, using the smoothed
    form ``log((N - df + 0.5) / (df + 0.5) + 1)``, which is never negative.
    The scores are therefore relative to this candidate set, which suits
    re-ranking a short list of already-retrieved documents.

    Args:
        query: Free-text query; tokenized with ``_tokenize``.
        documents: Document texts to score.
        k1: Term-frequency saturation (1.2-2.0 typical).
        b: Length normalization (0.0 = none, 1.0 = full).

    Returns:
        One float per document, in input order. All zeros when the query
        has no tokens, there are no documents, or every document tokenizes
        to nothing.
    """
    doc_count = len(documents)
    terms = set(_tokenize(query))
    if not terms or doc_count == 0:
        return [0.0] * doc_count

    token_lists = [_tokenize(doc) for doc in documents]
    lengths = [len(tokens) for tokens in token_lists]
    if not any(lengths):
        return [0.0] * doc_count
    mean_len = sum(lengths) / doc_count or 1.0

    # Document frequency: number of documents containing each query term.
    doc_freq = dict.fromkeys(terms, 0)
    for tokens in token_lists:
        for term in terms.intersection(tokens):
            doc_freq[term] += 1

    idf = {
        term: math.log((doc_count - count + 0.5) / (count + 0.5) + 1)
        for term, count in doc_freq.items()
    }

    scores = []
    for tokens, length in zip(token_lists, lengths):
        total = 0.0
        if length:
            hits: dict = {}
            for token in tokens:
                if token in terms:
                    hits[token] = hits.get(token, 0) + 1
            # Length-dependent part of the denominator is the same for every term.
            length_norm = k1 * (1 - b + b * length / mean_len)
            for term, freq in hits.items():
                total += idf[term] * (freq * (k1 + 1)) / (freq + length_norm)
        scores.append(total)
    return scores
|
||||
|
||||
|
||||
def _hybrid_rank(
    results: list,
    query: str,
    vector_weight: float = 0.6,
    bm25_weight: float = 0.4,
) -> list:
    """Re-order ``results`` by a weighted mix of vector similarity and BM25.

    * Vector similarity is ``max(0, 1 - distance)`` taken from each result's
      ``"distance"`` (a missing distance counts as 1.0). It is absolute, so
      adding or removing a candidate does not change the others' value.
    * BM25 is computed by ``_bm25_scores`` over the results' ``"text"``
      fields and divided by the largest score in the set, putting it on a
      0-1 scale comparable with the similarity term. If no result scores
      above zero the BM25 term is 0 for all.

    Each result dict gains a ``bm25_score`` key (raw score rounded to 3
    places). The list is reordered in place, best first, and also returned.
    """
    if not results:
        return results

    texts = [item.get("text", "") for item in results]
    raw_scores = _bm25_scores(query, texts)
    peak = max(raw_scores) if raw_scores else 0.0
    if peak > 0:
        normalized = [score / peak for score in raw_scores]
    else:
        normalized = [0.0] * len(raw_scores)

    combined = []
    for item, raw, norm in zip(results, raw_scores, normalized):
        similarity = max(0.0, 1.0 - item.get("distance", 1.0))
        item["bm25_score"] = round(raw, 3)
        combined.append((vector_weight * similarity + bm25_weight * norm, item))

    combined.sort(key=lambda pair: pair[0], reverse=True)
    results[:] = [item for _, item in combined]
    return results
|
||||
|
||||
|
||||
def build_where_filter(wing: str = None, room: str = None) -> dict:
|
||||
"""Build ChromaDB where filter for wing/room filtering."""
|
||||
if wing and room:
|
||||
@@ -29,6 +157,85 @@ def build_where_filter(wing: str = None, room: str = None) -> dict:
|
||||
return {}
|
||||
|
||||
|
||||
def _extract_drawer_ids_from_closet(closet_doc: str) -> list:
    """Collect the drawer IDs referenced by ``→id_a,id_b`` pointers.

    Every match of ``_CLOSET_DRAWER_REF_RE`` in ``closet_doc`` is split on
    commas. IDs are returned in first-seen order with duplicates and empty
    pieces dropped.
    """
    ordered: list = []
    known: set = set()
    for pointer in _CLOSET_DRAWER_REF_RE.findall(closet_doc):
        for piece in pointer.split(","):
            drawer_id = piece.strip()
            if not drawer_id or drawer_id in known:
                continue
            known.add(drawer_id)
            ordered.append(drawer_id)
    return ordered
|
||||
|
||||
|
||||
def _expand_with_neighbors(drawers_col, matched_doc: str, matched_meta: dict, radius: int = 1):
    """Return a matched drawer together with its ±``radius`` sibling chunks.

    A matched chunk may be clipped mid-thought by the chunk boundary, so the
    chunks of the same ``source_file`` whose ``chunk_index`` lies within
    ``radius`` of the match are fetched and joined (blank-line separated) in
    ``chunk_index`` order.

    Returns a dict with:
        ``text``           the joined chunks, or ``matched_doc`` alone
        ``drawer_index``   the matched chunk's ``chunk_index``
        ``total_drawers``  number of drawers stored for the source file,
                           or ``None`` if unknown

    If the metadata lacks a source file or an integer chunk index, or the
    neighbor lookup raises, the matched drawer is returned on its own so
    that neighbor expansion can never break a search.
    """
    source = matched_meta.get("source_file")
    position = matched_meta.get("chunk_index")
    fallback = {"text": matched_doc, "drawer_index": position, "total_drawers": None}
    if not source or not isinstance(position, int):
        return fallback

    wanted = list(range(position - radius, position + radius + 1))
    neighbor_filter = {
        "$and": [
            {"source_file": source},
            {"chunk_index": {"$in": wanted}},
        ]
    }
    try:
        nearby = drawers_col.get(where=neighbor_filter, include=["documents", "metadatas"])
    except Exception:
        return fallback

    ordered = []
    for doc, meta in zip(nearby.get("documents") or [], nearby.get("metadatas") or []):
        index = meta.get("chunk_index")
        if isinstance(index, int):
            ordered.append((index, doc))
    ordered.sort(key=lambda pair: pair[0])

    text = "\n\n".join(doc for _, doc in ordered) if ordered else matched_doc

    # Count the source file's drawers from a metadata-only fetch.
    total = None
    try:
        listing = drawers_col.get(where={"source_file": source}, include=["metadatas"])
        total = len(listing.get("ids") or []) or None
    except Exception:
        pass

    return {
        "text": text,
        "drawer_index": position,
        "total_drawers": total,
    }
|
||||
|
||||
|
||||
def search(query: str, palace_path: str, wing: str = None, room: str = None, n_results: int = 5):
|
||||
"""
|
||||
Search the palace. Returns verbatim drawer content.
|
||||
@@ -58,9 +265,9 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
|
||||
print(f"\n Search error: {e}")
|
||||
raise SearchError(f"Search error: {e}") from e
|
||||
|
||||
docs = results["documents"][0]
|
||||
metas = results["metadatas"][0]
|
||||
dists = results["distances"][0]
|
||||
docs = _first_or_empty(results, "documents")
|
||||
metas = _first_or_empty(results, "metadatas")
|
||||
dists = _first_or_empty(results, "distances")
|
||||
|
||||
if not docs:
|
||||
print(f'\n No results found for: "{query}"')
|
||||
@@ -117,7 +324,7 @@ def search_memories(
|
||||
0.0 disables filtering. Typical useful range: 0.3–1.0.
|
||||
"""
|
||||
try:
|
||||
col = get_collection(palace_path, create=False)
|
||||
drawers_col = get_collection(palace_path, create=False)
|
||||
except Exception as e:
|
||||
logger.error("No palace found at %s: %s", palace_path, e)
|
||||
return {
|
||||
@@ -127,42 +334,169 @@ def search_memories(
|
||||
|
||||
where = build_where_filter(wing, room)
|
||||
|
||||
# Hybrid retrieval: always query drawers directly (the floor), then use
|
||||
# closet hits to boost rankings. Closets are a ranking SIGNAL, never a
|
||||
# GATE — direct drawer search is always the baseline.
|
||||
#
|
||||
# This avoids the "weak-closets regression" where narrative content
|
||||
# produces low-signal closets (regex extraction matches few topics)
|
||||
# and closet-first routing hides drawers that direct search would find.
|
||||
try:
|
||||
kwargs = {
|
||||
dkwargs = {
|
||||
"query_texts": [query],
|
||||
"n_results": n_results,
|
||||
"n_results": n_results * 3, # over-fetch for re-ranking
|
||||
"include": ["documents", "metadatas", "distances"],
|
||||
}
|
||||
if where:
|
||||
kwargs["where"] = where
|
||||
|
||||
results = col.query(**kwargs)
|
||||
dkwargs["where"] = where
|
||||
drawer_results = drawers_col.query(**dkwargs)
|
||||
except Exception as e:
|
||||
return {"error": f"Search error: {e}"}
|
||||
|
||||
docs = results["documents"][0]
|
||||
metas = results["metadatas"][0]
|
||||
dists = results["distances"][0]
|
||||
# Gather closet hits (best-per-source) to build a boost lookup.
|
||||
closet_boost_by_source: dict = {} # source_file -> (rank, closet_dist, preview)
|
||||
try:
|
||||
closets_col = get_closets_collection(palace_path, create=False)
|
||||
ckwargs = {
|
||||
"query_texts": [query],
|
||||
"n_results": n_results * 2,
|
||||
"include": ["documents", "metadatas", "distances"],
|
||||
}
|
||||
if where:
|
||||
ckwargs["where"] = where
|
||||
closet_results = closets_col.query(**ckwargs)
|
||||
for rank, (cdoc, cmeta, cdist) in enumerate(
|
||||
zip(
|
||||
_first_or_empty(closet_results, "documents"),
|
||||
_first_or_empty(closet_results, "metadatas"),
|
||||
_first_or_empty(closet_results, "distances"),
|
||||
)
|
||||
):
|
||||
source = cmeta.get("source_file", "")
|
||||
if source and source not in closet_boost_by_source:
|
||||
closet_boost_by_source[source] = (rank, cdist, cdoc[:200])
|
||||
except Exception:
|
||||
pass # no closets yet — hybrid degrades to pure drawer search
|
||||
|
||||
hits = []
|
||||
for doc, meta, dist in zip(docs, metas, dists):
|
||||
# Filter on raw distance before rounding to avoid precision loss
|
||||
# Rank-based boost. The ordinal signal ("which closet matched best") is
|
||||
# more reliable than absolute distance on narrative content, where
|
||||
# closet distances cluster in 1.2-1.5 range regardless of match quality.
|
||||
CLOSET_RANK_BOOSTS = [0.40, 0.25, 0.15, 0.08, 0.04]
|
||||
CLOSET_DISTANCE_CAP = 1.5 # cosine dist > 1.5 = too weak to use as signal
|
||||
|
||||
scored: list = []
|
||||
for doc, meta, dist in zip(
|
||||
_first_or_empty(drawer_results, "documents"),
|
||||
_first_or_empty(drawer_results, "metadatas"),
|
||||
_first_or_empty(drawer_results, "distances"),
|
||||
):
|
||||
# Filter on raw distance before rounding to avoid precision loss.
|
||||
if max_distance > 0.0 and dist > max_distance:
|
||||
continue
|
||||
hits.append(
|
||||
{
|
||||
"text": doc,
|
||||
"wing": meta.get("wing", "unknown"),
|
||||
"room": meta.get("room", "unknown"),
|
||||
"source_file": Path(meta.get("source_file", "?")).name,
|
||||
"similarity": round(max(0.0, 1 - dist), 3),
|
||||
"distance": round(dist, 4),
|
||||
}
|
||||
)
|
||||
|
||||
source = meta.get("source_file", "") or ""
|
||||
boost = 0.0
|
||||
matched_via = "drawer"
|
||||
closet_preview = None
|
||||
if source in closet_boost_by_source:
|
||||
c_rank, c_dist, c_preview = closet_boost_by_source[source]
|
||||
if c_dist <= CLOSET_DISTANCE_CAP and c_rank < len(CLOSET_RANK_BOOSTS):
|
||||
boost = CLOSET_RANK_BOOSTS[c_rank]
|
||||
matched_via = "drawer+closet"
|
||||
closet_preview = c_preview
|
||||
|
||||
effective_dist = dist - boost
|
||||
entry = {
|
||||
"text": doc,
|
||||
"wing": meta.get("wing", "unknown"),
|
||||
"room": meta.get("room", "unknown"),
|
||||
"source_file": Path(source).name if source else "?",
|
||||
"created_at": meta.get("filed_at", "unknown"),
|
||||
"similarity": round(max(0.0, 1 - effective_dist), 3),
|
||||
"distance": round(dist, 4),
|
||||
"effective_distance": round(effective_dist, 4),
|
||||
"closet_boost": round(boost, 3),
|
||||
"matched_via": matched_via,
|
||||
# Internal: retain the full source_file path + chunk_index so the
|
||||
# enrichment step below doesn't have to reverse-lookup via
|
||||
# basename-suffix matching (which silently collides when two
|
||||
# files share a basename across different directories).
|
||||
"_sort_key": effective_dist,
|
||||
"_source_file_full": source,
|
||||
"_chunk_index": meta.get("chunk_index"),
|
||||
}
|
||||
if closet_preview:
|
||||
entry["closet_preview"] = closet_preview
|
||||
scored.append(entry)
|
||||
|
||||
scored.sort(key=lambda h: h["_sort_key"])
|
||||
hits = scored[:n_results]
|
||||
|
||||
# Drawer-grep enrichment: for closet-boosted hits whose source has
|
||||
# multiple drawers, return the keyword-best chunk + its immediate
|
||||
# neighbors instead of just the drawer vector search landed on. The
|
||||
# closet said "this source is relevant"; vector may have picked the
|
||||
# wrong chunk within it; grep picks the right one.
|
||||
MAX_HYDRATION_CHARS = 10000
|
||||
for h in hits:
|
||||
if h["matched_via"] == "drawer":
|
||||
continue
|
||||
full_source = h.get("_source_file_full") or ""
|
||||
if not full_source:
|
||||
continue
|
||||
try:
|
||||
source_drawers = drawers_col.get(
|
||||
where={"source_file": full_source},
|
||||
include=["documents", "metadatas"],
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
docs = source_drawers.get("documents") or []
|
||||
metas_ = source_drawers.get("metadatas") or []
|
||||
if len(docs) <= 1:
|
||||
continue
|
||||
|
||||
# Sort by chunk_index so best_idx + neighbors are positional.
|
||||
indexed = []
|
||||
for idx, (d, m) in enumerate(zip(docs, metas_)):
|
||||
ci = m.get("chunk_index", idx) if isinstance(m, dict) else idx
|
||||
if not isinstance(ci, int):
|
||||
ci = idx
|
||||
indexed.append((ci, d))
|
||||
indexed.sort(key=lambda p: p[0])
|
||||
ordered_docs = [d for _, d in indexed]
|
||||
|
||||
query_terms = set(_tokenize(query))
|
||||
best_idx, best_score = 0, -1
|
||||
for idx, d in enumerate(ordered_docs):
|
||||
d_lower = d.lower()
|
||||
s = sum(1 for t in query_terms if t in d_lower)
|
||||
if s > best_score:
|
||||
best_score, best_idx = s, idx
|
||||
|
||||
start = max(0, best_idx - 1)
|
||||
end = min(len(ordered_docs), best_idx + 2)
|
||||
expanded = "\n\n".join(ordered_docs[start:end])
|
||||
if len(expanded) > MAX_HYDRATION_CHARS:
|
||||
expanded = (
|
||||
expanded[:MAX_HYDRATION_CHARS]
|
||||
+ f"\n\n[...truncated. {len(ordered_docs)} total drawers. "
|
||||
"Use mempalace_get_drawer for full content.]"
|
||||
)
|
||||
h["text"] = expanded
|
||||
h["drawer_index"] = best_idx
|
||||
h["total_drawers"] = len(ordered_docs)
|
||||
|
||||
# BM25 hybrid re-rank within the final candidate set.
|
||||
hits = _hybrid_rank(hits, query)
|
||||
for h in hits:
|
||||
h.pop("_sort_key", None)
|
||||
h.pop("_source_file_full", None)
|
||||
h.pop("_chunk_index", None)
|
||||
|
||||
return {
|
||||
"query": query,
|
||||
"filters": {"wing": wing, "room": room},
|
||||
"total_before_filter": len(docs),
|
||||
"total_before_filter": len(_first_or_empty(drawer_results, "documents")),
|
||||
"results": hits,
|
||||
}
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
"""Single source of truth for the MemPalace package version."""
|
||||
|
||||
__version__ = "3.1.0"
|
||||
__version__ = "3.3.0"
|
||||
|
||||
Reference in New Issue
Block a user