21d4a23430
Merges develop (#820 version sync, #785 strip_noise + NORMALIZE_VERSION, #784 file locking) and addresses six concerns surfaced during PR review of the closet feature:

1. Closet append-on-rebuild bug — upsert_closet_lines used to APPEND to existing closets (mismatched the doc's "fully replaced" promise). With NORMALIZE_VERSION rebuilds on develop, this would have stacked stale v1 topics on top of fresh v2 content forever. Fix:
   - Drop the read-and-append branch from upsert_closet_lines (now a pure numbered-id overwrite).
   - Add purge_file_closets(closets_col, source_file) helper that wipes every closet for a source file by where-filter.
   - process_file calls purge_file_closets before upsert on every mine, mirroring the existing drawer purge.

2. Searcher returned whole-file blobs from the closet path while the direct path returned chunk-level drawers. Refactored:
   - _extract_drawer_ids_from_closet parses the `→drawer_a,drawer_b` pointers out of closet documents.
   - _closet_first_hits hydrates exactly those drawer IDs (chunk-level), not collection.get(where=source_file) (which returned everything).
   - Same hit shape as direct-search path; both now carry matched_via.

3. max_distance was bypassed on the closet path. Now applied per-hit; when every closet candidate gets filtered, _closet_first_hits returns None and the caller falls through to direct drawer search.

4. Entity extraction caught sentence-starters like "When", "The", "After" as proper nouns. Added _ENTITY_STOPLIST (~40 common false positives + day/month names + role words). Real names like Igor / Milla still survive — covered by tests.

5. CLOSETS.md drifted from the code (claimed "replaced via upsert" but code appended; claimed BM25 hybrid that doesn't exist; claimed a 10K char hydration cap that wasn't enforced). Rewritten to describe what actually ships, with explicit notes on the BM25 / convo-closet follow-ups.

6. Zero tests for ~250 lines. Added tests/test_closets.py with 17 cases:
   - build_closet_lines: pointer shape, header extraction, stoplist filtering (with regression case for "When/After/The"), real-name survival, fallback-line guarantee, drawer-ref slicing.
   - upsert_closet_lines: pure overwrite semantics (regression for the append bug), char-limit packing without splitting lines.
   - purge_file_closets: scoped to source_file, doesn't touch others.
   - End-to-end miner rebuild: re-mining a file with fewer topics fully purges leftover numbered closets from the larger first run.
   - _extract_drawer_ids_from_closet: parsing + dedup edge cases.
   - search_memories closet-first: fallback when empty, chunk-level hits with matched_via, no whole-file glue, max_distance enforced.

Merge resolutions: miner.py imports combined NORMALIZE_VERSION/mine_lock from develop with the closet helpers from this branch. process_file auto-merged cleanly (closet block sits inside develop's lock body).

724/724 tests pass. ruff + format clean under CI-pinned 0.4.x.
314 lines
9.1 KiB
Python
314 lines
9.1 KiB
Python
"""
|
|
palace.py — Shared palace operations.
|
|
|
|
Consolidates collection access patterns used by both miners and the MCP server.
|
|
"""
|
|
|
|
import contextlib
|
|
import hashlib
|
|
import os
|
|
|
|
from .backends.chroma import ChromaBackend
|
|
|
|
# Directory names to skip. The code that consults this set lives outside this
# module (presumably the miners' directory walk — confirm against callers).
SKIP_DIRS = {
    # version control / editor state
    ".git",
    ".idea",
    ".vscode",
    # dependency trees and virtual environments
    "node_modules",
    ".venv",
    "venv",
    "env",
    ".eggs",
    # build and packaging output
    "dist",
    "build",
    ".next",
    "target",
    # caches and tool scratch space
    "__pycache__",
    ".cache",
    ".ruff_cache",
    ".mypy_cache",
    ".pytest_cache",
    ".tox",
    ".nox",
    ".ipynb_checkpoints",
    # coverage reports
    "coverage",
    "htmlcov",
    # mempalace's own state directory
    ".mempalace",
}
|
|
|
|
# Module-wide backend instance, constructed once at import time;
# get_collection() delegates every collection lookup to it.
_DEFAULT_BACKEND = ChromaBackend()
|
|
|
|
# Schema version for drawer normalization. Bump when the normalization
# pipeline changes in a way that existing drawers should be rebuilt to pick up
# (e.g., new noise-stripping rules). `file_already_mined` treats drawers with
# a missing or stale `normalize_version` as "not mined", so the next mine pass
# silently rebuilds them — users don't need to manually erase + re-mine.
#
# Keep this an integer: `file_already_mined` compares the stored value to it
# with `<`, and defaults drawers that have no stored version to 1.
#
# v2 (2026-04): introduced strip_noise() for Claude Code JSONL; previous
# drawers stored system tags / hook chrome verbatim.
NORMALIZE_VERSION = 2
|
|
|
|
|
|
def get_collection(
    palace_path: str,
    collection_name: str = "mempalace_drawers",
    create: bool = True,
):
    """Fetch a palace collection via the module's default backend.

    Args:
        palace_path: Location of the palace, forwarded to the backend as-is.
        collection_name: Collection to open; defaults to the drawers
            collection, ``"mempalace_drawers"``.
        create: Forwarded to the backend's ``get_collection`` unchanged.

    Returns:
        Whatever the backend's ``get_collection`` returns.
    """
    backend = _DEFAULT_BACKEND
    return backend.get_collection(palace_path, collection_name=collection_name, create=create)
|
|
|
|
|
|
def get_closets_collection(palace_path: str, create: bool = True):
    """Open the closets collection — the searchable index layer.

    Thin wrapper over ``get_collection`` that pins the collection name to
    ``"mempalace_closets"``; ``palace_path`` and ``create`` pass straight
    through.
    """
    closets_name = "mempalace_closets"
    return get_collection(palace_path, collection_name=closets_name, create=create)
|
|
|
|
|
|
CLOSET_CHAR_LIMIT = 1500 # fill closet until ~1500 chars, then start a new one
|
|
CLOSET_EXTRACT_WINDOW = 5000 # how many chars of source content to scan for entities/topics
|
|
|
|
# Common capitalized words that look like proper nouns but are usually
|
|
# sentence-starters or filler. Filtered out of entity extraction.
|
|
_ENTITY_STOPLIST = frozenset(
|
|
{
|
|
"The",
|
|
"This",
|
|
"That",
|
|
"These",
|
|
"Those",
|
|
"When",
|
|
"Where",
|
|
"What",
|
|
"Why",
|
|
"Who",
|
|
"Which",
|
|
"How",
|
|
"After",
|
|
"Before",
|
|
"Then",
|
|
"Now",
|
|
"Here",
|
|
"There",
|
|
"And",
|
|
"But",
|
|
"Or",
|
|
"Yet",
|
|
"So",
|
|
"If",
|
|
"Else",
|
|
"Yes",
|
|
"No",
|
|
"Maybe",
|
|
"Okay",
|
|
"User",
|
|
"Assistant",
|
|
"System",
|
|
"Tool",
|
|
"Monday",
|
|
"Tuesday",
|
|
"Wednesday",
|
|
"Thursday",
|
|
"Friday",
|
|
"Saturday",
|
|
"Sunday",
|
|
"January",
|
|
"February",
|
|
"March",
|
|
"April",
|
|
"May",
|
|
"June",
|
|
"July",
|
|
"August",
|
|
"September",
|
|
"October",
|
|
"November",
|
|
"December",
|
|
}
|
|
)
|
|
|
|
|
|
def build_closet_lines(source_file, drawer_ids, content, wing, room):
|
|
"""Build compact closet pointer lines from drawer content.
|
|
|
|
Returns a LIST of lines (not joined). Each line is one complete topic
|
|
pointer — never split across closets.
|
|
|
|
Format: topic|entities|→drawer_ids
|
|
"""
|
|
import re
|
|
from pathlib import Path
|
|
|
|
drawer_ref = ",".join(drawer_ids[:3])
|
|
window = content[:CLOSET_EXTRACT_WINDOW]
|
|
|
|
# Extract proper nouns (capitalized words, 2+ occurrences). Filter out
|
|
# common sentence-starters that aren't real entities.
|
|
words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
|
|
word_freq = {}
|
|
for w in words:
|
|
if w in _ENTITY_STOPLIST:
|
|
continue
|
|
word_freq[w] = word_freq.get(w, 0) + 1
|
|
entities = sorted(
|
|
[w for w, c in word_freq.items() if c >= 2],
|
|
key=lambda w: -word_freq[w],
|
|
)[:5]
|
|
entity_str = ";".join(entities) if entities else ""
|
|
|
|
# Extract key phrases — action verbs + context
|
|
topics = []
|
|
for pattern in [
|
|
r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated|reviewed|deployed|configured|removed|updated)\s+[\w\s]{3,40}",
|
|
]:
|
|
topics.extend(re.findall(pattern, window, re.IGNORECASE))
|
|
# Also grab section headers if present
|
|
for header in re.findall(r"^#{1,3}\s+(.{5,60})$", window, re.MULTILINE):
|
|
topics.append(header.strip())
|
|
# Dedupe preserving order
|
|
topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:12]
|
|
|
|
# Extract quotes
|
|
quotes = re.findall(r'"([^"]{15,150})"', window)
|
|
|
|
# Build pointer lines — each one is atomic, never split
|
|
lines = []
|
|
for topic in topics:
|
|
lines.append(f"{topic}|{entity_str}|→{drawer_ref}")
|
|
for quote in quotes[:3]:
|
|
lines.append(f'"{quote}"|{entity_str}|→{drawer_ref}')
|
|
|
|
# Always have at least one line
|
|
if not lines:
|
|
name = Path(source_file).stem[:40]
|
|
lines.append(f"{wing}/{room}/{name}|{entity_str}|→{drawer_ref}")
|
|
|
|
return lines
|
|
|
|
|
|
def purge_file_closets(closets_col, source_file: str) -> None:
    """Delete every closet associated with ``source_file``.

    Call this before ``upsert_closet_lines`` on a re-mine so stale topics
    from a prior schema/version don't survive in the closet collection.
    Mirrors the drawer-purge step in process_file().

    The purge is best-effort: a failing backend never raises out of this
    function, but the failure is logged as a warning so leftover closets
    can be diagnosed instead of silently persisting.

    Args:
        closets_col: Closets collection; must expose ``delete(where=...)``.
        source_file: Value of the ``source_file`` metadata field to match.
    """
    import logging

    try:
        closets_col.delete(where={"source_file": source_file})
    except Exception as exc:
        # Deliberately non-fatal: the caller goes on to upsert fresh closets.
        logging.getLogger(__name__).warning(
            "Could not purge closets for %s: %s", source_file, exc
        )
|
|
|
|
|
|
def upsert_closet_lines(closets_col, closet_id_base, lines, metadata, char_limit=None):
    """Write topic lines to closets, packed greedily without splitting a line.

    Closets are deterministically numbered (``..._01``, ``..._02``, …) and
    each ``upsert`` fully overwrites the prior content at that ID. Callers
    are expected to ``purge_file_closets`` first when re-mining a source
    file so stale-numbered closets from larger prior runs don't leak.

    Args:
        closets_col: Closets collection; must expose
            ``upsert(documents=..., ids=..., metadatas=...)``.
        closet_id_base: Prefix for closet IDs; ``_NN`` is appended.
        lines: Iterable of pointer lines. A line is never split, so a single
            line longer than the limit gets a closet of its own.
        metadata: Metadata stored with every closet written.
        char_limit: Soft size cap per closet, counting one separator
            character per line. Defaults to ``CLOSET_CHAR_LIMIT``.

    Returns the number of closets written.
    """
    limit = CLOSET_CHAR_LIMIT if char_limit is None else char_limit

    # Pack first, write afterwards: each batch becomes one closet document.
    batches = []
    batch = []
    batch_chars = 0
    for line in lines:
        cost = len(line) + 1  # +1 for newline
        # Would this line fit whole in the current closet? An empty batch
        # always accepts the line, so oversized lines are never dropped.
        if batch and batch_chars + cost > limit:
            batches.append(batch)
            batch, batch_chars = [], 0
        batch.append(line)
        batch_chars += cost
    if batch:
        batches.append(batch)

    for number, packed in enumerate(batches, start=1):
        closets_col.upsert(
            documents=["\n".join(packed)],
            ids=[f"{closet_id_base}_{number:02d}"],
            metadatas=[metadata],
        )
    return len(batches)
|
|
|
|
|
|
@contextlib.contextmanager
def mine_lock(source_file: str):
    """Cross-platform file lock for mine operations.

    Prevents multiple agents from mining the same file simultaneously,
    which causes duplicate drawers when the delete+insert cycle interleaves.

    The lock is an exclusive OS-level lock on
    ``~/.mempalace/locks/<first 16 hex chars of sha256(source_file)>.lock``
    and is held for the duration of the ``with`` body. Acquisition waits
    until the lock is free on both POSIX (``fcntl.flock``) and Windows
    (``msvcrt.locking``).

    Args:
        source_file: Path string identifying the file being mined; it is
            only used to derive the lock file name.
    """
    lock_dir = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
    os.makedirs(lock_dir, exist_ok=True)
    # surrogateescape keeps paths carrying undecodable bytes from raising
    # UnicodeEncodeError; ordinary paths hash exactly as with plain encode().
    digest = hashlib.sha256(source_file.encode("utf-8", "surrogateescape")).hexdigest()
    lock_path = os.path.join(lock_dir, digest[:16] + ".lock")

    with open(lock_path, "w") as lf:
        try:
            if os.name == "nt":
                import errno
                import msvcrt

                # LK_LOCK gives up with OSError after about ten one-second
                # attempts. Keep retrying on contention so a waiting agent
                # blocks like the flock() branch instead of failing while
                # another agent is still mining; re-raise anything else.
                contended = {
                    errno.EACCES,
                    getattr(errno, "EDEADLK", None),
                    getattr(errno, "EDEADLOCK", None),
                }
                while True:
                    try:
                        msvcrt.locking(lf.fileno(), msvcrt.LK_LOCK, 1)
                        break
                    except OSError as exc:
                        if exc.errno not in contended:
                            raise
            else:
                import fcntl

                fcntl.flock(lf, fcntl.LOCK_EX)
            yield
        finally:
            # Best-effort unlock; closing the file (on leaving the ``with``)
            # happens regardless of whether the explicit unlock succeeded.
            try:
                if os.name == "nt":
                    import msvcrt

                    msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1)
                else:
                    import fcntl

                    fcntl.flock(lf, fcntl.LOCK_UN)
            except Exception:
                pass
|
|
|
|
|
|
def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool:
    """Report whether ``source_file`` already has up-to-date drawers.

    The answer is False (meaning: mine it again) when any of these hold:
    - the collection holds no drawer for this source_file
    - the drawer's stored `normalize_version` is absent or lower than
      ``NORMALIZE_VERSION`` (silent rebuild after a normalization upgrade)
    - `check_mtime=True` and the stored `source_mtime` is absent or differs
      from the file's current mtime
    - any step raises (lookup failure, unreadable file, malformed metadata)

    With check_mtime=True (used by project miner) a content change also
    forces a re-mine. With check_mtime=False (used by convo miner)
    transcripts are assumed immutable, so only the version gate applies.
    """
    try:
        found = collection.get(where={"source_file": source_file}, limit=1)
        if not found.get("ids"):
            return False

        meta = found.get("metadatas", [{}])[0] or {}

        # Drawers written before v2 carry no version field; default them to 1
        # so they fall below the current schema and get rebuilt.
        if meta.get("normalize_version", 1) < NORMALIZE_VERSION:
            return False

        if not check_mtime:
            return True

        recorded_mtime = meta.get("source_mtime")
        if recorded_mtime is None:
            return False

        # Sub-millisecond tolerance absorbs float round-tripping of the mtime.
        drift = float(recorded_mtime) - os.path.getmtime(source_file)
        return abs(drift) < 0.001
    except Exception:
        return False
|