feat: add closet layer — searchable index pointing to drawers
The closet architecture was always part of MemPalace's design but never shipped in the public codebase. This adds it.

Palace now has TWO collections:
- mempalace_drawers — full verbatim content (unchanged)
- mempalace_closets — compact AAAK-style index entries

How it works:
- When mining, each file gets a closet alongside its drawers
- Closet contains extracted topics, entities, quotes as pointers
- Closets pack up to 1500 chars, topics never split mid-entry
- Search hits closets first (fast, small), then hydrates the full drawer content for matching files
- Falls back to direct drawer search if no closets exist yet

Files changed:
- palace.py: get_closets_collection(), build_closet_text(), upsert_closet(), CLOSET_CHAR_LIMIT
- miner.py: process_file() now creates closets after drawers
- searcher.py: search_memories() tries closet-first search, hydrates drawers, falls back to direct search

Backwards compatible — existing palaces without closets continue to work via the fallback path. Closets are created on next mine.

689/689 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+24
-1
@@ -15,7 +15,10 @@ from pathlib import Path
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
|
||||
from .palace import SKIP_DIRS, get_collection, file_already_mined, mine_lock
|
||||
from .palace import (
|
||||
SKIP_DIRS, get_collection, get_closets_collection,
|
||||
file_already_mined, mine_lock, build_closet_text, upsert_closet,
|
||||
)
|
||||
|
||||
READABLE_EXTENSIONS = {
|
||||
".txt",
|
||||
@@ -410,6 +413,7 @@ def process_file(
|
||||
rooms: list,
|
||||
agent: str,
|
||||
dry_run: bool,
|
||||
closets_col=None,
|
||||
) -> tuple:
|
||||
"""Read, chunk, route, and file one file. Returns (drawer_count, room_name)."""
|
||||
|
||||
@@ -466,6 +470,22 @@ def process_file(
|
||||
if added:
|
||||
drawers_added += 1
|
||||
|
||||
# Build closet — the searchable index pointing to these drawers
|
||||
if closets_col and drawers_added > 0:
|
||||
drawer_ids = [
|
||||
f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}"
|
||||
for c in chunks
|
||||
]
|
||||
closet_text = build_closet_text(source_file, drawer_ids, content, wing, room)
|
||||
closet_id = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}"
|
||||
upsert_closet(closets_col, closet_id, closet_text, {
|
||||
"wing": wing,
|
||||
"room": room,
|
||||
"source_file": source_file,
|
||||
"drawer_count": drawers_added,
|
||||
"filed_at": datetime.now().isoformat(),
|
||||
})
|
||||
|
||||
return drawers_added, room
|
||||
|
||||
|
||||
@@ -586,8 +606,10 @@ def mine(
|
||||
|
||||
if not dry_run:
|
||||
collection = get_collection(palace_path)
|
||||
closets_col = get_closets_collection(palace_path)
|
||||
else:
|
||||
collection = None
|
||||
closets_col = None
|
||||
|
||||
total_drawers = 0
|
||||
files_skipped = 0
|
||||
@@ -602,6 +624,7 @@ def mine(
|
||||
rooms=rooms,
|
||||
agent=agent,
|
||||
dry_run=dry_run,
|
||||
closets_col=closets_col,
|
||||
)
|
||||
if drawers == 0 and not dry_run:
|
||||
files_skipped += 1
|
||||
|
||||
@@ -52,6 +52,68 @@ def get_collection(
|
||||
)
|
||||
|
||||
|
||||
def get_closets_collection(palace_path: str, create: bool = True):
    """Return the closets collection — the compact, searchable index layer.

    Thin wrapper around ``get_collection`` that pins the collection name to
    ``"mempalace_closets"`` and forwards ``palace_path`` and ``create``
    unchanged.
    """
    closets_name = "mempalace_closets"
    return get_collection(
        palace_path,
        collection_name=closets_name,
        create=create,
    )
|
||||
|
||||
|
||||
# Maximum size, in characters, of a closet document. upsert_closet() appends
# new text to an existing closet only while the combined text stays within
# this limit; otherwise the new text replaces what was stored under that id.
CLOSET_CHAR_LIMIT = 1500
|
||||
|
||||
|
||||
def build_closet_text(source_file, drawer_ids, content, wing, room):
    """Build a compact closet entry from drawer content.

    Extracts topics, names, and the first quote from the first 5000
    characters of ``content`` into AAAK-style pointer lines that tell the
    searcher which drawers to open. Every line has the shape
    ``<label>|<entities>|→<drawer ids>``.

    Args:
        source_file: Path of the mined file. Accepted for interface
            compatibility; it is not used when building the text.
        drawer_ids: IDs of the drawers created for the file. Only the first
            three are embedded in each pointer line.
        content: Raw text of the file.
        wing: Wing name, used only for the fallback label.
        room: Room name, used only for the fallback label.

    Returns:
        Newline-joined pointer lines. At least one line is always returned:
        when no topic or quote is found, a single ``wing/room`` pointer is
        emitted instead.
    """
    import re
    from collections import Counter

    sample = content[:5000]

    def _one_line(text):
        # The closet is newline-delimited and the patterns below can match
        # across line breaks, so collapse all whitespace runs: a label must
        # never split its pointer over two lines.
        return " ".join(text.split())

    # Proper nouns: capitalized words seen at least twice, most frequent
    # first (ties keep first-seen order), capped at five.
    counts = Counter(re.findall(r"\b[A-Z][a-z]{2,}\b", sample))
    entities = sorted(
        (word for word, seen in counts.items() if seen >= 2),
        key=lambda word: -counts[word],
    )[:5]
    entity_str = ";".join(entities)

    # Key phrases: an action verb followed by a short run of words.
    topic_patterns = [
        r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated)\s+[\w\s]{3,30}",
    ]
    raw_topics = []
    for pattern in topic_patterns:
        raw_topics.extend(re.findall(pattern, sample, re.IGNORECASE))
    # De-duplicate after normalization, preserving first-seen order.
    topics = list(dict.fromkeys(_one_line(t).lower() for t in raw_topics))[:8]

    # First double-quoted passage of 15-100 characters, if any.
    quotes = re.findall(r'"([^"]{15,100})"', sample)
    quote = _one_line(quotes[0]) if quotes else ""

    # Every pointer line ends with the same entities/drawers suffix.
    suffix = f"|{entity_str}|→{','.join(drawer_ids[:3])}"

    lines = [f"{topic}{suffix}" for topic in topics]
    if quote:
        lines.append(f'"{quote}"{suffix}')
    if not lines:
        # Nothing extractable: fall back to a location-only pointer.
        lines.append(f"{wing}/{room}{suffix}")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def upsert_closet(closets_col, closet_id, closet_text, metadata):
    """Add or update a closet. Respects CLOSET_CHAR_LIMIT.

    If a closet with ``closet_id`` already exists, its pointer lines are
    merged with the lines of ``closet_text``. Lines already present are not
    repeated, so upserting the same text again leaves the closet unchanged.
    The merged text is stored only when it fits within CLOSET_CHAR_LIMIT;
    otherwise the old closet is treated as full and ``closet_text`` replaces
    it.

    Args:
        closets_col: Collection object exposing ``get(ids=...)`` and
            ``upsert(documents=..., ids=..., metadatas=...)``.
        closet_id: ID of the closet document.
        closet_text: Newline-delimited pointer lines to store.
        metadata: Metadata dict stored alongside the document.

    Reading the existing closet is best-effort: if the lookup fails, the
    failure is logged and ``closet_text`` is written as-is. Errors raised by
    the final ``upsert`` call propagate to the caller.
    """
    import logging

    old_text = None
    try:
        existing = closets_col.get(ids=[closet_id])
        if existing.get("ids"):
            old_text = existing["documents"][0]
    except Exception as exc:
        # Best-effort read: fall back to writing the new text on its own, but
        # leave a trace instead of hiding the failure.
        logging.getLogger(__name__).warning(
            "Could not read existing closet %s; overwriting: %s", closet_id, exc
        )

    if old_text:
        # Merge line by line, dropping duplicates while keeping first-seen
        # order, so repeated upserts do not pile up identical pointers.
        merged_lines = dict.fromkeys(old_text.split("\n") + closet_text.split("\n"))
        merged_text = "\n".join(merged_lines)
        if len(merged_text) <= CLOSET_CHAR_LIMIT:
            closet_text = merged_text
        # else: start fresh — old closet was full

    closets_col.upsert(documents=[closet_text], ids=[closet_id], metadatas=[metadata])
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def mine_lock(source_file: str):
|
||||
"""Cross-platform file lock for mine operations.
|
||||
|
||||
+70
-3
@@ -9,7 +9,7 @@ Returns verbatim text — the actual words, never summaries.
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from .palace import get_collection
|
||||
from .palace import get_collection, get_closets_collection
|
||||
|
||||
logger = logging.getLogger("mempalace_mcp")
|
||||
|
||||
@@ -117,7 +117,7 @@ def search_memories(
|
||||
0.0 disables filtering. Typical useful range: 0.3–1.0.
|
||||
"""
|
||||
try:
|
||||
col = get_collection(palace_path, create=False)
|
||||
drawers_col = get_collection(palace_path, create=False)
|
||||
except Exception as e:
|
||||
logger.error("No palace found at %s: %s", palace_path, e)
|
||||
return {
|
||||
@@ -127,6 +127,73 @@ def search_memories(
|
||||
|
||||
where = build_where_filter(wing, room)
|
||||
|
||||
# Try closet-first search: search the compact index, then hydrate drawers
|
||||
closet_hits = []
|
||||
try:
|
||||
closets_col = get_closets_collection(palace_path, create=False)
|
||||
ckwargs = {
|
||||
"query_texts": [query],
|
||||
"n_results": n_results * 2, # over-fetch closets to find best drawers
|
||||
"include": ["documents", "metadatas", "distances"],
|
||||
}
|
||||
if where:
|
||||
ckwargs["where"] = where
|
||||
closet_results = closets_col.query(**ckwargs)
|
||||
if closet_results["documents"][0]:
|
||||
closet_hits = list(zip(
|
||||
closet_results["documents"][0],
|
||||
closet_results["metadatas"][0],
|
||||
closet_results["distances"][0],
|
||||
))
|
||||
except Exception:
|
||||
pass # no closets yet — fall through to direct drawer search
|
||||
|
||||
# If closets found results, hydrate the referenced drawers
|
||||
if closet_hits:
|
||||
import re
|
||||
seen_sources = set()
|
||||
hits = []
|
||||
for closet_doc, closet_meta, closet_dist in closet_hits:
|
||||
source = closet_meta.get("source_file", "")
|
||||
if source in seen_sources:
|
||||
continue
|
||||
seen_sources.add(source)
|
||||
|
||||
# Find drawers for this source file
|
||||
try:
|
||||
drawer_results = drawers_col.get(
|
||||
where={"source_file": source},
|
||||
include=["documents", "metadatas"],
|
||||
)
|
||||
if drawer_results.get("ids"):
|
||||
# Combine all drawer content for this file
|
||||
full_text = "\n\n".join(drawer_results["documents"])
|
||||
meta = drawer_results["metadatas"][0]
|
||||
hits.append({
|
||||
"text": full_text,
|
||||
"wing": meta.get("wing", "unknown"),
|
||||
"room": meta.get("room", "unknown"),
|
||||
"source_file": Path(source).name,
|
||||
"similarity": round(max(0.0, 1 - closet_dist), 3),
|
||||
"distance": round(closet_dist, 4),
|
||||
"matched_via": "closet",
|
||||
"closet_preview": closet_doc[:200],
|
||||
})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if len(hits) >= n_results:
|
||||
break
|
||||
|
||||
if hits:
|
||||
return {
|
||||
"query": query,
|
||||
"filters": {"wing": wing, "room": room},
|
||||
"total_before_filter": len(closet_hits),
|
||||
"results": hits,
|
||||
}
|
||||
|
||||
# Fallback: direct drawer search (no closets yet, or closets empty)
|
||||
try:
|
||||
kwargs = {
|
||||
"query_texts": [query],
|
||||
@@ -136,7 +203,7 @@ def search_memories(
|
||||
if where:
|
||||
kwargs["where"] = where
|
||||
|
||||
results = col.query(**kwargs)
|
||||
results = drawers_col.query(**kwargs)
|
||||
except Exception as e:
|
||||
return {"error": f"Search error: {e}"}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user