feat: add closet layer — searchable index pointing to drawers

The closet architecture was always part of MemPalace's design but
never shipped in the public codebase. This adds it.

Palace now has TWO collections:
- mempalace_drawers — full verbatim content (unchanged)
- mempalace_closets — compact AAAK-style index entries

How it works:
- When mining, each file gets a closet alongside its drawers
- Closet contains extracted topics, entities, quotes as pointers
- Closets pack up to 1500 chars, topics never split mid-entry
- Search hits closets first (fast, small), then hydrates the
  full drawer content for matching files
- Falls back to direct drawer search if no closets exist yet

Files changed:
- palace.py: get_closets_collection(), build_closet_text(),
  upsert_closet(), CLOSET_CHAR_LIMIT
- miner.py: process_file() now creates closets after drawers
- searcher.py: search_memories() tries closet-first search,
  hydrates drawers, falls back to direct search

Backwards compatible — existing palaces without closets continue
to work via the fallback path. Closets are created on next mine.

689/689 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
MSL
2026-04-13 01:33:48 -07:00
committed by Igor Lins e Silva
parent 30a431924b
commit d3d7184f4e
3 changed files with 156 additions and 4 deletions
+24 -1
View File
@@ -15,7 +15,10 @@ from pathlib import Path
from datetime import datetime
from collections import defaultdict
from .palace import SKIP_DIRS, get_collection, file_already_mined, mine_lock
from .palace import (
SKIP_DIRS, get_collection, get_closets_collection,
file_already_mined, mine_lock, build_closet_text, upsert_closet,
)
READABLE_EXTENSIONS = {
".txt",
@@ -410,6 +413,7 @@ def process_file(
rooms: list,
agent: str,
dry_run: bool,
closets_col=None,
) -> tuple:
"""Read, chunk, route, and file one file. Returns (drawer_count, room_name)."""
@@ -466,6 +470,22 @@ def process_file(
if added:
drawers_added += 1
# Build closet — the searchable index pointing to these drawers
if closets_col and drawers_added > 0:
drawer_ids = [
f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}"
for c in chunks
]
closet_text = build_closet_text(source_file, drawer_ids, content, wing, room)
closet_id = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}"
upsert_closet(closets_col, closet_id, closet_text, {
"wing": wing,
"room": room,
"source_file": source_file,
"drawer_count": drawers_added,
"filed_at": datetime.now().isoformat(),
})
return drawers_added, room
@@ -586,8 +606,10 @@ def mine(
if not dry_run:
collection = get_collection(palace_path)
closets_col = get_closets_collection(palace_path)
else:
collection = None
closets_col = None
total_drawers = 0
files_skipped = 0
@@ -602,6 +624,7 @@ def mine(
rooms=rooms,
agent=agent,
dry_run=dry_run,
closets_col=closets_col,
)
if drawers == 0 and not dry_run:
files_skipped += 1
+62
View File
@@ -52,6 +52,68 @@ def get_collection(
)
def get_closets_collection(palace_path: str, create: bool = True):
    """Return the closets collection — the searchable index layer.

    Thin wrapper around get_collection() that pins the collection name to
    ``mempalace_closets``; *palace_path* and *create* are forwarded as given.
    """
    closets_name = "mempalace_closets"
    return get_collection(
        palace_path,
        collection_name=closets_name,
        create=create,
    )
# Soft cap (~1500 characters) on the text of one closet document.
# upsert_closet() appends to an existing closet only while the merged text
# stays within this limit; otherwise the stored text is replaced by the
# incoming entry text under the same closet id.
CLOSET_CHAR_LIMIT = 1500
def build_closet_text(source_file, drawer_ids, content, wing, room):
    """Build a compact closet entry from drawer content.

    Scans the first 5000 characters of *content* for recurring capitalized
    words (entities), action phrases (topics) and the first double-quoted
    passage, then emits one pointer line per topic/quote in the form
    ``<label>|<entities>|→<drawer ids>`` that tells the searcher which
    drawers to open.

    Args:
        source_file: Path of the mined file. Currently unused.
        drawer_ids: IDs of the drawers created for the file; only the first
            three are referenced on each pointer line.
        content: Text the entities, topics and quote are extracted from.
        wing: Wing name, used in the fallback label.
        room: Room name, used in the fallback label.

    Returns:
        Newline-joined pointer lines. There is always at least one line:
        when no topic or quote is found a single ``wing/room`` pointer is
        produced instead.
    """
    import re
    from collections import Counter

    sample = content[:5000]
    target = ",".join(drawer_ids[:3])

    # Entities: capitalized words (capital + 2 or more lowercase letters) seen
    # at least twice, most frequent first; ties keep first-seen order because
    # the sort is stable. At most five are kept.
    counts = Counter(re.findall(r"\b[A-Z][a-z]{2,}\b", sample))
    recurring = sorted(
        (word for word, seen in counts.items() if seen >= 2),
        key=lambda word: -counts[word],
    )
    entity_str = ";".join(recurring[:5])

    # Topics: an action verb followed by up to 30 word/space characters.
    # The leading \b stops verbs from matching inside longer words
    # (e.g. "added" inside "padded").
    topic_patterns = (
        r"\b(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated)"
        r"\s+[\w\s]{3,30}",
    )
    raw_topics = []
    for pattern in topic_patterns:
        raw_topics.extend(re.findall(pattern, sample, re.IGNORECASE))
    # \s also matches newlines; collapse every whitespace run to one space so
    # a topic can never split its pointer across several closet lines.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    topics = list(
        dict.fromkeys(" ".join(raw.split()).lower() for raw in raw_topics)
    )[:8]

    # Quote: first double-quoted passage of 15-100 characters, flattened to a
    # single line for the same reason as topics.
    quotes = re.findall(r'"([^"]{15,100})"', sample)
    quote = " ".join(quotes[0].split()) if quotes else ""

    lines = [f"{topic}|{entity_str}|→{target}" for topic in topics]
    if quote:
        lines.append(f'"{quote}"|{entity_str}|→{target}')
    if not lines:
        # Nothing extractable: still emit one pointer so the file is indexed.
        lines.append(f"{wing}/{room}|{entity_str}|→{target}")
    return "\n".join(lines)
def upsert_closet(closets_col, closet_id, closet_text, metadata):
    """Add or update a closet, merging with existing text when it fits.

    If a closet with *closet_id* already holds text, the lines of
    *closet_text* that it does not already contain are appended, provided
    the merged text stays within CLOSET_CHAR_LIMIT. When the merge would
    exceed the limit, the stored text is replaced by *closet_text*. When
    every incoming line is already stored, the stored text is kept as is.
    *metadata* is written in all cases.

    Args:
        closets_col: Collection object exposing ``get(ids=...)`` and
            ``upsert(documents=..., ids=..., metadatas=...)``.
        closet_id: ID of the closet document.
        closet_text: Newline-separated entry lines to store.
        metadata: Metadata stored alongside the document.
    """
    import logging

    try:
        existing = closets_col.get(ids=[closet_id])
        old_text = existing["documents"][0] if existing.get("ids") else None
        if old_text:
            # Skip lines that are already stored so that upserting the same
            # entries again does not pile up duplicates.
            known = set(old_text.split("\n"))
            fresh = [
                line
                for line in dict.fromkeys(closet_text.split("\n"))
                if line not in known
            ]
            if not fresh:
                closet_text = old_text
            else:
                merged = old_text + "\n" + "\n".join(fresh)
                if len(merged) <= CLOSET_CHAR_LIMIT:
                    closet_text = merged
                # else: start fresh — old closet was full
    except Exception as exc:
        # Best effort: if the existing closet cannot be read or merged, fall
        # back to writing the incoming text, but leave a trace of the failure.
        logging.getLogger(__name__).warning(
            "Could not merge existing closet %s: %s", closet_id, exc
        )
    closets_col.upsert(
        documents=[closet_text],
        ids=[closet_id],
        metadatas=[metadata],
    )
@contextlib.contextmanager
def mine_lock(source_file: str):
"""Cross-platform file lock for mine operations.
+70 -3
View File
@@ -9,7 +9,7 @@ Returns verbatim text — the actual words, never summaries.
import logging
from pathlib import Path
from .palace import get_collection
from .palace import get_collection, get_closets_collection
logger = logging.getLogger("mempalace_mcp")
@@ -117,7 +117,7 @@ def search_memories(
0.0 disables filtering. Typical useful range: 0.3–1.0.
"""
try:
col = get_collection(palace_path, create=False)
drawers_col = get_collection(palace_path, create=False)
except Exception as e:
logger.error("No palace found at %s: %s", palace_path, e)
return {
@@ -127,6 +127,73 @@ def search_memories(
where = build_where_filter(wing, room)
# Try closet-first search: search the compact index, then hydrate drawers
closet_hits = []
try:
closets_col = get_closets_collection(palace_path, create=False)
ckwargs = {
"query_texts": [query],
"n_results": n_results * 2, # over-fetch closets to find best drawers
"include": ["documents", "metadatas", "distances"],
}
if where:
ckwargs["where"] = where
closet_results = closets_col.query(**ckwargs)
if closet_results["documents"][0]:
closet_hits = list(zip(
closet_results["documents"][0],
closet_results["metadatas"][0],
closet_results["distances"][0],
))
except Exception:
pass # no closets yet — fall through to direct drawer search
# If closets found results, hydrate the referenced drawers
if closet_hits:
import re
seen_sources = set()
hits = []
for closet_doc, closet_meta, closet_dist in closet_hits:
source = closet_meta.get("source_file", "")
if source in seen_sources:
continue
seen_sources.add(source)
# Find drawers for this source file
try:
drawer_results = drawers_col.get(
where={"source_file": source},
include=["documents", "metadatas"],
)
if drawer_results.get("ids"):
# Combine all drawer content for this file
full_text = "\n\n".join(drawer_results["documents"])
meta = drawer_results["metadatas"][0]
hits.append({
"text": full_text,
"wing": meta.get("wing", "unknown"),
"room": meta.get("room", "unknown"),
"source_file": Path(source).name,
"similarity": round(max(0.0, 1 - closet_dist), 3),
"distance": round(closet_dist, 4),
"matched_via": "closet",
"closet_preview": closet_doc[:200],
})
except Exception:
pass
if len(hits) >= n_results:
break
if hits:
return {
"query": query,
"filters": {"wing": wing, "room": room},
"total_before_filter": len(closet_hits),
"results": hits,
}
# Fallback: direct drawer search (no closets yet, or closets empty)
try:
kwargs = {
"query_texts": [query],
@@ -136,7 +203,7 @@ def search_memories(
if where:
kwargs["where"] = where
results = col.query(**kwargs)
results = drawers_col.query(**kwargs)
except Exception as e:
return {"error": f"Search error: {e}"}