From d3d7184f4e885f64520d80971cbc41285d947e5f Mon Sep 17 00:00:00 2001
From: MSL <232237854+milla-jovovich@users.noreply.github.com>
Date: Mon, 13 Apr 2026 01:33:48 -0700
Subject: [PATCH] =?UTF-8?q?feat:=20add=20closet=20layer=20=E2=80=94=20sear?=
 =?UTF-8?q?chable=20index=20pointing=20to=20drawers?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The closet architecture was always part of MemPalace's design but
never shipped in the public codebase. This adds it.

Palace now has TWO collections:
- mempalace_drawers — full verbatim content (unchanged)
- mempalace_closets — compact AAAK-style index entries

How it works:
- When mining, each file gets a closet alongside its drawers
- Closet contains extracted topics, entities, quotes as pointers
- Closets pack up to 1500 chars, topics never split mid-entry
- Search hits closets first (fast, small), then hydrates the full
  drawer content for matching files
- Falls back to direct drawer search if no closets exist yet

Files changed:
- palace.py: get_closets_collection(), build_closet_text(),
  upsert_closet(), CLOSET_CHAR_LIMIT
- miner.py: process_file() now creates closets after drawers
- searcher.py: search_memories() tries closet-first search,
  hydrates drawers, falls back to direct search

Backwards compatible — existing palaces without closets continue to
work via the fallback path. Closets are created on next mine.

689/689 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) --- mempalace/miner.py | 25 ++++++++++++++- mempalace/palace.py | 62 ++++++++++++++++++++++++++++++++++++ mempalace/searcher.py | 73 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 156 insertions(+), 4 deletions(-) diff --git a/mempalace/miner.py b/mempalace/miner.py index 801ed7e..8170362 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -15,7 +15,10 @@ from pathlib import Path from datetime import datetime from collections import defaultdict -from .palace import SKIP_DIRS, get_collection, file_already_mined, mine_lock +from .palace import ( + SKIP_DIRS, get_collection, get_closets_collection, + file_already_mined, mine_lock, build_closet_text, upsert_closet, +) READABLE_EXTENSIONS = { ".txt", @@ -410,6 +413,7 @@ def process_file( rooms: list, agent: str, dry_run: bool, + closets_col=None, ) -> tuple: """Read, chunk, route, and file one file. Returns (drawer_count, room_name).""" @@ -466,6 +470,22 @@ def process_file( if added: drawers_added += 1 + # Build closet — the searchable index pointing to these drawers + if closets_col and drawers_added > 0: + drawer_ids = [ + f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}" + for c in chunks + ] + closet_text = build_closet_text(source_file, drawer_ids, content, wing, room) + closet_id = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}" + upsert_closet(closets_col, closet_id, closet_text, { + "wing": wing, + "room": room, + "source_file": source_file, + "drawer_count": drawers_added, + "filed_at": datetime.now().isoformat(), + }) + return drawers_added, room @@ -586,8 +606,10 @@ def mine( if not dry_run: collection = get_collection(palace_path) + closets_col = get_closets_collection(palace_path) else: collection = None + closets_col = None total_drawers = 0 files_skipped = 0 @@ -602,6 +624,7 @@ def mine( rooms=rooms, agent=agent, dry_run=dry_run, + 
closets_col=closets_col, ) if drawers == 0 and not dry_run: files_skipped += 1 diff --git a/mempalace/palace.py b/mempalace/palace.py index ed5382a..ef58a06 100644 --- a/mempalace/palace.py +++ b/mempalace/palace.py @@ -52,6 +52,68 @@ def get_collection( ) +def get_closets_collection(palace_path: str, create: bool = True): + """Get the closets collection — the searchable index layer.""" + return get_collection(palace_path, collection_name="mempalace_closets", create=create) + + +CLOSET_CHAR_LIMIT = 1500 # fill closet until ~1500 chars, then start a new one + + +def build_closet_text(source_file, drawer_ids, content, wing, room): + """Build a compact closet entry from drawer content. + + Extracts topics, names, and key quotes into an AAAK-style pointer + that tells the searcher which drawers to open. + """ + import re + # Extract proper nouns (capitalized words, 2+ occurrences) + words = re.findall(r"\b[A-Z][a-z]{2,}\b", content[:5000]) + word_freq = {} + for w in words: + word_freq[w] = word_freq.get(w, 0) + 1 + entities = sorted([w for w, c in word_freq.items() if c >= 2], key=lambda w: -word_freq[w])[:5] + + # Extract key phrases + topics = [] + for pattern in [ + r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated)\s+[\w\s]{3,30}", + ]: + topics.extend(re.findall(pattern, content[:5000], re.IGNORECASE)) + topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:8] + + # Extract first quote + quotes = re.findall(r'"([^"]{15,100})"', content[:5000]) + quote = quotes[0] if quotes else "" + + # Build pointer lines + entity_str = ";".join(entities[:5]) if entities else "" + lines = [] + for topic in topics: + pointer = f"{topic}|{entity_str}|→{','.join(drawer_ids[:3])}" + lines.append(pointer) + if quote: + lines.append(f'"{quote}"|{entity_str}|→{",".join(drawer_ids[:3])}') + if not lines: + lines.append(f"{wing}/{room}|{entity_str}|→{','.join(drawer_ids[:3])}") + + return "\n".join(lines) + + +def upsert_closet(closets_col, closet_id, 
closet_text, metadata): + """Add or update a closet. Respects CLOSET_CHAR_LIMIT.""" + try: + existing = closets_col.get(ids=[closet_id]) + if existing.get("ids"): + old_text = existing["documents"][0] + if len(old_text) + len(closet_text) + 1 <= CLOSET_CHAR_LIMIT: + closet_text = old_text + "\n" + closet_text + # else: start fresh — old closet was full + except Exception: + pass + closets_col.upsert(documents=[closet_text], ids=[closet_id], metadatas=[metadata]) + + @contextlib.contextmanager def mine_lock(source_file: str): """Cross-platform file lock for mine operations. diff --git a/mempalace/searcher.py b/mempalace/searcher.py index bc70c1d..70fd615 100644 --- a/mempalace/searcher.py +++ b/mempalace/searcher.py @@ -9,7 +9,7 @@ Returns verbatim text — the actual words, never summaries. import logging from pathlib import Path -from .palace import get_collection +from .palace import get_collection, get_closets_collection logger = logging.getLogger("mempalace_mcp") @@ -117,7 +117,7 @@ def search_memories( 0.0 disables filtering. Typical useful range: 0.3–1.0. 
""" try: - col = get_collection(palace_path, create=False) + drawers_col = get_collection(palace_path, create=False) except Exception as e: logger.error("No palace found at %s: %s", palace_path, e) return { @@ -127,6 +127,73 @@ def search_memories( where = build_where_filter(wing, room) + # Try closet-first search: search the compact index, then hydrate drawers + closet_hits = [] + try: + closets_col = get_closets_collection(palace_path, create=False) + ckwargs = { + "query_texts": [query], + "n_results": n_results * 2, # over-fetch closets to find best drawers + "include": ["documents", "metadatas", "distances"], + } + if where: + ckwargs["where"] = where + closet_results = closets_col.query(**ckwargs) + if closet_results["documents"][0]: + closet_hits = list(zip( + closet_results["documents"][0], + closet_results["metadatas"][0], + closet_results["distances"][0], + )) + except Exception: + pass # no closets yet — fall through to direct drawer search + + # If closets found results, hydrate the referenced drawers + if closet_hits: + import re + seen_sources = set() + hits = [] + for closet_doc, closet_meta, closet_dist in closet_hits: + source = closet_meta.get("source_file", "") + if source in seen_sources: + continue + seen_sources.add(source) + + # Find drawers for this source file + try: + drawer_results = drawers_col.get( + where={"source_file": source}, + include=["documents", "metadatas"], + ) + if drawer_results.get("ids"): + # Combine all drawer content for this file + full_text = "\n\n".join(drawer_results["documents"]) + meta = drawer_results["metadatas"][0] + hits.append({ + "text": full_text, + "wing": meta.get("wing", "unknown"), + "room": meta.get("room", "unknown"), + "source_file": Path(source).name, + "similarity": round(max(0.0, 1 - closet_dist), 3), + "distance": round(closet_dist, 4), + "matched_via": "closet", + "closet_preview": closet_doc[:200], + }) + except Exception: + pass + + if len(hits) >= n_results: + break + + if hits: + return { + 
"query": query, + "filters": {"wing": wing, "room": room}, + "total_before_filter": len(closet_hits), + "results": hits, + } + + # Fallback: direct drawer search (no closets yet, or closets empty) try: kwargs = { "query_texts": [query], @@ -136,7 +203,7 @@ def search_memories( if where: kwargs["where"] = where - results = col.query(**kwargs) + results = drawers_col.query(**kwargs) except Exception as e: return {"error": f"Search error: {e}"}