From 124f5bf7ba5eec986c3d26fbbc66d6f0584ef62a Mon Sep 17 00:00:00 2001 From: MSL <232237854+milla-jovovich@users.noreply.github.com> Date: Mon, 13 Apr 2026 01:40:58 -0700 Subject: [PATCH] fix: enforce atomic topics in closets, extract richer pointers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - upsert_closet replaced by upsert_closet_lines: checks each topic line individually against CLOSET_CHAR_LIMIT. If adding one line WHOLE would exceed the limit, starts a new closet. Never splits mid-topic. - build_closet_lines returns a list of atomic lines (not joined text) - Richer extraction: section headers, more action verbs, up to 3 quotes, up to 12 topics per file - Each line is complete: topic|entities|→drawer_refs Co-Authored-By: Claude Opus 4.6 (1M context) --- mempalace/miner.py | 9 ++-- mempalace/palace.py | 113 ++++++++++++++++++++++++++++++++------------ 2 files changed, 87 insertions(+), 35 deletions(-) diff --git a/mempalace/miner.py b/mempalace/miner.py index 8170362..37e507a 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -17,7 +17,7 @@ from collections import defaultdict from .palace import ( SKIP_DIRS, get_collection, get_closets_collection, - file_already_mined, mine_lock, build_closet_text, upsert_closet, + file_already_mined, mine_lock, build_closet_lines, upsert_closet_lines, ) READABLE_EXTENSIONS = { @@ -471,14 +471,15 @@ def process_file( drawers_added += 1 # Build closet — the searchable index pointing to these drawers + # Each topic line is atomic — never split across closets if closets_col and drawers_added > 0: drawer_ids = [ f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}" for c in chunks ] - closet_text = build_closet_text(source_file, drawer_ids, content, wing, room) - closet_id = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}" - upsert_closet(closets_col, closet_id, closet_text, { + closet_lines = build_closet_lines(source_file, drawer_ids, content, wing, room) + closet_id_base = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}" + upsert_closet_lines(closets_col, closet_id_base, closet_lines, { "wing": wing, "room": room, "source_file": source_file, diff --git a/mempalace/palace.py b/mempalace/palace.py index ef58a06..9bb08a5 100644 --- a/mempalace/palace.py +++ b/mempalace/palace.py @@ -60,58 +60,109 @@ def get_closets_collection(palace_path: str, create: bool = True): CLOSET_CHAR_LIMIT = 1500 # fill closet until ~1500 chars, then start a new one -def build_closet_text(source_file, drawer_ids, content, wing, room): - """Build a compact closet entry from drawer content. +def build_closet_lines(source_file, drawer_ids, content, wing, room): + """Build compact closet pointer lines from drawer content. - Extracts topics, names, and key quotes into an AAAK-style pointer - that tells the searcher which drawers to open. + Returns a LIST of lines (not joined). Each line is one complete topic + pointer — never split across closets. + + Format: topic|entities|→drawer_ids """ import re + from pathlib import Path + + drawer_ref = ",".join(drawer_ids[:3]) + # Extract proper nouns (capitalized words, 2+ occurrences) words = re.findall(r"\b[A-Z][a-z]{2,}\b", content[:5000]) word_freq = {} for w in words: word_freq[w] = word_freq.get(w, 0) + 1 - entities = sorted([w for w, c in word_freq.items() if c >= 2], key=lambda w: -word_freq[w])[:5] + entities = sorted( + [w for w, c in word_freq.items() if c >= 2], + key=lambda w: -word_freq[w], + )[:5] + entity_str = ";".join(entities) if entities else "" - # Extract key phrases + # Extract key phrases — action verbs + context topics = [] for pattern in [ - r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated)\s+[\w\s]{3,30}", + r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated|reviewed|deployed|configured|removed|updated)\s+[\w\s]{3,40}", ]: topics.extend(re.findall(pattern, content[:5000], re.IGNORECASE)) - topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:8] + # Also grab section headers if present + for header in re.findall(r"^#{1,3}\s+(.{5,60})$", content[:5000], re.MULTILINE): + topics.append(header.strip()) + # Dedupe preserving order + topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:12] - # Extract first quote - quotes = re.findall(r'"([^"]{15,100})"', content[:5000]) - quote = quotes[0] if quotes else "" + # Extract quotes + quotes = re.findall(r'"([^"]{15,150})"', content[:5000]) - # Build pointer lines - entity_str = ";".join(entities[:5]) if entities else "" + # Build pointer lines — each one is atomic, never split lines = [] for topic in topics: - pointer = f"{topic}|{entity_str}|→{','.join(drawer_ids[:3])}" - lines.append(pointer) - if quote: - lines.append(f'"{quote}"|{entity_str}|→{",".join(drawer_ids[:3])}') + lines.append(f"{topic}|{entity_str}|→{drawer_ref}") + for quote in quotes[:3]: + lines.append(f'"{quote}"|{entity_str}|→{drawer_ref}') + + # Always have at least one line if not lines: - lines.append(f"{wing}/{room}|{entity_str}|→{','.join(drawer_ids[:3])}") + name = Path(source_file).stem[:40] + lines.append(f"{wing}/{room}/{name}|{entity_str}|→{drawer_ref}") - return "\n".join(lines) + return lines -def upsert_closet(closets_col, closet_id, closet_text, metadata): - """Add or update a closet. Respects CLOSET_CHAR_LIMIT.""" - try: - existing = closets_col.get(ids=[closet_id]) - if existing.get("ids"): - old_text = existing["documents"][0] - if len(old_text) + len(closet_text) + 1 <= CLOSET_CHAR_LIMIT: - closet_text = old_text + "\n" + closet_text - # else: start fresh — old closet was full - except Exception: - pass - closets_col.upsert(documents=[closet_text], ids=[closet_id], metadatas=[metadata]) +def upsert_closet_lines(closets_col, closet_id_base, lines, metadata): + """Add topic lines to closets. Never splits a topic mid-line. + + If adding a line WHOLE would exceed CLOSET_CHAR_LIMIT, a new closet + is created. Some closets may have less than 1500 chars — that's fine. + Every topic is complete and readable. + + Returns the number of closets written. + """ + closet_num = 1 + current_lines = [] + current_chars = 0 + closets_written = 0 + + def _flush(): + nonlocal closets_written + if not current_lines: + return + closet_id = f"{closet_id_base}_{closet_num:02d}" + text = "\n".join(current_lines) + + # Check if closet already has content — append if room + try: + existing = closets_col.get(ids=[closet_id]) + if existing.get("ids") and existing["documents"][0]: + old = existing["documents"][0] + if len(old) + len(text) + 1 <= CLOSET_CHAR_LIMIT: + text = old + "\n" + text + except Exception: + pass + + closets_col.upsert(documents=[text], ids=[closet_id], metadatas=[metadata]) + closets_written += 1 + + for line in lines: + line_len = len(line) + # Would this line fit whole in the current closet? + if current_chars > 0 and current_chars + line_len + 1 > CLOSET_CHAR_LIMIT: + # Doesn't fit — flush current closet, start new one + _flush() + closet_num += 1 + current_lines = [] + current_chars = 0 + + current_lines.append(line) + current_chars += line_len + 1 # +1 for newline + + _flush() + return closets_written @contextlib.contextmanager