feat: entity metadata + diary ingest + BM25 hybrid search
Three features that close the gap between the architecture docs
and the actual codebase:
1. Entity metadata on drawers and closets
- _extract_entities_for_metadata() pulls names from known_entities.json
+ proper nouns appearing 2+ times
- Stamped as "entities" field in ChromaDB metadata
- Enables filterable search by person/project name
2. Day-based diary ingest (diary_ingest.py)
- ONE drawer per day, upserted as the day grows
- Closets pack topics atomically, never split mid-topic
- Tracks entry count in state file, only processes new entries
- Usage: python -m mempalace.diary_ingest --dir ~/summaries
3. BM25 hybrid search in searcher.py
- _bm25_score() keyword matching complements vector similarity
- _hybrid_rank() combines both signals (60% vector, 40% BM25)
- Catches exact name/term matches that embeddings miss
- Applied to both closet-first and direct drawer search paths
689/689 tests pass.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,173 @@
|
||||
"""
|
||||
diary_ingest.py — Ingest daily summary files into the palace.
|
||||
|
||||
Architecture:
|
||||
- ONE drawer per day — full verbatim content, upserted as the day grows
|
||||
- Closets pack topics up to 1500 chars, never split mid-topic
|
||||
- Only new entries are processed (tracks entry count in state file)
|
||||
- Entities extracted and stamped on metadata for filterable search
|
||||
|
||||
Usage:
|
||||
python -m mempalace.diary_ingest --dir ~/daily_summaries --palace ~/.mempalace/palace
|
||||
python -m mempalace.diary_ingest --dir ~/daily_summaries --palace ~/.mempalace/palace --force
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from .palace import (
|
||||
get_collection,
|
||||
get_closets_collection,
|
||||
build_closet_lines,
|
||||
upsert_closet_lines,
|
||||
CLOSET_CHAR_LIMIT,
|
||||
)
|
||||
from .miner import _extract_entities_for_metadata
|
||||
|
||||
|
||||
DIARY_ENTRY_RE = re.compile(r"^## .+", re.MULTILINE)
|
||||
|
||||
|
||||
def _split_entries(text):
|
||||
"""Split diary text into (header, body) pairs per ## entry."""
|
||||
parts = DIARY_ENTRY_RE.split(text)
|
||||
headers = DIARY_ENTRY_RE.findall(text)
|
||||
entries = []
|
||||
for i, header in enumerate(headers):
|
||||
body = parts[i + 1] if i + 1 < len(parts) else ""
|
||||
entries.append((header.strip(), body.strip()))
|
||||
return entries
|
||||
|
||||
|
||||
def ingest_diaries(
|
||||
diary_dir,
|
||||
palace_path,
|
||||
wing="diary",
|
||||
force=False,
|
||||
):
|
||||
"""Ingest daily summary files into the palace.
|
||||
|
||||
Each date file gets ONE drawer (upserted as day grows) and
|
||||
closets that pack topics atomically up to 1500 chars.
|
||||
"""
|
||||
diary_dir = Path(diary_dir).expanduser().resolve()
|
||||
if not diary_dir.exists():
|
||||
print(f"Diary directory not found: {diary_dir}")
|
||||
return
|
||||
|
||||
diary_files = sorted(diary_dir.glob("*.md"))
|
||||
if not diary_files:
|
||||
print(f"No .md files in {diary_dir}")
|
||||
return
|
||||
|
||||
# State tracks which entries have been closeted per file
|
||||
state_file = diary_dir / ".diary_ingest_state.json"
|
||||
state = {} if force else (
|
||||
json.loads(state_file.read_text()) if state_file.exists() else {}
|
||||
)
|
||||
|
||||
drawers_col = get_collection(palace_path)
|
||||
closets_col = get_closets_collection(palace_path)
|
||||
|
||||
days_updated = 0
|
||||
closets_created = 0
|
||||
|
||||
for diary_path in diary_files:
|
||||
text = diary_path.read_text(encoding="utf-8", errors="replace")
|
||||
if len(text.strip()) < 50:
|
||||
continue
|
||||
|
||||
date_match = re.match(r"(\d{4}-\d{2}-\d{2})", diary_path.stem)
|
||||
if not date_match:
|
||||
continue
|
||||
date_str = date_match.group(1)
|
||||
|
||||
# Skip if content hasn't changed
|
||||
prev_size = state.get(diary_path.name, {}).get("size", 0)
|
||||
curr_size = len(text)
|
||||
if curr_size == prev_size and not force:
|
||||
continue
|
||||
|
||||
now_iso = datetime.now(timezone.utc).isoformat()
|
||||
drawer_id = f"drawer_diary_{date_str}"
|
||||
|
||||
# Extract entities from full day text
|
||||
entities = _extract_entities_for_metadata(text)
|
||||
|
||||
# UPSERT the day's drawer (full verbatim, replaces as day grows)
|
||||
drawer_meta = {
|
||||
"date": date_str,
|
||||
"wing": wing,
|
||||
"room": "daily",
|
||||
"source_file": str(diary_path),
|
||||
"source_session": "daily_diary",
|
||||
"filed_at": now_iso,
|
||||
}
|
||||
if entities:
|
||||
drawer_meta["entities"] = entities
|
||||
drawers_col.upsert(
|
||||
documents=[text],
|
||||
ids=[drawer_id],
|
||||
metadatas=[drawer_meta],
|
||||
)
|
||||
|
||||
# Split into entries and find new ones
|
||||
entries = _split_entries(text)
|
||||
prev_entry_count = state.get(diary_path.name, {}).get("entry_count", 0)
|
||||
new_entries = entries[prev_entry_count:] if not force else entries
|
||||
|
||||
if new_entries:
|
||||
# Build closet lines from new entries
|
||||
all_lines = []
|
||||
for header, body in new_entries:
|
||||
entry_text = f"{header}\n{body}"
|
||||
entry_lines = build_closet_lines(
|
||||
str(diary_path), [drawer_id], entry_text, wing, "daily"
|
||||
)
|
||||
all_lines.extend(entry_lines)
|
||||
|
||||
if all_lines:
|
||||
closet_id_base = f"closet_diary_{date_str}"
|
||||
closet_meta = {
|
||||
"date": date_str,
|
||||
"wing": wing,
|
||||
"room": "daily",
|
||||
"source_file": str(diary_path),
|
||||
"filed_at": now_iso,
|
||||
}
|
||||
if entities:
|
||||
closet_meta["entities"] = entities
|
||||
n = upsert_closet_lines(
|
||||
closets_col, closet_id_base, all_lines, closet_meta
|
||||
)
|
||||
closets_created += n
|
||||
|
||||
state[diary_path.name] = {
|
||||
"size": curr_size,
|
||||
"entry_count": len(entries),
|
||||
"ingested_at": now_iso,
|
||||
}
|
||||
days_updated += 1
|
||||
|
||||
state_file.write_text(json.dumps(state, indent=2))
|
||||
if days_updated:
|
||||
print(f"Diary: {days_updated} days updated, {closets_created} new closets")
|
||||
|
||||
return {"days_updated": days_updated, "closets_created": closets_created}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="Ingest daily summaries into the palace")
|
||||
parser.add_argument("--dir", required=True, help="Path to daily_summaries directory")
|
||||
parser.add_argument("--palace", default=os.path.expanduser("~/.mempalace/palace"))
|
||||
parser.add_argument("--wing", default="diary")
|
||||
parser.add_argument("--force", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
ingest_diaries(args.dir, args.palace, wing=args.wing, force=args.force)
|
||||
+47
-2
@@ -371,6 +371,43 @@ def chunk_text(content: str, source_file: str) -> list:
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _extract_entities_for_metadata(content: str) -> str:
|
||||
"""Extract entity names from content for metadata tagging.
|
||||
|
||||
Returns semicolon-separated string of entity names found in the text,
|
||||
suitable for ChromaDB metadata filtering.
|
||||
"""
|
||||
import re
|
||||
# Load known entities from registry if available
|
||||
known_names = set()
|
||||
registry_path = os.path.join(os.path.expanduser("~"), ".mempalace", "known_entities.json")
|
||||
if os.path.exists(registry_path):
|
||||
try:
|
||||
import json
|
||||
kd = json.loads(open(registry_path).read())
|
||||
for cat in kd.values():
|
||||
if isinstance(cat, list):
|
||||
known_names.update(cat)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
matched = set()
|
||||
# Match known entities
|
||||
for name in known_names:
|
||||
if re.search(r'(?<!\w)' + re.escape(name) + r'(?!\w)', content):
|
||||
matched.add(name)
|
||||
# Also catch capitalized words appearing 2+ times (likely proper nouns)
|
||||
words = re.findall(r"\b[A-Z][a-z]{2,}\b", content[:5000])
|
||||
freq = {}
|
||||
for w in words:
|
||||
freq[w] = freq.get(w, 0) + 1
|
||||
for w, c in freq.items():
|
||||
if c >= 2 and len(w) > 2:
|
||||
matched.add(w)
|
||||
|
||||
return ";".join(sorted(matched))[:500] if matched else ""
|
||||
|
||||
|
||||
def add_drawer(
|
||||
collection, wing: str, room: str, content: str, source_file: str, chunk_index: int, agent: str
|
||||
):
|
||||
@@ -390,6 +427,10 @@ def add_drawer(
|
||||
metadata["source_mtime"] = os.path.getmtime(source_file)
|
||||
except OSError:
|
||||
pass
|
||||
# Tag with entity names for filterable search
|
||||
entities = _extract_entities_for_metadata(content)
|
||||
if entities:
|
||||
metadata["entities"] = entities
|
||||
collection.upsert(
|
||||
documents=[content],
|
||||
ids=[drawer_id],
|
||||
@@ -479,13 +520,17 @@ def process_file(
|
||||
]
|
||||
closet_lines = build_closet_lines(source_file, drawer_ids, content, wing, room)
|
||||
closet_id_base = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}"
|
||||
upsert_closet_lines(closets_col, closet_id_base, closet_lines, {
|
||||
entities = _extract_entities_for_metadata(content)
|
||||
closet_meta = {
|
||||
"wing": wing,
|
||||
"room": room,
|
||||
"source_file": source_file,
|
||||
"drawer_count": drawers_added,
|
||||
"filed_at": datetime.now().isoformat(),
|
||||
})
|
||||
}
|
||||
if entities:
|
||||
closet_meta["entities"] = entities
|
||||
upsert_closet_lines(closets_col, closet_id_base, closet_lines, closet_meta)
|
||||
|
||||
return drawers_added, room
|
||||
|
||||
|
||||
+62
-2
@@ -2,11 +2,14 @@
|
||||
"""
|
||||
searcher.py — Find anything. Exact words.
|
||||
|
||||
Semantic search against the palace.
|
||||
Returns verbatim text — the actual words, never summaries.
|
||||
Hybrid search: BM25 keyword matching + vector semantic similarity.
|
||||
Searches closets first (fast index), then hydrates full drawer content.
|
||||
Falls back to direct drawer search for palaces without closets.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
from .palace import get_collection, get_closets_collection
|
||||
@@ -18,6 +21,59 @@ class SearchError(Exception):
|
||||
"""Raised when search cannot proceed (e.g. no palace found)."""
|
||||
|
||||
|
||||
def _bm25_score(query: str, document: str, k1: float = 1.5, b: float = 0.75, avg_dl: float = 500) -> float:
|
||||
"""Simple BM25 score for a single document against a query.
|
||||
|
||||
This is a lightweight keyword-matching signal that complements vector
|
||||
similarity. It catches exact matches that embeddings might miss
|
||||
(e.g., specific names, project codes, error messages).
|
||||
"""
|
||||
query_terms = set(re.findall(r'\w{2,}', query.lower()))
|
||||
doc_terms = re.findall(r'\w{2,}', document.lower())
|
||||
if not query_terms or not doc_terms:
|
||||
return 0.0
|
||||
doc_len = len(doc_terms)
|
||||
term_freq = {}
|
||||
for t in doc_terms:
|
||||
term_freq[t] = term_freq.get(t, 0) + 1
|
||||
|
||||
score = 0.0
|
||||
for term in query_terms:
|
||||
tf = term_freq.get(term, 0)
|
||||
if tf > 0:
|
||||
# Simplified IDF — treat each query term as moderately rare
|
||||
idf = math.log(2.0)
|
||||
numerator = tf * (k1 + 1)
|
||||
denominator = tf + k1 * (1 - b + b * doc_len / avg_dl)
|
||||
score += idf * numerator / denominator
|
||||
return score
|
||||
|
||||
|
||||
def _hybrid_rank(vector_results, query: str, vector_weight: float = 0.6, bm25_weight: float = 0.4):
|
||||
"""Re-rank results using both vector distance and BM25 keyword score.
|
||||
|
||||
Returns results sorted by combined score (higher = better).
|
||||
"""
|
||||
if not vector_results:
|
||||
return vector_results
|
||||
|
||||
# Normalize vector distances to 0-1 similarity
|
||||
max_dist = max(r.get("distance", 1.0) for r in vector_results) or 1.0
|
||||
for r in vector_results:
|
||||
vec_sim = max(0.0, 1 - r.get("distance", 1.0) / max(max_dist, 0.001))
|
||||
bm25 = _bm25_score(query, r.get("text", ""))
|
||||
# Normalize BM25 to roughly 0-1 range
|
||||
bm25_norm = min(bm25 / 3.0, 1.0)
|
||||
r["_hybrid_score"] = vector_weight * vec_sim + bm25_weight * bm25_norm
|
||||
r["bm25_score"] = round(bm25, 3)
|
||||
|
||||
vector_results.sort(key=lambda r: r["_hybrid_score"], reverse=True)
|
||||
# Clean up internal field
|
||||
for r in vector_results:
|
||||
del r["_hybrid_score"]
|
||||
return vector_results
|
||||
|
||||
|
||||
def build_where_filter(wing: str = None, room: str = None) -> dict:
|
||||
"""Build ChromaDB where filter for wing/room filtering."""
|
||||
if wing and room:
|
||||
@@ -186,6 +242,8 @@ def search_memories(
|
||||
break
|
||||
|
||||
if hits:
|
||||
# Re-rank with BM25 hybrid scoring
|
||||
hits = _hybrid_rank(hits, query)
|
||||
return {
|
||||
"query": query,
|
||||
"filters": {"wing": wing, "room": room},
|
||||
@@ -227,6 +285,8 @@ def search_memories(
|
||||
}
|
||||
)
|
||||
|
||||
# Re-rank with BM25 hybrid scoring
|
||||
hits = _hybrid_rank(hits, query)
|
||||
return {
|
||||
"query": query,
|
||||
"filters": {"wing": wing, "room": room},
|
||||
|
||||
Reference in New Issue
Block a user