Files
mempalace/mempalace/searcher.py
T

492 lines
16 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
searcher.py — Find anything. Exact words.
Hybrid search: BM25 keyword matching + vector semantic similarity.
Searches closets first (fast index), then hydrates full drawer content.
Falls back to direct drawer search for palaces without closets.
"""
import logging
import math
import re
from pathlib import Path
from .palace import get_closets_collection, get_collection
# Closet pointer line format: "topic|entities|→drawer_id_a,drawer_id_b"
# Multiple lines may join with newlines inside one closet document.
_CLOSET_DRAWER_REF_RE = re.compile(r"→([\w,]+)")
logger = logging.getLogger("mempalace_mcp")
class SearchError(Exception):
"""Raised when search cannot proceed (e.g. no palace found)."""
_TOKEN_RE = re.compile(r"\w{2,}", re.UNICODE)
def _tokenize(text: str) -> list:
"""Lowercase + strip to alphanumeric tokens of length ≥ 2."""
return _TOKEN_RE.findall(text.lower())
def _bm25_scores(
query: str,
documents: list,
k1: float = 1.5,
b: float = 0.75,
) -> list:
"""Compute Okapi-BM25 scores for ``query`` against each document.
IDF is computed over the *provided corpus* using the Lucene/BM25+
smoothed formula ``log((N - df + 0.5) / (df + 0.5) + 1)``, which is
always non-negative. This is well-defined for re-ranking a small
candidate set returned by vector retrieval — IDF then reflects how
discriminative each query term is *within the candidates*, exactly
what's needed to reorder them.
Parameters mirror Okapi-BM25 conventions:
k1 — term-frequency saturation (1.2-2.0 typical, 1.5 default)
b — length normalization (0.0 = none, 1.0 = full, 0.75 default)
Returns a list of scores in the same order as ``documents``.
"""
n_docs = len(documents)
query_terms = set(_tokenize(query))
if not query_terms or n_docs == 0:
return [0.0] * n_docs
tokenized = [_tokenize(d) for d in documents]
doc_lens = [len(toks) for toks in tokenized]
if not any(doc_lens):
return [0.0] * n_docs
avgdl = sum(doc_lens) / n_docs or 1.0
# Document frequency: how many docs contain each query term?
df = {term: 0 for term in query_terms}
for toks in tokenized:
seen = set(toks) & query_terms
for term in seen:
df[term] += 1
idf = {term: math.log((n_docs - df[term] + 0.5) / (df[term] + 0.5) + 1) for term in query_terms}
scores = []
for toks, dl in zip(tokenized, doc_lens):
if dl == 0:
scores.append(0.0)
continue
tf: dict = {}
for t in toks:
if t in query_terms:
tf[t] = tf.get(t, 0) + 1
score = 0.0
for term, freq in tf.items():
num = freq * (k1 + 1)
den = freq + k1 * (1 - b + b * dl / avgdl)
score += idf[term] * num / den
scores.append(score)
return scores
def _hybrid_rank(
results: list,
query: str,
vector_weight: float = 0.6,
bm25_weight: float = 0.4,
) -> list:
"""Re-rank ``results`` by a convex combination of vector similarity and BM25.
* Vector similarity uses absolute cosine sim ``max(0, 1 - distance)`` —
ChromaDB's hnsw cosine distance lives in ``[0, 2]`` (0 = identical).
Absolute (not relative-to-max) means adding/removing a candidate
can't reshuffle the others.
* BM25 is real Okapi-BM25 with corpus-relative IDF over the candidates
themselves. Since the absolute scale is unbounded, BM25 is min-max
normalized within the candidate set so weights are commensurable.
Mutates each result dict to add ``bm25_score`` and reorders the list
in place. Returns the same list for convenience.
"""
if not results:
return results
docs = [r.get("text", "") for r in results]
bm25_raw = _bm25_scores(query, docs)
max_bm25 = max(bm25_raw) if bm25_raw else 0.0
bm25_norm = [s / max_bm25 for s in bm25_raw] if max_bm25 > 0 else [0.0] * len(bm25_raw)
scored = []
for r, raw, norm in zip(results, bm25_raw, bm25_norm):
vec_sim = max(0.0, 1.0 - r.get("distance", 1.0))
r["bm25_score"] = round(raw, 3)
scored.append((vector_weight * vec_sim + bm25_weight * norm, r))
scored.sort(key=lambda pair: pair[0], reverse=True)
results[:] = [r for _, r in scored]
return results
def build_where_filter(wing: str = None, room: str = None) -> dict:
"""Build ChromaDB where filter for wing/room filtering."""
if wing and room:
return {"$and": [{"wing": wing}, {"room": room}]}
elif wing:
return {"wing": wing}
elif room:
return {"room": room}
return {}
def _extract_drawer_ids_from_closet(closet_doc: str) -> list:
"""Parse all `→drawer_id_a,drawer_id_b` pointers out of a closet document.
Preserves order and dedupes.
"""
seen: dict = {}
for match in _CLOSET_DRAWER_REF_RE.findall(closet_doc):
for did in match.split(","):
did = did.strip()
if did and did not in seen:
seen[did] = None
return list(seen.keys())
def _expand_with_neighbors(drawers_col, matched_doc: str, matched_meta: dict, radius: int = 1):
"""Expand a matched drawer with its ±radius sibling chunks in the same source file.
Motivation — "drawer-grep context" feature: a closet hit returns one
drawer, but the chunk boundary may clip mid-thought (e.g., the matched
chunk says "here's a breakdown:" and the actual breakdown lives in the
next chunk). Fetching the small neighborhood around the match gives
callers enough context without forcing a follow-up ``get_drawer`` call.
Returns a dict with:
``text`` combined chunks in chunk_index order
``drawer_index`` the matched chunk's index in the source file
``total_drawers`` total drawer count for the source file (or None)
On any ChromaDB failure or missing metadata, falls back to returning the
matched drawer alone so search never breaks because neighbor expansion
failed.
"""
src = matched_meta.get("source_file")
chunk_idx = matched_meta.get("chunk_index")
if not src or not isinstance(chunk_idx, int):
return {"text": matched_doc, "drawer_index": chunk_idx, "total_drawers": None}
target_indexes = [chunk_idx + offset for offset in range(-radius, radius + 1)]
try:
neighbors = drawers_col.get(
where={
"$and": [
{"source_file": src},
{"chunk_index": {"$in": target_indexes}},
]
},
include=["documents", "metadatas"],
)
except Exception:
return {"text": matched_doc, "drawer_index": chunk_idx, "total_drawers": None}
indexed_docs = []
for doc, meta in zip(neighbors.get("documents") or [], neighbors.get("metadatas") or []):
ci = meta.get("chunk_index")
if isinstance(ci, int):
indexed_docs.append((ci, doc))
indexed_docs.sort(key=lambda pair: pair[0])
if not indexed_docs:
combined_text = matched_doc
else:
combined_text = "\n\n".join(doc for _, doc in indexed_docs)
# Cheap total_drawers lookup: metadata-only scan of the source file.
total_drawers = None
try:
all_meta = drawers_col.get(where={"source_file": src}, include=["metadatas"])
ids = all_meta.get("ids") or []
total_drawers = len(ids) if ids else None
except Exception:
pass
return {
"text": combined_text,
"drawer_index": chunk_idx,
"total_drawers": total_drawers,
}
def _closet_first_hits(
palace_path: str,
query: str,
where: dict,
drawers_col,
n_results: int,
max_distance: float,
):
"""Run a closet-first search and return chunk-level drawer hits.
Returns:
non-empty list of hits when the closet path produced usable matches.
``None`` when the closet collection is empty/missing OR when every
candidate drawer was filtered out (e.g. by max_distance); the
caller should fall back to direct drawer search.
"""
try:
closets_col = get_closets_collection(palace_path, create=False)
except Exception:
return None
try:
ckwargs = {
"query_texts": [query],
"n_results": max(n_results * 2, 5),
"include": ["documents", "metadatas", "distances"],
}
if where:
ckwargs["where"] = where
closet_results = closets_col.query(**ckwargs)
except Exception:
return None
closet_docs = closet_results["documents"][0] if closet_results["documents"] else []
if not closet_docs:
return None
closet_metas = closet_results["metadatas"][0]
closet_dists = closet_results["distances"][0]
# Collect candidate drawer IDs in closet-rank order, dedupe, remember
# which closet (and its distance/preview) introduced each one.
drawer_id_order: list = []
drawer_provenance: dict = {}
for cdoc, cmeta, cdist in zip(closet_docs, closet_metas, closet_dists):
for did in _extract_drawer_ids_from_closet(cdoc):
if did in drawer_provenance:
continue
drawer_provenance[did] = (cdist, cdoc, cmeta)
drawer_id_order.append(did)
if not drawer_id_order:
return None
# Hydrate exactly those drawers — chunk-level, not whole-file.
try:
fetched = drawers_col.get(
ids=drawer_id_order,
include=["documents", "metadatas"],
)
except Exception:
return None
fetched_ids = fetched.get("ids") or []
fetched_docs = fetched.get("documents") or []
fetched_metas = fetched.get("metadatas") or []
fetched_map = {
did: (doc, meta) for did, doc, meta in zip(fetched_ids, fetched_docs, fetched_metas)
}
hits: list = []
for did in drawer_id_order:
if did not in fetched_map:
continue # closet pointed to a drawer that no longer exists
doc, meta = fetched_map[did]
cdist, cdoc, _ = drawer_provenance[did]
if max_distance > 0.0 and cdist > max_distance:
continue
# Expand with ±1 neighbor chunks from the same source file so a
# closet hit that lands mid-thought still returns enough context to
# be useful without a follow-up get_drawer call.
expansion = _expand_with_neighbors(drawers_col, doc, meta, radius=1)
hits.append(
{
"text": expansion["text"],
"wing": meta.get("wing", "unknown"),
"room": meta.get("room", "unknown"),
"source_file": Path(meta.get("source_file", "?")).name,
"similarity": round(max(0.0, 1 - cdist), 3),
"distance": round(cdist, 4),
"matched_via": "closet",
"closet_preview": cdoc[:200],
"drawer_index": expansion["drawer_index"],
"total_drawers": expansion["total_drawers"],
}
)
if len(hits) >= n_results:
break
return hits if hits else None
def search(query: str, palace_path: str, wing: str = None, room: str = None, n_results: int = 5):
"""
Search the palace. Returns verbatim drawer content.
Optionally filter by wing (project) or room (aspect).
"""
try:
2026-04-11 19:16:49 -04:00
col = get_collection(palace_path, create=False)
except Exception:
print(f"\n No palace found at {palace_path}")
print(" Run: mempalace init <dir> then mempalace mine <dir>")
raise SearchError(f"No palace found at {palace_path}")
where = build_where_filter(wing, room)
try:
kwargs = {
"query_texts": [query],
"n_results": n_results,
"include": ["documents", "metadatas", "distances"],
}
if where:
kwargs["where"] = where
results = col.query(**kwargs)
except Exception as e:
print(f"\n Search error: {e}")
raise SearchError(f"Search error: {e}") from e
docs = results["documents"][0]
metas = results["metadatas"][0]
dists = results["distances"][0]
if not docs:
print(f'\n No results found for: "{query}"')
return
print(f"\n{'=' * 60}")
print(f' Results for: "{query}"')
if wing:
print(f" Wing: {wing}")
if room:
print(f" Room: {room}")
print(f"{'=' * 60}\n")
for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists), 1):
similarity = round(max(0.0, 1 - dist), 3)
source = Path(meta.get("source_file", "?")).name
wing_name = meta.get("wing", "?")
room_name = meta.get("room", "?")
print(f" [{i}] {wing_name} / {room_name}")
print(f" Source: {source}")
print(f" Match: {similarity}")
print()
# Print the verbatim text, indented
for line in doc.strip().split("\n"):
print(f" {line}")
print()
print(f" {'' * 56}")
print()
def search_memories(
query: str,
palace_path: str,
wing: str = None,
room: str = None,
n_results: int = 5,
max_distance: float = 0.0,
) -> dict:
"""Programmatic search — returns a dict instead of printing.
Used by the MCP server and other callers that need data.
Args:
query: Natural language search query.
palace_path: Path to the ChromaDB palace directory.
wing: Optional wing filter.
room: Optional room filter.
n_results: Max results to return.
max_distance: Max cosine distance threshold. The palace collection uses
cosine distance (hnsw:space=cosine) — 0 = identical, 2 = opposite.
Results with distance > this value are filtered out. A value of
0.0 disables filtering. Typical useful range: 0.31.0.
"""
try:
drawers_col = get_collection(palace_path, create=False)
except Exception as e:
logger.error("No palace found at %s: %s", palace_path, e)
return {
"error": "No palace found",
"hint": "Run: mempalace init <dir> && mempalace mine <dir>",
}
where = build_where_filter(wing, room)
# Closet-first search: scan the compact index, parse drawer pointers
# from each matching line, then hydrate exactly those drawers. This
# keeps the result shape chunk-level (consistent with direct search)
# and applies the same max_distance filter.
closet_hits = _closet_first_hits(
palace_path=palace_path,
query=query,
where=where,
drawers_col=drawers_col,
n_results=n_results,
max_distance=max_distance,
)
if closet_hits is not None:
# Re-rank chunk-level closet hits with the same hybrid scoring as
# the direct path. The vector half here uses the closet's distance
# (query↔topic-line) — that's intentional: closets are *meant* to
# be the semantic-narrowing signal, and BM25 then enforces actual
# keyword presence in the hydrated drawer text.
closet_hits = _hybrid_rank(closet_hits, query)
return {
"query": query,
"filters": {"wing": wing, "room": room},
"total_before_filter": len(closet_hits),
"results": closet_hits,
}
# Fallback: direct drawer search (no closets yet, or closets empty)
try:
kwargs = {
"query_texts": [query],
"n_results": n_results,
"include": ["documents", "metadatas", "distances"],
}
if where:
kwargs["where"] = where
results = drawers_col.query(**kwargs)
except Exception as e:
return {"error": f"Search error: {e}"}
docs = results["documents"][0]
metas = results["metadatas"][0]
dists = results["distances"][0]
hits = []
for doc, meta, dist in zip(docs, metas, dists):
# Filter on raw distance before rounding to avoid precision loss
if max_distance > 0.0 and dist > max_distance:
continue
hits.append(
{
"text": doc,
"wing": meta.get("wing", "unknown"),
"room": meta.get("room", "unknown"),
"source_file": Path(meta.get("source_file", "?")).name,
"similarity": round(max(0.0, 1 - dist), 3),
"distance": round(dist, 4),
"matched_via": "drawer",
}
)
# Re-rank with BM25 hybrid scoring
hits = _hybrid_rank(hits, query)
return {
"query": query,
"filters": {"wing": wing, "room": room},
"total_before_filter": len(docs),
"results": hits,
}