2026-04-04 18:16:04 -07:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
"""
|
|
|
|
|
|
searcher.py — Find anything. Exact words.
|
|
|
|
|
|
|
2026-04-13 18:40:36 -03:00
|
|
|
|
Hybrid search: BM25 keyword matching + vector semantic similarity. The
|
|
|
|
|
|
drawer query is the floor — always runs — and closet hits add a rank-based
|
|
|
|
|
|
boost when they agree. Closets are a ranking *signal*, never a gate, so
|
|
|
|
|
|
weak closets (regex extraction on narrative content) can only help, never
|
|
|
|
|
|
hide drawers the direct path would have found.
|
2026-04-04 18:16:04 -07:00
|
|
|
|
"""
|
|
|
|
|
|
|
2026-04-07 17:38:53 -03:00
|
|
|
|
import logging
|
2026-04-13 01:47:19 -07:00
|
|
|
|
import math
|
|
|
|
|
|
import re
|
2026-04-04 18:16:04 -07:00
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
2026-04-13 17:00:55 -03:00
|
|
|
|
from .palace import get_closets_collection, get_collection
|
|
|
|
|
|
|
|
|
|
|
|
# Closet pointer line format: "topic|entities|→drawer_id_a,drawer_id_b"
# Multiple lines may join with newlines inside one closet document.
# The capture group grabs the comma-separated id list that follows each
# arrow. Only word characters and commas are accepted, so a match stops at
# the first character outside that set (whitespace, newline, "|", ...).
_CLOSET_DRAWER_REF_RE = re.compile(r"→([\w,]+)")
|
2026-04-04 18:16:04 -07:00
|
|
|
|
|
2026-04-07 17:38:53 -03:00
|
|
|
|
# Module logger. It uses the fixed name "mempalace_mcp" rather than __name__.
logger = logging.getLogger("mempalace_mcp")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SearchError(Exception):
    """Raised when search cannot proceed (e.g. no palace found).

    ``search`` raises this both when the collection cannot be opened and
    when the underlying query call fails.
    """
|
|
|
|
|
|
|
2026-04-04 18:16:04 -07:00
|
|
|
|
|
2026-04-13 17:37:45 -03:00
|
|
|
|
# A token is a run of two or more word characters (letters, digits,
# underscore); single-character runs are dropped.
_TOKEN_RE = re.compile(r"\w{2,}", re.UNICODE)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-15 09:26:38 +02:00
|
|
|
|
def _first_or_empty(results: dict, key: str) -> list:
|
|
|
|
|
|
"""Return the first inner list of a ChromaDB query result, or [].
|
|
|
|
|
|
|
|
|
|
|
|
ChromaDB returns shapes like ``{"documents": [["a", "b"]], ...}`` for a
|
|
|
|
|
|
successful query, but ``{"documents": [], ...}`` (empty outer list) when
|
|
|
|
|
|
the collection is empty or the filter excludes everything. Indexing
|
|
|
|
|
|
``[0]`` blindly raises IndexError in that case (issue #195).
|
|
|
|
|
|
"""
|
|
|
|
|
|
outer = results.get(key)
|
|
|
|
|
|
if not outer:
|
|
|
|
|
|
return []
|
|
|
|
|
|
return outer[0] or []
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-13 17:37:45 -03:00
|
|
|
|
def _tokenize(text: str) -> list:
    """Split ``text`` into lowercase word tokens of length ≥ 2.

    The text is lowercased first, then every match of the module-level
    token pattern is returned in order of appearance (duplicates kept).
    """
    lowered = text.lower()
    return _TOKEN_RE.findall(lowered)
|
2026-04-13 01:47:19 -07:00
|
|
|
|
|
|
|
|
|
|
|
2026-04-13 17:37:45 -03:00
|
|
|
|
def _bm25_scores(
    query: str,
    documents: list,
    k1: float = 1.5,
    b: float = 0.75,
) -> list:
    """Score every document against ``query`` with Okapi-BM25.

    The corpus for IDF purposes is exactly the ``documents`` passed in, using
    the smoothed, always non-negative form
    ``log((N - df + 0.5) / (df + 0.5) + 1)``. That makes the function suited
    to re-ranking a small candidate set: IDF measures how discriminative each
    query term is *among the candidates*.

    Parameters follow the usual Okapi-BM25 conventions:
        k1 — term-frequency saturation (1.2-2.0 typical, 1.5 default)
        b  — length normalization (0.0 = none, 1.0 = full, 0.75 default)

    Returns one float per document, in input order. All scores are 0.0 when
    the query has no tokens, there are no documents, or no document has any
    token.
    """
    doc_count = len(documents)
    wanted = set(_tokenize(query))
    if doc_count == 0 or not wanted:
        return [0.0] * doc_count

    token_lists = [_tokenize(text) for text in documents]
    lengths = [len(tokens) for tokens in token_lists]
    total_len = sum(lengths)
    if total_len == 0:
        return [0.0] * doc_count
    # total_len > 0 here, so the mean is strictly positive.
    mean_len = total_len / doc_count

    # Document frequency: number of documents containing each query term.
    doc_freq = dict.fromkeys(wanted, 0)
    for tokens in token_lists:
        for term in wanted.intersection(tokens):
            doc_freq[term] += 1

    idf = {}
    for term, containing in doc_freq.items():
        idf[term] = math.log((doc_count - containing + 0.5) / (containing + 0.5) + 1)

    def _score_document(tokens: list, length: int) -> float:
        if length == 0:
            return 0.0
        # Count query-term occurrences in first-seen order; that order also
        # fixes the floating-point accumulation order below.
        counts: dict = {}
        for token in tokens:
            if token in wanted:
                counts[token] = counts.get(token, 0) + 1
        length_norm = k1 * (1 - b + b * length / mean_len)
        total = 0.0
        for term, freq in counts.items():
            total += idf[term] * (freq * (k1 + 1)) / (freq + length_norm)
        return total

    return [_score_document(tokens, length) for tokens, length in zip(token_lists, lengths)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _hybrid_rank(
    results: list,
    query: str,
    vector_weight: float = 0.6,
    bm25_weight: float = 0.4,
) -> list:
    """Reorder ``results`` by a weighted blend of vector similarity and BM25.

    * The vector part is the absolute cosine similarity
      ``max(0, 1 - distance)`` read from each result's ``distance`` key
      (a missing key counts as distance 1.0, i.e. similarity 0). Being
      absolute, it does not shift when candidates are added or removed.
    * The keyword part is Okapi-BM25 computed over the candidates' ``text``.
      Raw BM25 is unbounded, so each score is divided by the largest score
      in the candidate set, putting it on a 0-1 scale comparable with the
      vector part. If no candidate scores above zero the keyword part is 0
      for everyone.

    Each result dict gains a ``bm25_score`` key (raw score rounded to three
    decimals). The list is reordered in place, best first, with ties keeping
    their incoming order; the same list object is returned.
    """
    if not results:
        return results

    raw_scores = _bm25_scores(query, [hit.get("text", "") for hit in results])
    top = max(raw_scores) if raw_scores else 0.0
    if top > 0:
        scaled = [score / top for score in raw_scores]
    else:
        scaled = [0.0] * len(raw_scores)

    blended = []
    for hit, raw, keyword_part in zip(results, raw_scores, scaled):
        cosine = max(0.0, 1.0 - hit.get("distance", 1.0))
        hit["bm25_score"] = round(raw, 3)
        blended.append((vector_weight * cosine + bm25_weight * keyword_part, hit))

    # sorted() is stable, so equal blends keep their original relative order.
    ranked = sorted(blended, key=lambda item: item[0], reverse=True)
    results[:] = [hit for _, hit in ranked]
    return results
|
2026-04-13 01:47:19 -07:00
|
|
|
|
|
|
|
|
|
|
|
2026-04-11 21:25:04 -07:00
|
|
|
|
def build_where_filter(wing: str = None, room: str = None) -> dict:
    """Build the ChromaDB ``where`` clause for optional wing/room filters.

    Returns ``{}`` when neither filter is given, a single-key dict when only
    one is given, and an ``$and`` of both (wing first) when both are given.
    Falsy values (``None``, ``""``) count as "not given".
    """
    clauses = []
    if wing:
        clauses.append({"wing": wing})
    if room:
        clauses.append({"room": room})

    if not clauses:
        return {}
    if len(clauses) == 1:
        return clauses[0]
    return {"$and": clauses}
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-13 17:00:55 -03:00
|
|
|
|
def _extract_drawer_ids_from_closet(closet_doc: str) -> list:
    """Collect every drawer id referenced by `→id_a,id_b` pointers in a closet.

    Ids are returned in order of first appearance; repeats and empty
    fragments (e.g. from a trailing comma) are skipped.
    """
    ordered = []
    known = set()
    for pointer in _CLOSET_DRAWER_REF_RE.finditer(closet_doc):
        for fragment in pointer.group(1).split(","):
            drawer_id = fragment.strip()
            if not drawer_id or drawer_id in known:
                continue
            known.add(drawer_id)
            ordered.append(drawer_id)
    return ordered
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-13 18:08:01 -03:00
|
|
|
|
def _expand_with_neighbors(drawers_col, matched_doc: str, matched_meta: dict, radius: int = 1):
|
|
|
|
|
|
"""Expand a matched drawer with its ±radius sibling chunks in the same source file.
|
|
|
|
|
|
|
|
|
|
|
|
Motivation — "drawer-grep context" feature: a closet hit returns one
|
|
|
|
|
|
drawer, but the chunk boundary may clip mid-thought (e.g., the matched
|
|
|
|
|
|
chunk says "here's a breakdown:" and the actual breakdown lives in the
|
|
|
|
|
|
next chunk). Fetching the small neighborhood around the match gives
|
|
|
|
|
|
callers enough context without forcing a follow-up ``get_drawer`` call.
|
|
|
|
|
|
|
|
|
|
|
|
Returns a dict with:
|
|
|
|
|
|
``text`` combined chunks in chunk_index order
|
|
|
|
|
|
``drawer_index`` the matched chunk's index in the source file
|
|
|
|
|
|
``total_drawers`` total drawer count for the source file (or None)
|
|
|
|
|
|
|
|
|
|
|
|
On any ChromaDB failure or missing metadata, falls back to returning the
|
|
|
|
|
|
matched drawer alone so search never breaks because neighbor expansion
|
|
|
|
|
|
failed.
|
|
|
|
|
|
"""
|
|
|
|
|
|
src = matched_meta.get("source_file")
|
|
|
|
|
|
chunk_idx = matched_meta.get("chunk_index")
|
|
|
|
|
|
if not src or not isinstance(chunk_idx, int):
|
|
|
|
|
|
return {"text": matched_doc, "drawer_index": chunk_idx, "total_drawers": None}
|
|
|
|
|
|
|
|
|
|
|
|
target_indexes = [chunk_idx + offset for offset in range(-radius, radius + 1)]
|
|
|
|
|
|
try:
|
|
|
|
|
|
neighbors = drawers_col.get(
|
|
|
|
|
|
where={
|
|
|
|
|
|
"$and": [
|
|
|
|
|
|
{"source_file": src},
|
|
|
|
|
|
{"chunk_index": {"$in": target_indexes}},
|
|
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
include=["documents", "metadatas"],
|
|
|
|
|
|
)
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
return {"text": matched_doc, "drawer_index": chunk_idx, "total_drawers": None}
|
|
|
|
|
|
|
|
|
|
|
|
indexed_docs = []
|
|
|
|
|
|
for doc, meta in zip(neighbors.get("documents") or [], neighbors.get("metadatas") or []):
|
|
|
|
|
|
ci = meta.get("chunk_index")
|
|
|
|
|
|
if isinstance(ci, int):
|
|
|
|
|
|
indexed_docs.append((ci, doc))
|
|
|
|
|
|
indexed_docs.sort(key=lambda pair: pair[0])
|
|
|
|
|
|
|
|
|
|
|
|
if not indexed_docs:
|
|
|
|
|
|
combined_text = matched_doc
|
|
|
|
|
|
else:
|
|
|
|
|
|
combined_text = "\n\n".join(doc for _, doc in indexed_docs)
|
|
|
|
|
|
|
|
|
|
|
|
# Cheap total_drawers lookup: metadata-only scan of the source file.
|
|
|
|
|
|
total_drawers = None
|
|
|
|
|
|
try:
|
|
|
|
|
|
all_meta = drawers_col.get(where={"source_file": src}, include=["metadatas"])
|
|
|
|
|
|
ids = all_meta.get("ids") or []
|
|
|
|
|
|
total_drawers = len(ids) if ids else None
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
|
"text": combined_text,
|
|
|
|
|
|
"drawer_index": chunk_idx,
|
|
|
|
|
|
"total_drawers": total_drawers,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-04 18:16:04 -07:00
|
|
|
|
def search(query: str, palace_path: str, wing: str = None, room: str = None, n_results: int = 5):
    """Search the palace and print verbatim drawer content to stdout.

    Optionally filter by wing (project) or room (aspect).

    Args:
        query: Search text passed to the collection query.
        palace_path: Path handed to ``get_collection``.
        wing: Optional wing filter.
        room: Optional room filter.
        n_results: Number of results requested from the collection.

    Returns:
        None. Results (or a "no results" notice) are printed.

    Raises:
        SearchError: if the collection cannot be opened or the query fails.
            The original exception is attached as ``__cause__`` in both cases.
    """
    try:
        col = get_collection(palace_path, create=False)
    except Exception as e:
        print(f"\n No palace found at {palace_path}")
        print(" Run: mempalace init <dir> then mempalace mine <dir>")
        # Chain the cause, matching the query-error branch below.
        raise SearchError(f"No palace found at {palace_path}") from e

    where = build_where_filter(wing, room)

    try:
        kwargs = {
            "query_texts": [query],
            "n_results": n_results,
            "include": ["documents", "metadatas", "distances"],
        }
        if where:
            kwargs["where"] = where

        results = col.query(**kwargs)

    except Exception as e:
        print(f"\n Search error: {e}")
        raise SearchError(f"Search error: {e}") from e

    docs = _first_or_empty(results, "documents")
    metas = _first_or_empty(results, "metadatas")
    dists = _first_or_empty(results, "distances")

    if not docs:
        print(f'\n No results found for: "{query}"')
        return

    print(f"\n{'=' * 60}")
    print(f' Results for: "{query}"')
    if wing:
        print(f" Wing: {wing}")
    if room:
        print(f" Room: {room}")
    print(f"{'=' * 60}\n")

    for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists), 1):
        # A record without metadata should still print, with "?" placeholders.
        meta = meta or {}
        similarity = round(max(0.0, 1 - dist), 3)
        # Show only the file name; fall back to "?" when the path is absent or empty.
        source = Path(meta.get("source_file") or "?").name
        wing_name = meta.get("wing", "?")
        room_name = meta.get("room", "?")

        print(f" [{i}] {wing_name} / {room_name}")
        print(f" Source: {source}")
        print(f" Match: {similarity}")
        print()
        # Print the verbatim text, indented
        for line in doc.strip().split("\n"):
            print(f" {line}")
        print()
        print(f" {'─' * 56}")

    print()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_memories(
    query: str,
    palace_path: str,
    wing: str = None,
    room: str = None,
    n_results: int = 5,
    max_distance: float = 0.0,
) -> dict:
    """Programmatic search — returns a dict instead of printing.

    Used by the MCP server and other callers that need data.

    Args:
        query: Natural language search query.
        palace_path: Path to the ChromaDB palace directory.
        wing: Optional wing filter.
        room: Optional room filter.
        n_results: Max results to return.
        max_distance: Max cosine distance threshold. The palace collection uses
            cosine distance (hnsw:space=cosine) — 0 = identical, 2 = opposite.
            Results with distance > this value are filtered out. A value of
            0.0 disables filtering. Typical useful range: 0.3–1.0.

    Returns:
        On success, a dict with ``query``, ``filters``, ``total_before_filter``
        (number of drawers the over-fetched query returned) and ``results``
        (list of hit dicts, best first). On failure, a dict with an ``error``
        key (plus ``hint`` when the palace could not be opened); this
        function reports those failures through the return value rather
        than raising.
    """
    try:
        drawers_col = get_collection(palace_path, create=False)
    except Exception as e:
        logger.error("No palace found at %s: %s", palace_path, e)
        return {
            "error": "No palace found",
            "hint": "Run: mempalace init <dir> && mempalace mine <dir>",
        }

    where = build_where_filter(wing, room)

    # Hybrid retrieval: always query drawers directly (the floor), then use
    # closet hits to boost rankings. Closets are a ranking SIGNAL, never a
    # GATE — direct drawer search is always the baseline.
    #
    # This avoids the "weak-closets regression" where narrative content
    # produces low-signal closets (regex extraction matches few topics)
    # and closet-first routing hides drawers that direct search would find.
    try:
        dkwargs = {
            "query_texts": [query],
            "n_results": n_results * 3,  # over-fetch for re-ranking
            "include": ["documents", "metadatas", "distances"],
        }
        if where:
            dkwargs["where"] = where
        drawer_results = drawers_col.query(**dkwargs)
    except Exception as e:
        # Log like the palace-missing branch so the failure is not visible
        # only to the caller.
        logger.error("Search error in %s: %s", palace_path, e)
        return {"error": f"Search error: {e}"}

    # Gather closet hits (best-per-source) to build a boost lookup.
    closet_boost_by_source: dict = {}  # source_file -> (rank, closet_dist, preview)
    try:
        closets_col = get_closets_collection(palace_path, create=False)
        ckwargs = {
            "query_texts": [query],
            "n_results": n_results * 2,
            "include": ["documents", "metadatas", "distances"],
        }
        if where:
            ckwargs["where"] = where
        closet_results = closets_col.query(**ckwargs)
        for rank, (cdoc, cmeta, cdist) in enumerate(
            zip(
                _first_or_empty(closet_results, "documents"),
                _first_or_empty(closet_results, "metadatas"),
                _first_or_empty(closet_results, "distances"),
            )
        ):
            source = cmeta.get("source_file", "")
            if source and source not in closet_boost_by_source:
                closet_boost_by_source[source] = (rank, cdist, cdoc[:200])
    except Exception as e:
        # no closets yet — hybrid degrades to pure drawer search. Any boosts
        # collected before the failure are kept.
        logger.debug("Closet lookup skipped for %s: %s", palace_path, e)

    # Rank-based boost. The ordinal signal ("which closet matched best") is
    # more reliable than absolute distance on narrative content, where
    # closet distances cluster in 1.2-1.5 range regardless of match quality.
    CLOSET_RANK_BOOSTS = [0.40, 0.25, 0.15, 0.08, 0.04]
    CLOSET_DISTANCE_CAP = 1.5  # cosine dist > 1.5 = too weak to use as signal

    scored: list = []
    for doc, meta, dist in zip(
        _first_or_empty(drawer_results, "documents"),
        _first_or_empty(drawer_results, "metadatas"),
        _first_or_empty(drawer_results, "distances"),
    ):
        # Filter on raw distance before rounding to avoid precision loss.
        if max_distance > 0.0 and dist > max_distance:
            continue

        # A drawer without metadata is still a valid hit; use placeholders.
        meta = meta or {}
        source = meta.get("source_file", "") or ""
        boost = 0.0
        matched_via = "drawer"
        closet_preview = None
        if source in closet_boost_by_source:
            c_rank, c_dist, c_preview = closet_boost_by_source[source]
            if c_dist <= CLOSET_DISTANCE_CAP and c_rank < len(CLOSET_RANK_BOOSTS):
                boost = CLOSET_RANK_BOOSTS[c_rank]
                matched_via = "drawer+closet"
                closet_preview = c_preview

        effective_dist = dist - boost
        entry = {
            "text": doc,
            "wing": meta.get("wing", "unknown"),
            "room": meta.get("room", "unknown"),
            "source_file": Path(source).name if source else "?",
            "created_at": meta.get("filed_at", "unknown"),
            "similarity": round(max(0.0, 1 - effective_dist), 3),
            "distance": round(dist, 4),
            "effective_distance": round(effective_dist, 4),
            "closet_boost": round(boost, 3),
            "matched_via": matched_via,
            # Internal: retain the full source_file path + chunk_index so the
            # enrichment step below doesn't have to reverse-lookup via
            # basename-suffix matching (which silently collides when two
            # files share a basename across different directories).
            "_sort_key": effective_dist,
            "_source_file_full": source,
            "_chunk_index": meta.get("chunk_index"),
        }
        if closet_preview:
            entry["closet_preview"] = closet_preview
        scored.append(entry)

    scored.sort(key=lambda h: h["_sort_key"])
    hits = scored[:n_results]

    # Drawer-grep enrichment: for closet-boosted hits whose source has
    # multiple drawers, return the keyword-best chunk + its immediate
    # neighbors instead of just the drawer vector search landed on. The
    # closet said "this source is relevant"; vector may have picked the
    # wrong chunk within it; grep picks the right one.
    MAX_HYDRATION_CHARS = 10000
    # The query tokens are the same for every hit, so tokenize once. Skipped
    # when there are no hits, since nothing below would tokenize either.
    query_terms = set(_tokenize(query)) if hits else set()
    for h in hits:
        if h["matched_via"] == "drawer":
            continue
        full_source = h.get("_source_file_full") or ""
        if not full_source:
            continue
        try:
            source_drawers = drawers_col.get(
                where={"source_file": full_source},
                include=["documents", "metadatas"],
            )
        except Exception:
            # Enrichment is best-effort: keep the hit as vector search found it.
            continue
        docs = source_drawers.get("documents") or []
        metas_ = source_drawers.get("metadatas") or []
        if len(docs) <= 1:
            continue

        # Sort by chunk_index so best_idx + neighbors are positional.
        indexed = []
        for idx, (d, m) in enumerate(zip(docs, metas_)):
            ci = m.get("chunk_index", idx) if isinstance(m, dict) else idx
            if not isinstance(ci, int):
                ci = idx
            indexed.append((ci, d))
        indexed.sort(key=lambda p: p[0])
        ordered_docs = [d for _, d in indexed]

        # Pick the chunk containing the most distinct query terms (substring
        # match on lowercased text); the earliest chunk wins ties.
        best_idx, best_score = 0, -1
        for idx, d in enumerate(ordered_docs):
            d_lower = d.lower()
            s = sum(1 for t in query_terms if t in d_lower)
            if s > best_score:
                best_score, best_idx = s, idx

        start = max(0, best_idx - 1)
        end = min(len(ordered_docs), best_idx + 2)
        expanded = "\n\n".join(ordered_docs[start:end])
        if len(expanded) > MAX_HYDRATION_CHARS:
            expanded = (
                expanded[:MAX_HYDRATION_CHARS]
                + f"\n\n[...truncated. {len(ordered_docs)} total drawers. "
                "Use mempalace_get_drawer for full content.]"
            )
        h["text"] = expanded
        # NOTE: this is the position within the chunk_index-sorted list, which
        # equals the stored chunk_index only when indexes are contiguous from 0.
        h["drawer_index"] = best_idx
        h["total_drawers"] = len(ordered_docs)

    # BM25 hybrid re-rank within the final candidate set.
    hits = _hybrid_rank(hits, query)
    # Strip the internal bookkeeping keys before handing results to callers.
    for h in hits:
        h.pop("_sort_key", None)
        h.pop("_source_file_full", None)
        h.pop("_chunk_index", None)

    return {
        "query": query,
        "filters": {"wing": wing, "room": room},
        "total_before_filter": len(_first_or_empty(drawer_results, "documents")),
        "results": hits,
    }
|