Merge pull request #1029 from eldar702/fix/searcher-effective-distance-clamp

fix(searcher): clamp effective_distance to valid cosine range [0, 2]
This commit is contained in:
Igor Lins e Silva
2026-05-06 03:20:59 -03:00
committed by GitHub
2 changed files with 65 additions and 1 deletion
+6 -1
View File
@@ -825,7 +825,12 @@ def search_memories(
matched_via = "drawer+closet"
closet_preview = c_preview
effective_dist = dist - boost
# Clamp to the valid cosine-distance range [0, 2]. When a strong
# closet boost (up to 0.40) exceeds the raw distance, the subtraction
# can go negative — which (a) yields ``similarity > 1.0`` downstream
# and (b) makes the sort key land *below* ordinary positive distances,
# inverting the ranking so the best hybrid matches sort last.
effective_dist = max(0.0, min(2.0, dist - boost))
entry = {
"text": doc,
"wing": meta.get("wing", "unknown"),
+59
View File
@@ -120,6 +120,65 @@ class TestSearchMemories:
assert none_hit["wing"] == "unknown"
assert none_hit["room"] == "unknown"
def test_effective_distance_clamped_to_valid_cosine_range(self):
    """Hybrid boost must never push a hit outside the cosine-distance range.

    Subtracting a strong closet boost (as large as 0.40) from a drawer whose
    raw distance is smaller than the boost makes ``dist - boost`` negative.
    Cosine distance is only valid in ``[0, 2]``; a negative value surfaces
    as ``similarity > 1.0`` in the API and drops the internal ``_sort_key``
    beneath ordinary positive distances, inverting the ranking so that the
    strongest hybrid matches end up last.

    Once clamped, ``effective_distance`` stays within ``[0, 2]``,
    ``similarity`` stays within ``[0, 1]``, and the ordering is stable.
    """
    # Two drawers: a.md is a near-exact match (base distance 0.08), while
    # b.md sits noticeably further away (base distance 0.35).
    drawer_metas = [
        {"source_file": name, "wing": "w", "room": "r", "chunk_index": 0}
        for name in ("a.md", "b.md")
    ]
    drawer_store = MagicMock()
    drawer_store.query.return_value = {
        "documents": [["doc-a", "doc-b"]],
        "metadatas": [drawer_metas],
        "distances": [[0.08, 0.35]],
        "ids": [["d-a", "d-b"]],
    }

    # One strong closet at rank 0 pointing at a.md → boost = 0.40. That is
    # larger than a.md's base distance, so the unclamped subtraction would
    # be negative. b.md has no closet at all.
    closet_store = MagicMock()
    closet_store.query.return_value = {
        "documents": [["closet-preview-a"]],
        "metadatas": [[{"source_file": "a.md"}]],
        "distances": [[0.2]],  # within CLOSET_DISTANCE_CAP (1.5)
        "ids": [["c-a"]],
    }

    drawers_patch = patch(
        "mempalace.searcher.get_collection",
        return_value=drawer_store,
    )
    closets_patch = patch(
        "mempalace.searcher.get_closets_collection",
        return_value=closet_store,
    )
    with drawers_patch, closets_patch:
        response = search_memories("query", "/fake/path", n_results=5)

    hits = response["results"]
    assert hits, "should return results"

    # Range invariants must hold for every returned hit.
    for hit in hits:
        similarity = hit["similarity"]
        assert (
            0.0 <= similarity <= 1.0
        ), f"similarity out of range: {hit['similarity']} for {hit['source_file']}"

        eff_distance = hit["effective_distance"]
        assert 0.0 <= eff_distance <= 2.0, (
            f"effective_distance out of range: {hit['effective_distance']} "
            f"for {hit['source_file']}"
        )

    # The closet-boosted a.md must still outrank b.md: the boost keeps
    # winning, it just can no longer flip the ranking.
    top_hit = hits[0]
    assert top_hit["source_file"] == "a.md"
    assert top_hit["matched_via"] == "drawer+closet"
# ── BM25 internals: None / empty document safety ─────────────────────