fix(searcher): clamp effective_distance to valid cosine range [0, 2]
``search_memories`` computes ``effective_dist = dist - boost`` where ``boost`` can be as large as ``CLOSET_RANK_BOOSTS[0] == 0.40`` for a rank-0 closet hit. When the raw drawer distance is small — any near-exact match — the subtraction goes negative. Two downstream effects: 1. Line 418 returns ``round(max(0.0, 1 - effective_dist), 3)`` as ``similarity``. With ``effective_dist = -0.30`` that yields ``similarity = 1.30``, outside the documented ``[0, 1]`` range. The ``max(0.0, ...)`` only prevents negative similarities; it does not cap above 1. 2. Line 427 stores ``_sort_key: effective_dist`` and line 435 sorts ``scored`` ascending by that key. A negative key drops *below* the rest, so the strongest hybrid matches end up sorting after weaker ones — ranking inversion under the exact conditions hybrid retrieval is supposed to serve best. Clamp ``effective_dist`` to the valid cosine-distance range ``[0, 2]``. The boost still wins (closet-backed hit still ranks first), it just no longer flips the order. Test added: mock drawer_col (base dist 0.08 / 0.35 for two sources) + closet_col (rank-0 closet for the 0.08 source) → assert all hits have ``0 <= similarity <= 1`` and ``0 <= effective_distance <= 2``, and that the closet-boosted source still ranks first. Relationship to other PRs: * **#988** clamps the output ``similarity`` alone. That does not fix the sort-key inversion or the invalid ``effective_distance`` in the returned dict. This PR clamps at the arithmetic source so both downstream users of the value stay in range. * Orthogonal to **#979** (``tool_check_duplicate`` negative similarity).
This commit is contained in:
committed by
Igor Lins e Silva
parent
46d9eb5df0
commit
5347c2c71c
@@ -825,7 +825,12 @@ def search_memories(
|
|||||||
matched_via = "drawer+closet"
|
matched_via = "drawer+closet"
|
||||||
closet_preview = c_preview
|
closet_preview = c_preview
|
||||||
|
|
||||||
effective_dist = dist - boost
|
# Clamp to the valid cosine-distance range [0, 2]. When a strong
|
||||||
|
# closet boost (up to 0.40) exceeds the raw distance, the subtraction
|
||||||
|
# can go negative — which (a) yields ``similarity > 1.0`` downstream
|
||||||
|
# and (b) makes the sort key land *below* ordinary positive distances,
|
||||||
|
# inverting the ranking so the best hybrid matches sort last.
|
||||||
|
effective_dist = max(0.0, min(2.0, dist - boost))
|
||||||
entry = {
|
entry = {
|
||||||
"text": doc,
|
"text": doc,
|
||||||
"wing": meta.get("wing", "unknown"),
|
"wing": meta.get("wing", "unknown"),
|
||||||
|
|||||||
@@ -120,6 +120,63 @@ class TestSearchMemories:
|
|||||||
assert none_hit["wing"] == "unknown"
|
assert none_hit["wing"] == "unknown"
|
||||||
assert none_hit["room"] == "unknown"
|
assert none_hit["room"] == "unknown"
|
||||||
|
|
||||||
|
def test_effective_distance_clamped_to_valid_cosine_range(self):
|
||||||
|
"""A strong closet boost (up to 0.40) applied to a low-distance drawer
|
||||||
|
can drive ``dist - boost`` negative. That violates the cosine-distance
|
||||||
|
invariant ``[0, 2]``: the API returns ``similarity > 1.0`` and the
|
||||||
|
internal ``_sort_key`` sinks below ordinary positive distances,
|
||||||
|
inverting the ranking so the best hybrid matches sort last.
|
||||||
|
|
||||||
|
With the clamp, ``effective_distance`` stays in ``[0, 2]``,
|
||||||
|
``similarity`` stays in ``[0, 1]``, and the sort order is stable.
|
||||||
|
"""
|
||||||
|
# Drawer a.md gets a tiny base distance (0.08) — nearly exact match.
|
||||||
|
# Drawer b.md gets a larger base distance (0.35).
|
||||||
|
drawers_col = MagicMock()
|
||||||
|
drawers_col.query.return_value = {
|
||||||
|
"documents": [["doc-a", "doc-b"]],
|
||||||
|
"metadatas": [[
|
||||||
|
{"source_file": "a.md", "wing": "w", "room": "r", "chunk_index": 0},
|
||||||
|
{"source_file": "b.md", "wing": "w", "room": "r", "chunk_index": 0},
|
||||||
|
]],
|
||||||
|
"distances": [[0.08, 0.35]],
|
||||||
|
"ids": [["d-a", "d-b"]],
|
||||||
|
}
|
||||||
|
# A strong closet at rank 0 points at a.md → boost = 0.40,
|
||||||
|
# which exceeds a.md's base distance and would go negative without
|
||||||
|
# the clamp. No closet for b.md.
|
||||||
|
closets_col = MagicMock()
|
||||||
|
closets_col.query.return_value = {
|
||||||
|
"documents": [["closet-preview-a"]],
|
||||||
|
"metadatas": [[{"source_file": "a.md"}]],
|
||||||
|
"distances": [[0.2]], # within CLOSET_DISTANCE_CAP (1.5)
|
||||||
|
"ids": [["c-a"]],
|
||||||
|
}
|
||||||
|
|
||||||
|
with (
|
||||||
|
patch("mempalace.searcher.get_collection", return_value=drawers_col),
|
||||||
|
patch("mempalace.searcher.get_closets_collection", return_value=closets_col),
|
||||||
|
):
|
||||||
|
result = search_memories("query", "/fake/path", n_results=5)
|
||||||
|
|
||||||
|
hits = result["results"]
|
||||||
|
assert hits, "should return results"
|
||||||
|
|
||||||
|
# Invariants on every hit.
|
||||||
|
for h in hits:
|
||||||
|
assert 0.0 <= h["similarity"] <= 1.0, (
|
||||||
|
f"similarity out of range: {h['similarity']} for {h['source_file']}"
|
||||||
|
)
|
||||||
|
assert 0.0 <= h["effective_distance"] <= 2.0, (
|
||||||
|
f"effective_distance out of range: {h['effective_distance']} "
|
||||||
|
f"for {h['source_file']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# With the clamp, the closet-boosted a.md still ranks ahead of b.md —
|
||||||
|
# the boost still wins, but it no longer flips the ranking.
|
||||||
|
assert hits[0]["source_file"] == "a.md"
|
||||||
|
assert hits[0]["matched_via"] == "drawer+closet"
|
||||||
|
|
||||||
|
|
||||||
# ── BM25 internals: None / empty document safety ─────────────────────
|
# ── BM25 internals: None / empty document safety ─────────────────────
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user