From 5347c2c71c782d04fc2023504f58dc52c2369187 Mon Sep 17 00:00:00 2001
From: eldar702 <eldarshlomi7@gmail.com>
Date: Sun, 19 Apr 2026 11:08:45 +0300
Subject: [PATCH] fix(searcher): clamp effective_distance to valid cosine range
 [0, 2]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

``search_memories`` computes ``effective_dist = dist - boost`` where
``boost`` can be as large as ``CLOSET_RANK_BOOSTS[0] == 0.40`` for a
rank-0 closet hit. When the raw drawer distance is small — any
near-exact match — the subtraction goes negative.

Two downstream effects:

1. Line 418 returns ``round(max(0.0, 1 - effective_dist), 3)`` as
   ``similarity``. With ``effective_dist = -0.30`` that yields
   ``similarity = 1.30``, outside the documented ``[0, 1]`` range.
   The ``max(0.0, ...)`` only prevents negative similarities; it does
   not cap above 1.
2. Line 427 stores ``_sort_key: effective_dist`` and line 435 sorts
   ``scored`` ascending by that key. A negative key drops *below* the
   rest, so the strongest hybrid matches end up sorting after weaker
   ones — ranking inversion under the exact conditions hybrid retrieval
   is supposed to serve best.

Clamp ``effective_dist`` to the valid cosine-distance range ``[0, 2]``.
The boost still wins (closet-backed hit still ranks first), it just no
longer flips the order.

Test added: mock drawer_col (base dist 0.08 / 0.35 for two sources) +
closet_col (rank-0 closet for the 0.08 source) → assert all hits have
``0 <= similarity <= 1`` and ``0 <= effective_distance <= 2``, and that
the closet-boosted source still ranks first.

Relationship to other PRs:

* **#988** clamps the output ``similarity`` alone. That does not fix
  the sort-key inversion or the invalid ``effective_distance`` in the
  returned dict. This PR clamps at the arithmetic source so both
  downstream users of the value stay in range.
* Orthogonal to **#979** (``tool_check_duplicate`` negative similarity).
---
 mempalace/searcher.py  |  7 +++++-
 tests/test_searcher.py | 57 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index d615623..e99fc5d 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -825,7 +825,12 @@ def search_memories(
                 matched_via = "drawer+closet"
                 closet_preview = c_preview
 
-        effective_dist = dist - boost
+        # Clamp to the valid cosine-distance range [0, 2]. When a strong
+        # closet boost (up to 0.40) exceeds the raw distance, the subtraction
+        # can go negative — which (a) yields ``similarity > 1.0`` downstream
+        # and (b) makes the sort key land *below* ordinary positive distances,
+        # inverting the ranking so the best hybrid matches sort last.
+        effective_dist = max(0.0, min(2.0, dist - boost))
         entry = {
             "text": doc,
             "wing": meta.get("wing", "unknown"),
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index 6b85832..ad60641 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -120,6 +120,63 @@ class TestSearchMemories:
         assert none_hit["wing"] == "unknown"
         assert none_hit["room"] == "unknown"
 
+    def test_effective_distance_clamped_to_valid_cosine_range(self):
+        """A strong closet boost (up to 0.40) applied to a low-distance drawer
+        can drive ``dist - boost`` negative. That violates the cosine-distance
+        invariant ``[0, 2]``: the API returns ``similarity > 1.0`` and the
+        internal ``_sort_key`` sinks below ordinary positive distances,
+        inverting the ranking so the best hybrid matches sort last.
+
+        With the clamp, ``effective_distance`` stays in ``[0, 2]``,
+        ``similarity`` stays in ``[0, 1]``, and the sort order is stable.
+        """
+        # Drawer a.md gets a tiny base distance (0.08) — nearly exact match.
+        # Drawer b.md gets a larger base distance (0.35).
+        drawers_col = MagicMock()
+        drawers_col.query.return_value = {
+            "documents": [["doc-a", "doc-b"]],
+            "metadatas": [[
+                {"source_file": "a.md", "wing": "w", "room": "r", "chunk_index": 0},
+                {"source_file": "b.md", "wing": "w", "room": "r", "chunk_index": 0},
+            ]],
+            "distances": [[0.08, 0.35]],
+            "ids": [["d-a", "d-b"]],
+        }
+        # A strong closet at rank 0 points at a.md → boost = 0.40,
+        # which exceeds a.md's base distance and would go negative without
+        # the clamp. No closet for b.md.
+        closets_col = MagicMock()
+        closets_col.query.return_value = {
+            "documents": [["closet-preview-a"]],
+            "metadatas": [[{"source_file": "a.md"}]],
+            "distances": [[0.2]],  # within CLOSET_DISTANCE_CAP (1.5)
+            "ids": [["c-a"]],
+        }
+
+        with (
+            patch("mempalace.searcher.get_collection", return_value=drawers_col),
+            patch("mempalace.searcher.get_closets_collection", return_value=closets_col),
+        ):
+            result = search_memories("query", "/fake/path", n_results=5)
+
+        hits = result["results"]
+        assert hits, "should return results"
+
+        # Invariants on every hit.
+        for h in hits:
+            assert 0.0 <= h["similarity"] <= 1.0, (
+                f"similarity out of range: {h['similarity']} for {h['source_file']}"
+            )
+            assert 0.0 <= h["effective_distance"] <= 2.0, (
+                f"effective_distance out of range: {h['effective_distance']} "
+                f"for {h['source_file']}"
+            )
+
+        # With the clamp, the closet-boosted a.md still ranks ahead of b.md —
+        # the boost still wins, but it no longer flips the ranking.
+        assert hits[0]["source_file"] == "a.md"
+        assert hits[0]["matched_via"] == "drawer+closet"
+
 
 # ── BM25 internals: None / empty document safety ─────────────────────