From ee12c07c545eb5955dae3c6bdcbebd162b518d87 Mon Sep 17 00:00:00 2001
From: jp <jp@jphein.com>
Date: Fri, 24 Apr 2026 21:27:21 -0700
Subject: [PATCH] fix(searcher): tolerate None documents in BM25 reranker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`_tokenize` calls `text.lower()` unconditionally; when ChromaDB returns a
drawer with `documents` containing `None`, the hybrid-rerank path raises
`AttributeError: 'NoneType' object has no attribute 'lower'`.

Observed in production daemon log (2026-04-24 21:07:05) during a search
that triggered `_hybrid_rank → _bm25_scores → _tokenize`:

    File "mempalace/searcher.py", line 81, in _bm25_scores
        tokenized = [_tokenize(d) for d in documents]
    File "mempalace/searcher.py", line 52, in _tokenize
        return _TOKEN_RE.findall(text.lower())
    AttributeError: 'NoneType' object has no attribute 'lower'

Closes the gap left by the upstream None-metadata audit (#999), which
covered metadata loops but not BM25 helpers. `_tokenize` now returns `[]`
for falsy input (`None` or `""`), so a `None` document scores 0.0 while
the rest of the corpus reranks normally.

Three regression tests in TestBM25NoneSafety lock the behavior and reference
the production trace.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/searcher.py  |  9 ++++++++-
 tests/test_searcher.py | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index c4a1d87..c2fcdb4 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -46,7 +46,14 @@ def _first_or_empty(results, key: str) -> list:
 
 
 def _tokenize(text: str) -> list:
-    """Lowercase + strip to alphanumeric tokens of length ≥ 2."""
+    """Lowercase + strip to alphanumeric tokens of length ≥ 2.
+
+    Tolerates ``None`` documents — Chroma can return ``None`` in the
+    ``documents`` field for drawers without text content, which would
+    otherwise raise ``AttributeError`` mid-rerank.
+    """
+    if not text:
+        return []
     return _TOKEN_RE.findall(text.lower())
 
 
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index 65191c3..51eb2f8 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -121,6 +121,39 @@ class TestSearchMemories:
         assert none_hit["room"] == "unknown"
 
 
+# ── BM25 internals: None / empty document safety ─────────────────────
+
+
+class TestBM25NoneSafety:
+    """Regression tests for the AttributeError observed in production when
+    Chroma returned ``None`` documents inside a hybrid-rerank pass.
+
+    Trace from the daemon log (2026-04-24 21:07:05):
+        File "mempalace/searcher.py", line 81, in _bm25_scores
+            tokenized = [_tokenize(d) for d in documents]
+        File "mempalace/searcher.py", line 52, in _tokenize
+            return _TOKEN_RE.findall(text.lower())
+        AttributeError: 'NoneType' object has no attribute 'lower'
+    """
+
+    def test_tokenize_handles_none(self):
+        from mempalace.searcher import _tokenize
+        assert _tokenize(None) == []
+
+    def test_tokenize_handles_empty_string(self):
+        from mempalace.searcher import _tokenize
+        assert _tokenize("") == []
+
+    def test_bm25_scores_does_not_crash_on_none_documents(self):
+        """A ``None`` mixed into the corpus must yield score 0.0 for that doc
+        and finite scores for the rest, not raise AttributeError."""
+        from mempalace.searcher import _bm25_scores
+        scores = _bm25_scores("postgres migration", ["postgres migration done", None, "kafka rebalance"])
+        assert len(scores) == 3
+        assert scores[1] == 0.0
+        assert scores[0] > 0.0
+
+
 # ── search() (CLI print function) ─────────────────────────────────────