From ee12c07c545eb5955dae3c6bdcbebd162b518d87 Mon Sep 17 00:00:00 2001
From: jp
Date: Fri, 24 Apr 2026 21:27:21 -0700
Subject: [PATCH] fix(searcher): tolerate None documents in BM25 reranker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`_tokenize` calls `text.lower()` unconditionally; when ChromaDB returns a
drawer with `documents` containing `None`, the hybrid-rerank path raises
`AttributeError: 'NoneType' object has no attribute 'lower'`.

Observed in production daemon log (2026-04-24 21:07:05) during a search
that triggered `_hybrid_rank → _bm25_scores → _tokenize`:

      File "mempalace/searcher.py", line 81, in _bm25_scores
        tokenized = [_tokenize(d) for d in documents]
      File "mempalace/searcher.py", line 52, in _tokenize
        return _TOKEN_RE.findall(text.lower())
    AttributeError: 'NoneType' object has no attribute 'lower'

Closes the gap left by the upstream None-metadata audit (#999), which
covered metadata loops but not BM25 helpers. Returns `[]` for falsy input
so a None doc gets score 0.0 while the rest of the corpus reranks
normally.

Three regression tests in TestBM25NoneSafety lock the behavior and
reference the production trace.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 mempalace/searcher.py  |  9 ++++++++-
 tests/test_searcher.py | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index c4a1d87..c2fcdb4 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -46,7 +46,14 @@ def _first_or_empty(results, key: str) -> list:
 
 
 def _tokenize(text: str) -> list:
-    """Lowercase + strip to alphanumeric tokens of length ≥ 2."""
+    """Lowercase + strip to alphanumeric tokens of length ≥ 2.
+
+    Tolerates ``None`` documents — Chroma can return ``None`` in the
+    ``documents`` field for drawers without text content, which would
+    otherwise raise ``AttributeError`` mid-rerank.
+    """
+    if not text:
+        return []
     return _TOKEN_RE.findall(text.lower())
 
 
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index 65191c3..51eb2f8 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -121,6 +121,39 @@ class TestSearchMemories:
         assert none_hit["room"] == "unknown"
 
 
+# ── BM25 internals: None / empty document safety ─────────────────────
+
+
+class TestBM25NoneSafety:
+    """Regression tests for the AttributeError observed in production when
+    Chroma returned ``None`` documents inside a hybrid-rerank pass.
+
+    Trace from the daemon log (2026-04-24 21:07:05):
+      File "mempalace/searcher.py", line 81, in _bm25_scores
+        tokenized = [_tokenize(d) for d in documents]
+      File "mempalace/searcher.py", line 52, in _tokenize
+        return _TOKEN_RE.findall(text.lower())
+    AttributeError: 'NoneType' object has no attribute 'lower'
+    """
+
+    def test_tokenize_handles_none(self):
+        from mempalace.searcher import _tokenize
+        assert _tokenize(None) == []
+
+    def test_tokenize_handles_empty_string(self):
+        from mempalace.searcher import _tokenize
+        assert _tokenize("") == []
+
+    def test_bm25_scores_does_not_crash_on_none_documents(self):
+        """A ``None`` mixed into the corpus must yield score 0.0 for that doc
+        and finite scores for the rest, not raise AttributeError."""
+        from mempalace.searcher import _bm25_scores
+        scores = _bm25_scores("postgres migration", ["postgres migration done", None, "kafka rebalance"])
+        assert len(scores) == 3
+        assert scores[1] == 0.0
+        assert scores[0] > 0.0
+
+
 # ── search() (CLI print function) ─────────────────────────────────────
 
 