Fixes #195. When ChromaDB returns no documents (empty palace, or wing/room filter that excludes everything), it returns the shape: {"documents": [], "metadatas": [], "distances": []} Indexing `results["documents"][0]` blindly raises IndexError instead of the expected 'no results' response. Affected: searcher.search(), searcher.search_memories() (drawer + closet branches plus the total_before_filter aggregate), and Layer3.search() / Layer3.search_raw(). Adds a tiny private helper `searcher._first_or_empty(results, key)` that safely extracts the inner list, returning [] for any of: missing key, empty outer list, [None], or [[]]. layers.py imports the same helper to avoid duplicating the guard. Tests: tests/test_empty_chromadb_results.py covers all observed shapes plus a documentation-style test that pins the original IndexError so future readers understand why the helper exists.
This commit is contained in:
+7
-7
@@ -23,7 +23,7 @@ from collections import defaultdict
|
|||||||
|
|
||||||
from .config import MempalaceConfig
|
from .config import MempalaceConfig
|
||||||
from .palace import get_collection as _get_collection
|
from .palace import get_collection as _get_collection
|
||||||
from .searcher import build_where_filter
|
from .searcher import _first_or_empty, build_where_filter
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -272,9 +272,9 @@ class Layer3:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"Search error: {e}"
|
return f"Search error: {e}"
|
||||||
|
|
||||||
docs = results["documents"][0]
|
docs = _first_or_empty(results, "documents")
|
||||||
metas = results["metadatas"][0]
|
metas = _first_or_empty(results, "metadatas")
|
||||||
dists = results["distances"][0]
|
dists = _first_or_empty(results, "distances")
|
||||||
|
|
||||||
if not docs:
|
if not docs:
|
||||||
return "No results found."
|
return "No results found."
|
||||||
@@ -323,9 +323,9 @@ class Layer3:
|
|||||||
|
|
||||||
hits = []
|
hits = []
|
||||||
for doc, meta, dist in zip(
|
for doc, meta, dist in zip(
|
||||||
results["documents"][0],
|
_first_or_empty(results, "documents"),
|
||||||
results["metadatas"][0],
|
_first_or_empty(results, "metadatas"),
|
||||||
results["distances"][0],
|
_first_or_empty(results, "distances"),
|
||||||
):
|
):
|
||||||
hits.append(
|
hits.append(
|
||||||
{
|
{
|
||||||
|
|||||||
+24
-10
@@ -30,6 +30,20 @@ class SearchError(Exception):
|
|||||||
_TOKEN_RE = re.compile(r"\w{2,}", re.UNICODE)
|
_TOKEN_RE = re.compile(r"\w{2,}", re.UNICODE)
|
||||||
|
|
||||||
|
|
||||||
|
def _first_or_empty(results: dict, key: str) -> list:
|
||||||
|
"""Return the first inner list of a ChromaDB query result, or [].
|
||||||
|
|
||||||
|
ChromaDB returns shapes like ``{"documents": [["a", "b"]], ...}`` for a
|
||||||
|
successful query, but ``{"documents": [], ...}`` (empty outer list) when
|
||||||
|
the collection is empty or the filter excludes everything. Indexing
|
||||||
|
``[0]`` blindly raises IndexError in that case (issue #195).
|
||||||
|
"""
|
||||||
|
outer = results.get(key)
|
||||||
|
if not outer:
|
||||||
|
return []
|
||||||
|
return outer[0] or []
|
||||||
|
|
||||||
|
|
||||||
def _tokenize(text: str) -> list:
|
def _tokenize(text: str) -> list:
|
||||||
"""Lowercase + strip to alphanumeric tokens of length ≥ 2."""
|
"""Lowercase + strip to alphanumeric tokens of length ≥ 2."""
|
||||||
return _TOKEN_RE.findall(text.lower())
|
return _TOKEN_RE.findall(text.lower())
|
||||||
@@ -251,9 +265,9 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
|
|||||||
print(f"\n Search error: {e}")
|
print(f"\n Search error: {e}")
|
||||||
raise SearchError(f"Search error: {e}") from e
|
raise SearchError(f"Search error: {e}") from e
|
||||||
|
|
||||||
docs = results["documents"][0]
|
docs = _first_or_empty(results, "documents")
|
||||||
metas = results["metadatas"][0]
|
metas = _first_or_empty(results, "metadatas")
|
||||||
dists = results["distances"][0]
|
dists = _first_or_empty(results, "distances")
|
||||||
|
|
||||||
if not docs:
|
if not docs:
|
||||||
print(f'\n No results found for: "{query}"')
|
print(f'\n No results found for: "{query}"')
|
||||||
@@ -353,9 +367,9 @@ def search_memories(
|
|||||||
closet_results = closets_col.query(**ckwargs)
|
closet_results = closets_col.query(**ckwargs)
|
||||||
for rank, (cdoc, cmeta, cdist) in enumerate(
|
for rank, (cdoc, cmeta, cdist) in enumerate(
|
||||||
zip(
|
zip(
|
||||||
closet_results["documents"][0],
|
_first_or_empty(closet_results, "documents"),
|
||||||
closet_results["metadatas"][0],
|
_first_or_empty(closet_results, "metadatas"),
|
||||||
closet_results["distances"][0],
|
_first_or_empty(closet_results, "distances"),
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
source = cmeta.get("source_file", "")
|
source = cmeta.get("source_file", "")
|
||||||
@@ -372,9 +386,9 @@ def search_memories(
|
|||||||
|
|
||||||
scored: list = []
|
scored: list = []
|
||||||
for doc, meta, dist in zip(
|
for doc, meta, dist in zip(
|
||||||
drawer_results["documents"][0],
|
_first_or_empty(drawer_results, "documents"),
|
||||||
drawer_results["metadatas"][0],
|
_first_or_empty(drawer_results, "metadatas"),
|
||||||
drawer_results["distances"][0],
|
_first_or_empty(drawer_results, "distances"),
|
||||||
):
|
):
|
||||||
# Filter on raw distance before rounding to avoid precision loss.
|
# Filter on raw distance before rounding to avoid precision loss.
|
||||||
if max_distance > 0.0 and dist > max_distance:
|
if max_distance > 0.0 and dist > max_distance:
|
||||||
@@ -482,6 +496,6 @@ def search_memories(
|
|||||||
return {
|
return {
|
||||||
"query": query,
|
"query": query,
|
||||||
"filters": {"wing": wing, "room": room},
|
"filters": {"wing": wing, "room": room},
|
||||||
"total_before_filter": len(drawer_results["documents"][0]),
|
"total_before_filter": len(_first_or_empty(drawer_results, "documents")),
|
||||||
"results": hits,
|
"results": hits,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,48 @@
|
|||||||
|
"""Regression tests for issue #195 — IndexError on empty ChromaDB results.
|
||||||
|
|
||||||
|
Before the fix, `searcher.search()`, `searcher.search_memories()`, and
|
||||||
|
`Layer3.search()` indexed `results["documents"][0]` without checking the
|
||||||
|
outer list, so a query against an empty collection (or a wing/room
|
||||||
|
filter that excluded everything) crashed with IndexError instead of
|
||||||
|
returning a graceful "no results" response.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from mempalace.searcher import _first_or_empty
|
||||||
|
|
||||||
|
|
||||||
|
def test_first_or_empty_handles_empty_outer_list():
|
||||||
|
"""The shape ChromaDB returns from an empty collection (issue #195)."""
|
||||||
|
results = {"documents": [], "metadatas": [], "distances": []}
|
||||||
|
assert _first_or_empty(results, "documents") == []
|
||||||
|
assert _first_or_empty(results, "metadatas") == []
|
||||||
|
assert _first_or_empty(results, "distances") == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_first_or_empty_handles_outer_with_empty_inner():
|
||||||
|
"""ChromaDB also returns ``{"documents": [[]]}`` in some versions —
|
||||||
|
must yield [] either way."""
|
||||||
|
assert _first_or_empty({"documents": [[]]}, "documents") == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_first_or_empty_handles_missing_key():
|
||||||
|
assert _first_or_empty({}, "documents") == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_first_or_empty_handles_none_inner():
|
||||||
|
"""``[None]`` (unusual but observed) must not blow up."""
|
||||||
|
assert _first_or_empty({"documents": [None]}, "documents") == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_first_or_empty_returns_inner_list_for_normal_result():
|
||||||
|
results = {"documents": [["a", "b", "c"]]}
|
||||||
|
assert _first_or_empty(results, "documents") == ["a", "b", "c"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_raw_indexing_still_raises_to_document_the_bug():
|
||||||
|
"""Document the original failure mode so future readers understand
|
||||||
|
why _first_or_empty exists."""
|
||||||
|
results = {"documents": []}
|
||||||
|
with pytest.raises(IndexError):
|
||||||
|
_ = results["documents"][0]
|
||||||
Reference in New Issue
Block a user