fix(backends): address Copilot review on #995

Four defects surfaced by the automated review, fixed with targeted tests:

1. BaseCollection.update() default now validates that documents / metadatas /
   embeddings lengths match ids, raising ValueError instead of silently
   misaligning pairs or raising IndexError (base.py).

2. ChromaCollection.query() now rejects the two ambiguous input shapes up
   front — neither or both of query_texts / query_embeddings, and empty input
   lists — with clear ValueError messages rather than delegating to chromadb's
   less-obvious errors (chroma.py).

3. QueryResult.empty() accepts embeddings_requested=True to preserve the
   outer-query dimension with empty hit lists when the caller asked for
   embeddings, matching the spec rule that included fields carry the outer
   shape even when empty (base.py). ChromaCollection.query() threads this
   through on the empty-result path (chroma.py).

4. ChromaBackend cache-freshness check now matches the semantics from
   mcp_server._get_client (merged via #757) on three edge cases Copilot
   called out: (a) invalidate when chroma.sqlite3 disappears while a cached
   client is held, (b) treat a 0→nonzero stat transition as a change so a
   cache built when the DB did not yet exist is refreshed, (c) re-stat
   after PersistentClient constructs the DB lazily so freshness reflects
   the post-creation state (chroma.py).

Tests: 978 passed (up from 970), 8 new tests covering the fixes.
This commit is contained in:
Igor Lins e Silva
2026-04-18 13:19:18 -03:00
parent a17a8b734a
commit 42b940d263
3 changed files with 194 additions and 9 deletions
+134
View File
@@ -164,6 +164,140 @@ def test_chroma_detect_matches_palace_with_chroma_sqlite(tmp_path):
assert ChromaBackend.detect(str(tmp_path.parent)) is False
def test_query_rejects_missing_input():
fake = _FakeCollection()
collection = ChromaCollection(fake)
with pytest.raises(ValueError):
collection.query()
def test_query_rejects_both_texts_and_embeddings():
fake = _FakeCollection()
collection = ChromaCollection(fake)
with pytest.raises(ValueError):
collection.query(query_texts=["q"], query_embeddings=[[0.1, 0.2]])
def test_query_rejects_empty_input_list():
fake = _FakeCollection()
collection = ChromaCollection(fake)
with pytest.raises(ValueError):
collection.query(query_texts=[])
def test_query_empty_preserves_embeddings_outer_shape_when_requested():
fake = _FakeCollection(
query_response={"ids": [], "documents": [], "metadatas": [], "distances": []}
)
collection = ChromaCollection(fake)
requested = collection.query(query_texts=["q1", "q2"], include=["documents", "embeddings"])
assert requested.embeddings == [[], []]
not_requested = collection.query(query_texts=["q1", "q2"], include=["documents"])
assert not_requested.embeddings is None
def test_base_collection_update_default_validates_list_lengths(tmp_path):
backend = ChromaBackend()
palace_path = tmp_path / "palace"
collection = backend.get_collection(
palace=PalaceRef(id=str(palace_path), local_path=str(palace_path)),
collection_name="mempalace_drawers",
create=True,
)
# Mismatched documents length → clear ValueError, not silent merge.
with pytest.raises(ValueError, match="documents length"):
collection._collection.add(
documents=["a", "b"],
ids=["1", "2"],
metadatas=[{"k": 1}, {"k": 2}],
)
from mempalace.backends.base import BaseCollection
BaseCollection.update(
collection,
ids=["1", "2"],
documents=["only-one"],
)
def test_chroma_cache_invalidates_when_db_file_missing(tmp_path):
"""A palace rebuild that removes chroma.sqlite3 must drop the stale cache."""
backend = ChromaBackend()
palace_path = tmp_path / "palace"
backend.get_collection(
palace=PalaceRef(id=str(palace_path), local_path=str(palace_path)),
collection_name="mempalace_drawers",
create=True,
)
assert str(palace_path) in backend._clients
prior_client = backend._clients[str(palace_path)]
prior_freshness = backend._freshness[str(palace_path)]
assert prior_freshness != (0, 0.0) # DB file exists after get_or_create_collection
# Remove chroma.sqlite3 to simulate a rebuild mid-flight. The stale cache
# must not be silently reused — the in-memory HNSW index would be wrong.
(palace_path / "chroma.sqlite3").unlink()
new_client = backend._client(str(palace_path))
# New client object (cache was replaced, not reused) and freshness was reset
# to (0, 0.0) to reflect "no DB on disk yet" state.
assert new_client is not prior_client
assert backend._freshness[str(palace_path)] == (0, 0.0)
def test_chroma_cache_picks_up_db_created_after_first_open(tmp_path):
"""The 0 → nonzero stat transition invalidates a cache built before the DB existed."""
backend = ChromaBackend()
palace_path = tmp_path / "palace"
palace_path.mkdir()
# Seed an entry in the caches as if a prior _client() call had opened the
# palace when chroma.sqlite3 did not exist yet. Freshness (0, 0.0) is the
# signal that the DB was absent at cache time.
sentinel = object()
backend._clients[str(palace_path)] = sentinel
backend._freshness[str(palace_path)] = (0, 0.0)
# The DB file now appears (real chromadb would have created it by now).
# Use a real chromadb call so _fix_blob_seq_ids and PersistentClient succeed.
import chromadb as _chromadb
_chromadb.PersistentClient(path=str(palace_path)).get_or_create_collection("seed")
assert (palace_path / "chroma.sqlite3").is_file()
# Next _client() call must detect the 0 → nonzero transition and rebuild.
refreshed = backend._client(str(palace_path))
assert refreshed is not sentinel
assert backend._freshness[str(palace_path)] != (0, 0.0)
def test_base_collection_update_default_rejects_mismatched_lengths(tmp_path):
"""The ABC default update() raises ValueError rather than silently misaligning."""
from mempalace.backends.base import BaseCollection
backend = ChromaBackend()
palace_path = tmp_path / "palace"
collection = backend.get_collection(
palace=PalaceRef(id=str(palace_path), local_path=str(palace_path)),
collection_name="mempalace_drawers",
create=True,
)
collection.add(
documents=["a", "b"],
ids=["1", "2"],
metadatas=[{"k": 1}, {"k": 2}],
)
with pytest.raises(ValueError, match="documents length"):
BaseCollection.update(collection, ids=["1", "2"], documents=["only-one"])
with pytest.raises(ValueError, match="metadatas length"):
BaseCollection.update(collection, ids=["1", "2"], metadatas=[{"k": 9}])
def test_chroma_backend_accepts_palace_ref_kwarg(tmp_path):
palace_path = tmp_path / "palace"
backend = ChromaBackend()