fix(mcp): retry _get_collection once on transient failure (#1286)
A transient chromadb exception inside `_get_collection` was swallowed by the bare `except Exception: return None`, leaving every subsequent tool call hitting the same poisoned cache silently. The fix wraps the body in a `for attempt in range(2)` loop: on attempt 0 failure, log via `logger.exception(...)` and clear `_client_cache` / `_collection_cache` / `_metadata_cache` so the next iteration forces `_get_client()` to rebuild from scratch — that path now re-runs `quarantine_stale_hnsw` (per #1322), so the second attempt heals the common stale-handle case automatically. If both attempts fail, return `None` (matches the prior contract for permanent failures). Two new tests in `tests/test_mcp_server.py::TestCacheInvalidation`: - `test_get_collection_retries_once_on_exception` — first attempt raises via a monkeypatched `_get_client`, second attempt succeeds; assert the caller gets the collection back, not None. - `test_get_collection_returns_none_after_two_failures` — both attempts fail, assert we exhaust the loop and return None (no infinite retry). Surgical extraction from PR #1286, which carried the same fix idea (plus a fork-sync bundle that couldn't be merged); credit to the original author below. Co-authored-by: Jeffrey Hein <jp@jphein.com>
This commit is contained in:
+28
-2
@@ -326,8 +326,19 @@ def _get_client():
|
|||||||
|
|
||||||
|
|
||||||
def _get_collection(create=False):
|
def _get_collection(create=False):
|
||||||
"""Return the ChromaDB collection, caching the client between calls."""
|
"""Return the ChromaDB collection, caching the client between calls.
|
||||||
global _collection_cache, _metadata_cache, _metadata_cache_time
|
|
||||||
|
On failure, log the exception and retry once after clearing the client
|
||||||
|
and collection caches. Tools were silently returning ``None`` when a
|
||||||
|
cached client/collection went stale — typically after the chromadb
|
||||||
|
rust bindings invalidated a handle following an out-of-band write —
|
||||||
|
leaving the LLM with no diagnostic and no recovery path. The retry
|
||||||
|
forces ``_get_client()`` to rebuild from scratch (which re-runs
|
||||||
|
``quarantine_stale_hnsw`` per #1322), so the second attempt heals the
|
||||||
|
common stale-handle / stale-HNSW case automatically.
|
||||||
|
"""
|
||||||
|
global _client_cache, _collection_cache, _metadata_cache, _metadata_cache_time
|
||||||
|
for attempt in range(2):
|
||||||
try:
|
try:
|
||||||
client = _get_client()
|
client = _get_client()
|
||||||
# ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
|
# ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
|
||||||
@@ -387,6 +398,21 @@ def _get_collection(create=False):
|
|||||||
_metadata_cache_time = 0
|
_metadata_cache_time = 0
|
||||||
return _collection_cache
|
return _collection_cache
|
||||||
except Exception:
|
except Exception:
|
||||||
|
logger.exception(
|
||||||
|
"_get_collection attempt %d/2 failed (palace=%s, create=%s)",
|
||||||
|
attempt + 1,
|
||||||
|
_config.palace_path,
|
||||||
|
create,
|
||||||
|
)
|
||||||
|
if attempt == 0:
|
||||||
|
# Reset all caches so the next attempt forces _get_client()
|
||||||
|
# to rebuild the chromadb client from scratch — that path
|
||||||
|
# re-runs quarantine_stale_hnsw (#1322) and reopens the
|
||||||
|
# collection cleanly, healing the common stale-handle case.
|
||||||
|
_client_cache = None
|
||||||
|
_collection_cache = None
|
||||||
|
_metadata_cache = None
|
||||||
|
_metadata_cache_time = 0
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1259,6 +1259,71 @@ class TestCacheInvalidation:
|
|||||||
assert "embedding_function" in kwargs
|
assert "embedding_function" in kwargs
|
||||||
assert kwargs["embedding_function"] is not None
|
assert kwargs["embedding_function"] is not None
|
||||||
|
|
||||||
|
def test_get_collection_retries_once_on_exception(self, monkeypatch, config, palace_path, kg):
|
||||||
|
"""Regression: a transient failure inside _get_collection must trigger
|
||||||
|
one retry after clearing the client/collection caches, not silently
|
||||||
|
return None.
|
||||||
|
|
||||||
|
Before this fix, a stale chromadb handle (e.g. the rust bindings
|
||||||
|
invalidating after an out-of-band write) would raise inside the
|
||||||
|
single ``try`` block, get swallowed by ``except Exception: return
|
||||||
|
None``, and every subsequent tool call would hit the same poisoned
|
||||||
|
cache returning None. The retry forces ``_get_client()`` to rebuild
|
||||||
|
the client (which re-runs ``quarantine_stale_hnsw`` per #1322), so
|
||||||
|
the second attempt heals the common stale-handle case.
|
||||||
|
"""
|
||||||
|
_patch_mcp_server(monkeypatch, config, kg)
|
||||||
|
_client, _col = _get_collection(palace_path, create=True)
|
||||||
|
del _client
|
||||||
|
from mempalace import mcp_server
|
||||||
|
|
||||||
|
# Force a cold cache so the first call goes through the open path.
|
||||||
|
mcp_server._client_cache = None
|
||||||
|
mcp_server._collection_cache = None
|
||||||
|
|
||||||
|
real_get_client = mcp_server._get_client
|
||||||
|
attempts = {"count": 0}
|
||||||
|
|
||||||
|
def flaky_get_client():
|
||||||
|
attempts["count"] += 1
|
||||||
|
if attempts["count"] == 1:
|
||||||
|
raise RuntimeError("simulated transient chromadb failure")
|
||||||
|
return real_get_client()
|
||||||
|
|
||||||
|
monkeypatch.setattr(mcp_server, "_get_client", flaky_get_client)
|
||||||
|
|
||||||
|
col = mcp_server._get_collection()
|
||||||
|
|
||||||
|
# Both attempts ran and the second succeeded.
|
||||||
|
assert attempts["count"] == 2
|
||||||
|
assert col is not None
|
||||||
|
|
||||||
|
def test_get_collection_returns_none_after_two_failures(
|
||||||
|
self, monkeypatch, config, palace_path, kg
|
||||||
|
):
|
||||||
|
"""If both attempts fail, return None (matches the prior contract for
|
||||||
|
permanent failures — only the transient case is now self-healing)."""
|
||||||
|
_patch_mcp_server(monkeypatch, config, kg)
|
||||||
|
_client, _col = _get_collection(palace_path, create=True)
|
||||||
|
del _client
|
||||||
|
from mempalace import mcp_server
|
||||||
|
|
||||||
|
mcp_server._client_cache = None
|
||||||
|
mcp_server._collection_cache = None
|
||||||
|
|
||||||
|
attempts = {"count": 0}
|
||||||
|
|
||||||
|
def always_fails():
|
||||||
|
attempts["count"] += 1
|
||||||
|
raise RuntimeError("permanent chromadb failure")
|
||||||
|
|
||||||
|
monkeypatch.setattr(mcp_server, "_get_client", always_fails)
|
||||||
|
|
||||||
|
col = mcp_server._get_collection()
|
||||||
|
|
||||||
|
assert attempts["count"] == 2
|
||||||
|
assert col is None
|
||||||
|
|
||||||
|
|
||||||
class TestKGLazyCache:
|
class TestKGLazyCache:
|
||||||
"""Lazy per-path KnowledgeGraph cache (issue #1136)."""
|
"""Lazy per-path KnowledgeGraph cache (issue #1136)."""
|
||||||
|
|||||||
Reference in New Issue
Block a user