fix(repair): address Copilot review on #1227

Five Copilot review issues + the Python 3.9 CI failure rolled into one
follow-up:

* Replace ``dict | None`` annotated assignment with a type-comment so
  module load doesn't evaluate PEP 604 syntax on Python 3.9 (CI red).
* Drop ``mempalace repair rebuild`` — the CLI only ships ``mempalace
  repair`` (rebuild) and ``mempalace repair-status``. Updated all
  user-facing messages, docstrings, and test assertions.
* Replace ``_get_client()`` in ``tool_search`` with the safe
  ``_refresh_vector_disabled_flag`` probe so the fallback isn't
  defeated by the very chromadb client load it's trying to avoid.
* Short-circuit ``tool_status`` to a pure-sqlite reader
  (``_tool_status_via_sqlite``) when divergence is detected so wing /
  room counts come back without ever opening the persistent client.
* Wrap the recency-window query in ``_bm25_only_via_sqlite`` with an
  ``id``-ordered fallback so legacy schemas missing ``created_at``
  don't break BM25 search.

New test covers the sqlite-status short-circuit. 1409 tests pass.
This commit is contained in:
Igor Lins e Silva
2026-04-26 21:53:56 -03:00
parent 0d349c3d86
commit 57ac669dbc
4 changed files with 166 additions and 37 deletions
+92 -19
View File
@@ -122,7 +122,10 @@ _palace_db_mtime = 0.0 # mtime of chroma.sqlite3 at cache time
# successful repair via :func:`tool_reconnect` (which re-runs the probe).
_vector_disabled = False
_vector_disabled_reason = ""
_vector_capacity_status: dict | None = None
# Optional[dict] (not ``dict | None``) keeps Python 3.9 import-time
# parsing happy — PEP 604 unions in annotations only became unconditional
# at module-eval time in 3.10.
_vector_capacity_status = None # type: Optional[dict]
def _refresh_vector_disabled_flag() -> None:
@@ -144,7 +147,7 @@ def _refresh_vector_disabled_flag() -> None:
if not _vector_disabled:
logger.warning(
"HNSW capacity divergence detected (%s) — routing search to "
"BM25-only sqlite fallback. Run `mempalace repair rebuild` to restore "
"BM25-only sqlite fallback. Run `mempalace repair` to restore "
"vector search.",
info.get("message", "unknown"),
)
@@ -359,11 +362,91 @@ def _sanitize_optional_name(value: str = None, field_name: str = "name") -> str:
# ==================== READ TOOLS ====================
def _tool_status_via_sqlite() -> dict:
"""Pure-sqlite status reader for the #1222 fallback path.
When the HNSW capacity probe detects divergence, opening the chromadb
persistent client can segfault. This reader pulls the same wing/room
breakdown directly from ``embedding_metadata`` so the operator still
gets a working status response — and crucially the
``vector_disabled`` flag — without us touching the vector segment.
"""
import sqlite3 as _sqlite3
db_path = os.path.join(_config.palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path):
return _no_palace()
wings: dict = {}
rooms: dict = {}
total = 0
try:
conn = _sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
try:
row = conn.execute(
"""
SELECT COUNT(*)
FROM embeddings e
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = 'mempalace_drawers'
"""
).fetchone()
total = int(row[0]) if row and row[0] is not None else 0
for key, target in (("wing", wings), ("room", rooms)):
for value, count in conn.execute(
"""
SELECT em.string_value, COUNT(*)
FROM embedding_metadata em
JOIN embeddings e ON em.id = e.id
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = 'mempalace_drawers'
AND em.key = ?
AND em.string_value IS NOT NULL
GROUP BY em.string_value
""",
(key,),
):
target[value] = count
finally:
conn.close()
except _sqlite3.Error:
logger.exception("tool_status sqlite fallback read failed")
result = {
"total_drawers": total,
"wings": wings,
"rooms": rooms,
"palace_path": _config.palace_path,
"protocol": PALACE_PROTOCOL,
"aaak_dialect": AAAK_SPEC,
"vector_disabled": True,
"vector_disabled_reason": _vector_disabled_reason,
}
if _vector_capacity_status:
result["hnsw_capacity"] = {
"sqlite_count": _vector_capacity_status.get("sqlite_count"),
"hnsw_count": _vector_capacity_status.get("hnsw_count"),
"divergence": _vector_capacity_status.get("divergence"),
}
return result
def tool_status():
# Run the safe sqlite/pickle probe before we touch chromadb. In the
# #1222 failure mode, opening the persistent client to call .count()
# can segfault — short-circuit to a pure-sqlite path when divergence
# is detected so status stays reachable.
db_exists = os.path.isfile(os.path.join(_config.palace_path, "chroma.sqlite3"))
_refresh_vector_disabled_flag()
if _vector_disabled:
return _tool_status_via_sqlite()
# Use create=True only when a palace DB already exists on disk -- this
# bootstraps the ChromaDB collection on a valid-but-empty palace without
# accidentally creating a palace in a non-existent directory (#830).
db_exists = os.path.isfile(os.path.join(_config.palace_path, "chroma.sqlite3"))
col = _get_collection(create=db_exists)
if not col:
return _no_palace()
@@ -378,17 +461,6 @@ def tool_status():
"protocol": PALACE_PROTOCOL,
"aaak_dialect": AAAK_SPEC,
}
if _vector_disabled:
# Surface the #1222 fallback state so the AI knows search results
# are BM25-only and recommends the operator run repair.
result["vector_disabled"] = True
result["vector_disabled_reason"] = _vector_disabled_reason
if _vector_capacity_status:
result["hnsw_capacity"] = {
"sqlite_count": _vector_capacity_status.get("sqlite_count"),
"hnsw_count": _vector_capacity_status.get("hnsw_count"),
"divergence": _vector_capacity_status.get("divergence"),
}
try:
all_meta = _get_cached_metadata(col)
for m in all_meta:
@@ -523,9 +595,11 @@ def tool_search(
dist = (1.0 - min_similarity) if min_similarity is not None else max_distance
# Mitigate system prompt contamination (Issue #333)
sanitized = sanitize_query(query)
# Ensure the capacity probe has been run at least once before we
# decide which path to take — _get_client triggers it on first open.
_get_client()
# Ensure the vector-disabled probe has been run via the safe
# sqlite/pickle path before we touch chromadb. Calling _get_client()
# here would defeat the fallback — it constructs a PersistentClient
# which can segfault on segment load in the #1222 failure mode.
_refresh_vector_disabled_flag()
result = search_memories(
sanitized["clean_query"],
palace_path=_config.palace_path,
@@ -567,8 +641,7 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
"vector_disabled": True,
"vector_disabled_reason": _vector_disabled_reason,
"hint": (
"duplicate detection requires vector search; run "
"`mempalace repair rebuild` to restore"
"duplicate detection requires vector search; run " "`mempalace repair` to restore"
),
}
try:
+2 -2
View File
@@ -440,7 +440,7 @@ def status(palace_path=None) -> dict:
at a stale ``max_elements`` while sqlite keeps accumulating rows.
Once the divergence is large enough, every tool call segfaults when
chromadb tries to load the undersized HNSW. Running ``mempalace
repair status`` *before* opening the segment lets the operator
repair-status`` *before* opening the segment lets the operator
discover the problem without crashing the MCP server.
The check itself never opens a chromadb client and never imports
@@ -481,7 +481,7 @@ def status(palace_path=None) -> dict:
print(f" note: {info['message']}")
if drawers["diverged"] or closets["diverged"]:
print("\n Recommended: run `mempalace repair rebuild` to rebuild the index.")
print("\n Recommended: run `mempalace repair` to rebuild the index.")
print()
return {"drawers": drawers, "closets": closets}
+29 -1
View File
@@ -427,7 +427,13 @@ def _bm25_only_via_sqlite(
if not candidate_ids:
# No FTS hits (or no usable tokens) — pull the most recent
# rows for the drawers segment so we can BM25-rank something
# rather than return empty-handed.
# rather than return empty-handed. Wrapped in try/except
# because the schema may differ on legacy palaces (older
# chromadb without ``created_at``, missing ``segments``
# rows after partial restore, etc.); on schema mismatch we
# fall back to ordering by primary-key id and finally to an
# empty result rather than letting search raise.
try:
rows = conn.execute(
"""
SELECT e.id
@@ -441,6 +447,28 @@ def _bm25_only_via_sqlite(
(max_candidates,),
).fetchall()
candidate_ids = [r[0] for r in rows]
except sqlite3.Error:
logger.debug(
"recency-window query failed; trying id-ordered fallback",
exc_info=True,
)
try:
rows = conn.execute(
"""
SELECT e.id
FROM embeddings e
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = 'mempalace_drawers'
ORDER BY e.id DESC
LIMIT ?
""",
(max_candidates,),
).fetchall()
candidate_ids = [r[0] for r in rows]
except sqlite3.Error:
logger.debug("id-ordered fallback also failed", exc_info=True)
candidate_ids = []
if not candidate_ids:
return {
+30 -2
View File
@@ -347,7 +347,7 @@ def test_repair_status_reports_diverged(tmp_path, capsys):
out = repair_status(palace_path=str(tmp_path))
captured = capsys.readouterr().out
assert "DIVERGED" in captured
assert "mempalace repair rebuild" in captured
assert "mempalace repair`" in captured
assert out["drawers"]["diverged"] is True
@@ -360,4 +360,32 @@ def test_repair_status_quiet_on_healthy_palace(tmp_path, capsys):
repair_status(palace_path=str(tmp_path))
captured = capsys.readouterr().out
assert "DIVERGED" not in captured
assert "mempalace repair rebuild" not in captured
assert "Recommended" not in captured
# ── tool_status sqlite fallback (#1222 short-circuit) ─────────────────
def test_tool_status_via_sqlite_returns_breakdown(palace_with_drawers, monkeypatch):
"""When _vector_disabled is set, tool_status reads counts from sqlite
instead of opening a chromadb client."""
from mempalace import mcp_server
# _config.palace_path is a read-only property; swap the whole object
# for a tiny stand-in so we don't have to monkey with the real
# MempalaceConfig.
class _Cfg:
palace_path = str(palace_with_drawers)
monkeypatch.setattr(mcp_server, "_config", _Cfg())
monkeypatch.setattr(mcp_server, "_vector_disabled", True)
monkeypatch.setattr(mcp_server, "_vector_disabled_reason", "test divergence")
out = mcp_server._tool_status_via_sqlite()
assert out["vector_disabled"] is True
assert out["vector_disabled_reason"] == "test divergence"
assert out["total_drawers"] == 3
# Wing breakdown comes from the seeded palace_with_drawers fixture:
# ops×2 (incident + repair runbook), design×1 (metaphor).
assert out["wings"].get("ops") == 2
assert out["wings"].get("design") == 1