fix(repair): address Copilot review on #1227

Five Copilot review issues + the Python 3.9 CI failure rolled into one
follow-up:

* Replace ``dict | None`` annotated assignment with a type-comment so
  module load doesn't evaluate PEP 604 syntax on Python 3.9 (CI red).
* Drop ``mempalace repair rebuild`` — the CLI only ships ``mempalace
  repair`` (rebuild) and ``mempalace repair-status``. Updated all
  user-facing messages, docstrings, and test assertions.
* Replace ``_get_client()`` in ``tool_search`` with the safe
  ``_refresh_vector_disabled_flag`` probe so the fallback isn't
  defeated by the very chromadb client load it's trying to avoid.
* Short-circuit ``tool_status`` to a pure-sqlite reader
  (``_tool_status_via_sqlite``) when divergence is detected so wing /
  room counts come back without ever opening the persistent client.
* Wrap the recency-window query in ``_bm25_only_via_sqlite`` with an
  ``id``-ordered fallback so legacy schemas missing ``created_at``
  don't break BM25 search.

New test covers the sqlite-status short-circuit. 1409 tests pass.
This commit is contained in:
Igor Lins e Silva
2026-04-26 21:53:56 -03:00
parent 0d349c3d86
commit 57ac669dbc
4 changed files with 166 additions and 37 deletions
+92 -19
View File
@@ -122,7 +122,10 @@ _palace_db_mtime = 0.0 # mtime of chroma.sqlite3 at cache time
# successful repair via :func:`tool_reconnect` (which re-runs the probe). # successful repair via :func:`tool_reconnect` (which re-runs the probe).
_vector_disabled = False _vector_disabled = False
_vector_disabled_reason = "" _vector_disabled_reason = ""
_vector_capacity_status: dict | None = None # Optional[dict] (not ``dict | None``) keeps Python 3.9 import-time
# parsing happy — PEP 604 unions in annotations only became unconditional
# at module-eval time in 3.10.
_vector_capacity_status = None # type: Optional[dict]
def _refresh_vector_disabled_flag() -> None: def _refresh_vector_disabled_flag() -> None:
@@ -144,7 +147,7 @@ def _refresh_vector_disabled_flag() -> None:
if not _vector_disabled: if not _vector_disabled:
logger.warning( logger.warning(
"HNSW capacity divergence detected (%s) — routing search to " "HNSW capacity divergence detected (%s) — routing search to "
"BM25-only sqlite fallback. Run `mempalace repair rebuild` to restore " "BM25-only sqlite fallback. Run `mempalace repair` to restore "
"vector search.", "vector search.",
info.get("message", "unknown"), info.get("message", "unknown"),
) )
@@ -359,11 +362,91 @@ def _sanitize_optional_name(value: str = None, field_name: str = "name") -> str:
# ==================== READ TOOLS ==================== # ==================== READ TOOLS ====================
def _tool_status_via_sqlite() -> dict:
"""Pure-sqlite status reader for the #1222 fallback path.
When the HNSW capacity probe detects divergence, opening the chromadb
persistent client can segfault. This reader pulls the same wing/room
breakdown directly from ``embedding_metadata`` so the operator still
gets a working status response — and crucially the
``vector_disabled`` flag — without us touching the vector segment.
"""
import sqlite3 as _sqlite3
db_path = os.path.join(_config.palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path):
return _no_palace()
wings: dict = {}
rooms: dict = {}
total = 0
try:
conn = _sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
try:
row = conn.execute(
"""
SELECT COUNT(*)
FROM embeddings e
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = 'mempalace_drawers'
"""
).fetchone()
total = int(row[0]) if row and row[0] is not None else 0
for key, target in (("wing", wings), ("room", rooms)):
for value, count in conn.execute(
"""
SELECT em.string_value, COUNT(*)
FROM embedding_metadata em
JOIN embeddings e ON em.id = e.id
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = 'mempalace_drawers'
AND em.key = ?
AND em.string_value IS NOT NULL
GROUP BY em.string_value
""",
(key,),
):
target[value] = count
finally:
conn.close()
except _sqlite3.Error:
logger.exception("tool_status sqlite fallback read failed")
result = {
"total_drawers": total,
"wings": wings,
"rooms": rooms,
"palace_path": _config.palace_path,
"protocol": PALACE_PROTOCOL,
"aaak_dialect": AAAK_SPEC,
"vector_disabled": True,
"vector_disabled_reason": _vector_disabled_reason,
}
if _vector_capacity_status:
result["hnsw_capacity"] = {
"sqlite_count": _vector_capacity_status.get("sqlite_count"),
"hnsw_count": _vector_capacity_status.get("hnsw_count"),
"divergence": _vector_capacity_status.get("divergence"),
}
return result
def tool_status(): def tool_status():
# Run the safe sqlite/pickle probe before we touch chromadb. In the
# #1222 failure mode, opening the persistent client to call .count()
# can segfault — short-circuit to a pure-sqlite path when divergence
# is detected so status stays reachable.
db_exists = os.path.isfile(os.path.join(_config.palace_path, "chroma.sqlite3"))
_refresh_vector_disabled_flag()
if _vector_disabled:
return _tool_status_via_sqlite()
# Use create=True only when a palace DB already exists on disk -- this # Use create=True only when a palace DB already exists on disk -- this
# bootstraps the ChromaDB collection on a valid-but-empty palace without # bootstraps the ChromaDB collection on a valid-but-empty palace without
# accidentally creating a palace in a non-existent directory (#830). # accidentally creating a palace in a non-existent directory (#830).
db_exists = os.path.isfile(os.path.join(_config.palace_path, "chroma.sqlite3"))
col = _get_collection(create=db_exists) col = _get_collection(create=db_exists)
if not col: if not col:
return _no_palace() return _no_palace()
@@ -378,17 +461,6 @@ def tool_status():
"protocol": PALACE_PROTOCOL, "protocol": PALACE_PROTOCOL,
"aaak_dialect": AAAK_SPEC, "aaak_dialect": AAAK_SPEC,
} }
if _vector_disabled:
# Surface the #1222 fallback state so the AI knows search results
# are BM25-only and recommends the operator run repair.
result["vector_disabled"] = True
result["vector_disabled_reason"] = _vector_disabled_reason
if _vector_capacity_status:
result["hnsw_capacity"] = {
"sqlite_count": _vector_capacity_status.get("sqlite_count"),
"hnsw_count": _vector_capacity_status.get("hnsw_count"),
"divergence": _vector_capacity_status.get("divergence"),
}
try: try:
all_meta = _get_cached_metadata(col) all_meta = _get_cached_metadata(col)
for m in all_meta: for m in all_meta:
@@ -523,9 +595,11 @@ def tool_search(
dist = (1.0 - min_similarity) if min_similarity is not None else max_distance dist = (1.0 - min_similarity) if min_similarity is not None else max_distance
# Mitigate system prompt contamination (Issue #333) # Mitigate system prompt contamination (Issue #333)
sanitized = sanitize_query(query) sanitized = sanitize_query(query)
# Ensure the capacity probe has been run at least once before we # Ensure the vector-disabled probe has been run via the safe
# decide which path to take — _get_client triggers it on first open. # sqlite/pickle path before we touch chromadb. Calling _get_client()
_get_client() # here would defeat the fallback — it constructs a PersistentClient
# which can segfault on segment load in the #1222 failure mode.
_refresh_vector_disabled_flag()
result = search_memories( result = search_memories(
sanitized["clean_query"], sanitized["clean_query"],
palace_path=_config.palace_path, palace_path=_config.palace_path,
@@ -567,8 +641,7 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
"vector_disabled": True, "vector_disabled": True,
"vector_disabled_reason": _vector_disabled_reason, "vector_disabled_reason": _vector_disabled_reason,
"hint": ( "hint": (
"duplicate detection requires vector search; run " "duplicate detection requires vector search; run " "`mempalace repair` to restore"
"`mempalace repair rebuild` to restore"
), ),
} }
try: try:
+2 -2
View File
@@ -440,7 +440,7 @@ def status(palace_path=None) -> dict:
at a stale ``max_elements`` while sqlite keeps accumulating rows. at a stale ``max_elements`` while sqlite keeps accumulating rows.
Once the divergence is large enough, every tool call segfaults when Once the divergence is large enough, every tool call segfaults when
chromadb tries to load the undersized HNSW. Running ``mempalace chromadb tries to load the undersized HNSW. Running ``mempalace
repair status`` *before* opening the segment lets the operator repair-status`` *before* opening the segment lets the operator
discover the problem without crashing the MCP server. discover the problem without crashing the MCP server.
The check itself never opens a chromadb client and never imports The check itself never opens a chromadb client and never imports
@@ -481,7 +481,7 @@ def status(palace_path=None) -> dict:
print(f" note: {info['message']}") print(f" note: {info['message']}")
if drawers["diverged"] or closets["diverged"]: if drawers["diverged"] or closets["diverged"]:
print("\n Recommended: run `mempalace repair rebuild` to rebuild the index.") print("\n Recommended: run `mempalace repair` to rebuild the index.")
print() print()
return {"drawers": drawers, "closets": closets} return {"drawers": drawers, "closets": closets}
+29 -1
View File
@@ -427,7 +427,13 @@ def _bm25_only_via_sqlite(
if not candidate_ids: if not candidate_ids:
# No FTS hits (or no usable tokens) — pull the most recent # No FTS hits (or no usable tokens) — pull the most recent
# rows for the drawers segment so we can BM25-rank something # rows for the drawers segment so we can BM25-rank something
# rather than return empty-handed. # rather than return empty-handed. Wrapped in try/except
# because the schema may differ on legacy palaces (older
# chromadb without ``created_at``, missing ``segments``
# rows after partial restore, etc.); on schema mismatch we
# fall back to ordering by primary-key id and finally to an
# empty result rather than letting search raise.
try:
rows = conn.execute( rows = conn.execute(
""" """
SELECT e.id SELECT e.id
@@ -441,6 +447,28 @@ def _bm25_only_via_sqlite(
(max_candidates,), (max_candidates,),
).fetchall() ).fetchall()
candidate_ids = [r[0] for r in rows] candidate_ids = [r[0] for r in rows]
except sqlite3.Error:
logger.debug(
"recency-window query failed; trying id-ordered fallback",
exc_info=True,
)
try:
rows = conn.execute(
"""
SELECT e.id
FROM embeddings e
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = 'mempalace_drawers'
ORDER BY e.id DESC
LIMIT ?
""",
(max_candidates,),
).fetchall()
candidate_ids = [r[0] for r in rows]
except sqlite3.Error:
logger.debug("id-ordered fallback also failed", exc_info=True)
candidate_ids = []
if not candidate_ids: if not candidate_ids:
return { return {
+30 -2
View File
@@ -347,7 +347,7 @@ def test_repair_status_reports_diverged(tmp_path, capsys):
out = repair_status(palace_path=str(tmp_path)) out = repair_status(palace_path=str(tmp_path))
captured = capsys.readouterr().out captured = capsys.readouterr().out
assert "DIVERGED" in captured assert "DIVERGED" in captured
assert "mempalace repair rebuild" in captured assert "mempalace repair`" in captured
assert out["drawers"]["diverged"] is True assert out["drawers"]["diverged"] is True
@@ -360,4 +360,32 @@ def test_repair_status_quiet_on_healthy_palace(tmp_path, capsys):
repair_status(palace_path=str(tmp_path)) repair_status(palace_path=str(tmp_path))
captured = capsys.readouterr().out captured = capsys.readouterr().out
assert "DIVERGED" not in captured assert "DIVERGED" not in captured
assert "mempalace repair rebuild" not in captured assert "Recommended" not in captured
# ── tool_status sqlite fallback (#1222 short-circuit) ─────────────────
def test_tool_status_via_sqlite_returns_breakdown(palace_with_drawers, monkeypatch):
"""When _vector_disabled is set, tool_status reads counts from sqlite
instead of opening a chromadb client."""
from mempalace import mcp_server
# _config.palace_path is a read-only property; swap the whole object
# for a tiny stand-in so we don't have to monkey with the real
# MempalaceConfig.
class _Cfg:
palace_path = str(palace_with_drawers)
monkeypatch.setattr(mcp_server, "_config", _Cfg())
monkeypatch.setattr(mcp_server, "_vector_disabled", True)
monkeypatch.setattr(mcp_server, "_vector_disabled_reason", "test divergence")
out = mcp_server._tool_status_via_sqlite()
assert out["vector_disabled"] is True
assert out["vector_disabled_reason"] == "test divergence"
assert out["total_drawers"] == 3
# Wing breakdown comes from the seeded palace_with_drawers fixture:
# ops×2 (incident + repair runbook), design×1 (metaphor).
assert out["wings"].get("ops") == 2
assert out["wings"].get("design") == 1