merge: develop into fix/1295-repair-max-seq-id-preflight

Three conflicts, all from develop landing #1285/#1310/#1312 after this
branch was authored:

- mempalace/cli.py: keep both import sets — this branch's
  maybe_repair_poisoned_max_seq_id_before_rebuild plus develop's
  RebuildCollectionError / _close_chroma_handles / _extract_drawers /
  _rebuild_collection_via_temp added in #1285.

- mempalace/repair.py: keep this branch's
  maybe_repair_poisoned_max_seq_id_before_rebuild definition; use
  develop's rebuild_index signature with the collection_name parameter
  added in #1312. Normalized print indent to 2 spaces matching the
  rest of the file.

- tests/test_repair.py: keep both this branch's max_seq_id preflight
  tests and develop's rebuild_from_sqlite + configured-collection-name
  tests; they exercise distinct code paths and don't overlap.

Local: 1617 tests pass, ruff lint+format clean against 0.4.x CI pin.
This commit is contained in:
Igor Lins e Silva
2026-05-07 10:32:29 -03:00
49 changed files with 5341 additions and 366 deletions
+71
View File
@@ -0,0 +1,71 @@
"""Stdio UTF-8 reconfiguration helper for Windows entry points.
Python on Windows defaults stdio to the system ANSI codepage
(cp1252/cp1251/cp950 depending on locale), which mojibakes UTF-8 input
or output the moment a non-Latin character shows up. Every console
entry point that touches stdio needs to fix this on Windows -- the MCP
server, the CLI, the fact_checker `--stdin` mode -- so the
reconfigure code lives here in one place to keep the per-stream
errors policies aligned across them.
Per-stream errors policy is caller-chosen:
* MCP server uses ``strict`` on stdout/stderr because everything written
there is server-controlled JSON-RPC; any encode failure is a real bug
the operator wants loud.
* CLI / fact_checker use ``replace`` on stdout/stderr because they print
verbatim drawer text that may contain surrogate halves round-tripped
from filenames -- ``strict`` would crash mid-print.
* All callers use ``surrogateescape`` on stdin so a malformed byte from
a redirected file or a misbehaving client survives as a lone surrogate
the consumer's parser surfaces, instead of ``UnicodeDecodeError``
killing the read loop on the first bad byte.
"""
from __future__ import annotations
import sys
from typing import Callable, Optional
def reconfigure_stdio_utf8_on_windows(
*,
stdin_errors: str = "surrogateescape",
stdout_errors: str = "strict",
stderr_errors: str = "strict",
on_failure: Optional[Callable[[str, BaseException], None]] = None,
) -> None:
"""Reconfigure stdio to UTF-8 on Windows. No-op elsewhere.
Args:
stdin_errors: errors= policy for stdin.reconfigure().
stdout_errors: errors= policy for stdout.reconfigure().
stderr_errors: errors= policy for stderr.reconfigure().
on_failure: optional ``(stream_name, exc) -> None`` callback for
streams whose ``reconfigure`` raises (e.g. Jupyter-replaced
streams that lack the method-shape we expect). Defaults to a
``WARNING:`` line on the original sys.stderr.
"""
if sys.platform != "win32":
return
policies = (
("stdin", stdin_errors),
("stdout", stdout_errors),
("stderr", stderr_errors),
)
for name, errors in policies:
stream = getattr(sys, name, None)
reconfigure = getattr(stream, "reconfigure", None)
if reconfigure is None:
continue
try:
reconfigure(encoding="utf-8", errors=errors)
except Exception as exc: # noqa: BLE001 -- last-resort guard
if on_failure is not None:
on_failure(name, exc)
else:
print(
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
file=sys.stderr,
)
+299 -101
View File
@@ -1,9 +1,12 @@
"""ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation)."""
import contextlib
import datetime as _dt
import logging
import os
import pickle
import sqlite3
from numbers import Integral
from pathlib import Path
from typing import Any, Optional
@@ -29,6 +32,51 @@ _REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$c
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
# A healthy HNSW payload should keep link_lists.bin proportional to
# data_level0.bin. When link_lists.bin grows orders of magnitude larger than
# data_level0.bin, Chroma/HNSW can segfault while opening the segment even if
# index_metadata.pickle is structurally valid.
#
# The report in #1218 showed ratios above 300x, while healthy snapshots were far below 1x.
# Treat only >10x as corruption so normal flush lag or small segments do not get
# quarantined.
_HNSW_LINK_TO_DATA_MAX_RATIO = 10.0
def _hnsw_link_to_data_ratio(seg_dir: str) -> Optional[float]:
"""Return link_lists.bin / data_level0.bin size ratio for a segment.
``None`` means the ratio is not meaningful, usually because one file is
missing or data_level0.bin is empty. ``float("inf")`` means the files were
present but could not be statted safely, which should be treated as
suspicious by callers.
"""
link_path = os.path.join(seg_dir, "link_lists.bin")
data_path = os.path.join(seg_dir, "data_level0.bin")
if not (os.path.isfile(link_path) and os.path.isfile(data_path)):
return None
try:
data_size = os.path.getsize(data_path)
link_size = os.path.getsize(link_path)
except OSError:
return float("inf")
if data_size <= 0:
return None
return link_size / data_size
def _hnsw_payload_appears_sane(seg_dir: str) -> bool:
"""Return False when HNSW payload files are structurally implausible."""
ratio = _hnsw_link_to_data_ratio(seg_dir)
return ratio is None or ratio <= _HNSW_LINK_TO_DATA_MAX_RATIO
# HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
#
# With default params (batch_size=100, sync_threshold=1000, initial capacity
@@ -106,6 +154,9 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
files and quarantine_stale_hnsw would conservatively rename them
out of the way (lazy rebuild on next open recovers).
"""
if not _hnsw_payload_appears_sane(seg_dir):
return False
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
if not os.path.isfile(meta_path):
# No metadata file yet — segment hasn't flushed (fresh / empty).
@@ -127,64 +178,35 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
"""Rename HNSW segment dirs that are both stale-by-mtime AND fail an
integrity sniff-test.
"""Rename HNSW segment dirs that look unsafe to open.
Catches the segfault failure mode from #823 (semantic search stale
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
``count()`` with chromadb 1.5.5), and acknowledged as by-design at
chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
rebuild lazily on next open instead of segfaulting.
This catches two classes of HNSW corruption before ChromaDB opens the
native segment reader:
Two-stage check:
1. stale-by-mtime segments whose ``index_metadata.pickle`` fails the
existing format sniff-test;
2. structurally impossible HNSW payloads where ``link_lists.bin`` is much
larger than ``data_level0.bin``.
1. **mtime gate.** If ``chroma.sqlite3`` is less than
``stale_seconds`` newer than the segment's ``data_level0.bin``,
skip — chromadb is in normal write-path territory.
The second check is intentionally not gated by mtime. A segment with a
300x link/data ratio is unsafe regardless of whether its mtime is recent;
letting Chroma open it can SIGSEGV before Python fallback code runs.
2. **Integrity gate** (``_segment_appears_healthy``). Even when the
mtime gap exceeds the threshold, a segment whose
``index_metadata.pickle`` passes a format sniff-test is healthy:
chromadb 1.5.x flushes HNSW state asynchronously and a clean
shutdown does NOT force-flush, so the on-disk HNSW is *always*
somewhat older than ``chroma.sqlite3``. Production observation
(2026-04-26 disks daemon): three of three segments quarantined
on every cold start, with 538-557s gaps, leaving the 151K-drawer
palace with vector_ranked=0 until rebuild. Renaming a healthy
segment based on mtime alone destroys a valid index — chromadb
creates an empty replacement, orphaning every drawer in sqlite
from vector recall until the operator runs ``mempalace repair
--mode rebuild`` (15+ min on a 151K palace).
Only segments that pass stage 1 (suspiciously stale) AND fail stage
2 (metadata file truncated, zero-filled, or absent-with-data) are
renamed to ``<uuid>.drift-<timestamp>``. The original directory is
renamed, not deleted, so recovery remains possible if the heuristic
misfires.
The default threshold (5 min) is advisory under daemon-strict; the
integrity gate is what actually distinguishes corruption from flush
lag. The threshold still matters for the cross-machine replication
case (#823), where it bounds how stale a Syncthing-replicated
segment can be before we look harder at it.
Args:
palace_path: path to the palace directory containing ``chroma.sqlite3``
stale_seconds: minimum mtime gap to *consider* a segment for quarantine
Returns:
List of paths that were quarantined (empty if nothing actually
looked corrupt).
The original directory is renamed, not deleted, so recovery remains
possible if the heuristic ever misfires.
"""
db_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path):
return []
try:
sqlite_mtime = os.path.getmtime(db_path)
except OSError:
return []
moved: list[str] = []
try:
entries = os.listdir(palace_path)
except OSError:
@@ -193,29 +215,34 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
for name in entries:
if "-" not in name or name.startswith(".") or ".drift-" in name:
continue
seg_dir = os.path.join(palace_path, name)
if not os.path.isdir(seg_dir):
continue
hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
if not os.path.isfile(hnsw_bin):
continue
try:
hnsw_mtime = os.path.getmtime(hnsw_bin)
except OSError:
continue
if sqlite_mtime - hnsw_mtime < stale_seconds:
payload_ratio = _hnsw_link_to_data_ratio(seg_dir)
payload_corrupt = payload_ratio is not None and payload_ratio > _HNSW_LINK_TO_DATA_MAX_RATIO
if not payload_corrupt and sqlite_mtime - hnsw_mtime < stale_seconds:
continue
# Stage 2: integrity gate. mtime drift is necessary but not
# sufficient — chromadb's async flush makes drift the steady-
# state condition. A healthy segment metadata file proves
# chromadb can open the segment without segfault; don't
# quarantine a healthy index.
if _segment_appears_healthy(seg_dir):
# Stage 2: integrity gate. Mtime drift alone is not corruption because
# Chroma flushes HNSW asynchronously. A healthy metadata file proves the
# ordinary stale-by-mtime case is just flush lag.
if not payload_corrupt and _segment_appears_healthy(seg_dir):
logger.info(
"HNSW mtime gap %.0fs on %s exceeds threshold but segment "
"metadata file is intact — flush-lag, not corruption. "
"Leaving in place.",
"metadata and payload size are intact — flush-lag, not "
"corruption. Leaving in place.",
sqlite_mtime - hnsw_mtime,
seg_dir,
)
@@ -223,17 +250,30 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
target = f"{seg_dir}.drift-{stamp}"
if payload_corrupt:
reason = (
f"link_lists.bin/data_level0.bin ratio {payload_ratio:.1f}x "
f"exceeds {_HNSW_LINK_TO_DATA_MAX_RATIO:.1f}x"
)
else:
reason = (
f"sqlite {sqlite_mtime - hnsw_mtime:.0f}s newer than HNSW "
"and integrity check failed"
)
try:
os.rename(seg_dir, target)
moved.append(target)
logger.warning(
"Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
"Quarantined corrupt HNSW segment %s (%s); renamed to %s",
seg_dir,
sqlite_mtime - hnsw_mtime,
reason,
target,
)
except OSError:
logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
return moved
@@ -489,22 +529,17 @@ def hnsw_capacity_status(palace_path: str, collection_name: str = "mempalace_dra
divergence_floor = max(_HNSW_DIVERGENCE_FALLBACK_FLOOR, 2 * sync_threshold)
if hnsw_count is None:
# No pickle yet — segment hasn't persisted metadata. Could be
# fresh-but-unflushed (normal) or interrupted-mid-flush (bad).
# We can't distinguish without the pickle, so only flag
# divergence when sqlite holds clearly more than two flush
# windows worth — same threshold as the with-pickle path.
if sqlite_count > divergence_floor:
out["status"] = "diverged"
out["diverged"] = True
out["divergence"] = sqlite_count
out["message"] = (
f"sqlite holds {sqlite_count:,} embeddings but the HNSW segment "
"has never flushed metadata — vector search will return nothing "
"until the segment is rebuilt. Run `mempalace repair`."
)
else:
out["message"] = "HNSW segment metadata not yet flushed; skipping"
# No pickle yet, so this probe cannot measure HNSW capacity.
# Chroma 1.5.x can have binary HNSW files without a flushed
# metadata pickle; absence of the pickle alone is not proof that
# vector search is unusable or dangerous. Keep the status unknown
# so MCP does not globally disable vectors on an inconclusive
# signal. Corrupt/invalid metadata, when present, is handled by
# quarantine_invalid_hnsw_metadata before Chroma opens.
out["message"] = (
"HNSW capacity unavailable: metadata has not been flushed; "
"leaving vector search enabled"
)
return out
divergence = sqlite_count - hnsw_count
@@ -591,6 +626,97 @@ def _pin_hnsw_threads(collection) -> None:
_BLOB_FIX_MARKER = ".blob_seq_ids_migrated"
def _valid_dimensionality(value: object) -> bool:
return isinstance(value, Integral) and not isinstance(value, bool) and int(value) > 0
def _persisted_metadata_fields(obj: object) -> tuple[object, object]:
if isinstance(obj, dict):
return obj.get("dimensionality"), obj.get("id_to_label")
return getattr(obj, "dimensionality", None), getattr(obj, "id_to_label", None)
def quarantine_invalid_hnsw_metadata(palace_path: str) -> list[str]:
"""Quarantine segment dirs whose ``index_metadata.pickle`` is unreadable or invalid.
Chroma's persisted HNSW metadata is untrusted disk state. If a segment has
labels but no valid positive dimensionality, current Chroma versions can
accept the pickle and crash later in the Rust loader. We rename the entire
segment out of the way before ``PersistentClient`` opens so Chroma can
rebuild cleanly instead of touching known-bad metadata.
"""
try:
entries = os.listdir(palace_path)
except OSError:
return []
moved: list[str] = []
for name in entries:
if "-" not in name or name.startswith(".") or ".drift-" in name or ".corrupt-" in name:
continue
seg_dir = os.path.join(palace_path, name)
if not os.path.isdir(seg_dir):
continue
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
if not os.path.isfile(meta_path):
continue
reason = None
try:
persisted = _SafePersistentDataUnpickler.load(meta_path)
except (EOFError, OSError):
logger.debug(
"Skipping invalid-HNSW quarantine for transient metadata read in %s",
meta_path,
exc_info=True,
)
continue
except pickle.UnpicklingError as exc:
if "truncated" in str(exc).lower() or "ran out of input" in str(exc).lower():
logger.debug(
"Skipping invalid-HNSW quarantine for transient metadata read in %s",
meta_path,
exc_info=True,
)
continue
reason = f"invalid index_metadata.pickle: {exc}"
except Exception as exc:
reason = f"invalid index_metadata.pickle: {exc}"
else:
if not isinstance(persisted, dict) and not (
hasattr(persisted, "dimensionality") or hasattr(persisted, "id_to_label")
):
reason = f"unrecognized index_metadata.pickle payload: {type(persisted).__name__}"
else:
dimensionality, id_to_label = _persisted_metadata_fields(persisted)
if id_to_label is not None and not isinstance(id_to_label, dict):
reason = f"invalid id_to_label type {type(id_to_label).__name__}"
else:
has_labels = bool(id_to_label)
if has_labels and not _valid_dimensionality(dimensionality):
reason = (
"labels present but dimensionality is missing or invalid "
f"({dimensionality!r})"
)
elif dimensionality is not None and not _valid_dimensionality(dimensionality):
reason = f"invalid dimensionality {dimensionality!r}"
if reason is None:
continue
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
target = f"{seg_dir}.corrupt-{stamp}"
try:
os.rename(seg_dir, target)
moved.append(target)
logger.warning("Quarantined invalid HNSW metadata in %s: %s", seg_dir, reason)
except OSError:
logger.exception("Failed to quarantine invalid HNSW metadata in %s", seg_dir)
return moved
def _fix_blob_seq_ids(palace_path: str) -> None:
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
@@ -676,11 +802,58 @@ def _as_list(v: Any) -> list:
return [v]
class ChromaCollection(BaseCollection):
"""Thin adapter translating ChromaDB dict returns into typed results."""
def _close_client(client) -> None:
"""Call ``PersistentClient.close()`` if available, swallow otherwise.
def __init__(self, collection):
chromadb 1.5.x exposes ``Client.close()`` to release rust-side SQLite
file locks; older versions relied on GC. Try/except keeps forward-compat.
"""
if client is None:
return
try:
client.close()
except Exception:
logger.debug("client.close() unavailable or failed", exc_info=True)
class ChromaCollection(BaseCollection):
"""Thin adapter translating ChromaDB dict returns into typed results.
When ``palace_path`` is set, all write methods (``add``, ``upsert``,
``update``, ``delete``) acquire ``mine_palace_lock(palace_path)`` for the
duration of the underlying chromadb call. This serializes MCP and other
direct-backend writers against ``mempalace mine`` and against each other,
closing the race between concurrent writers that triggers ChromaDB's
multi-threaded HNSW corruption (#974/#965).
The lock is the same primitive used by ``miner.mine()`` so re-entrant
acquisition from inside the mine pipeline (mine -> _mine_body ->
collection.upsert) is short-circuited by the per-thread guard inside
``mine_palace_lock`` — no self-deadlock.
``palace_path=None`` disables the wrapping, preserving the legacy
no-lock behaviour for callers that construct a ``ChromaCollection``
directly without going through ``ChromaBackend``.
"""
def __init__(self, collection, palace_path: Optional[str] = None):
self._collection = collection
self._palace_path = palace_path
@contextlib.contextmanager
def _write_lock(self):
"""Acquire ``mine_palace_lock`` for the configured palace, if any.
No-op (yields immediately) when ``self._palace_path`` is None.
"""
if self._palace_path is None:
yield
return
# Late import — palace.py imports ChromaBackend from this module.
from ..palace import mine_palace_lock
with mine_palace_lock(self._palace_path):
yield
# ------------------------------------------------------------------
# Writes
@@ -692,7 +865,8 @@ class ChromaCollection(BaseCollection):
kwargs["metadatas"] = metadatas
if embeddings is not None:
kwargs["embeddings"] = embeddings
self._collection.add(**kwargs)
with self._write_lock():
self._collection.add(**kwargs)
def upsert(self, *, documents, ids, metadatas=None, embeddings=None):
kwargs: dict[str, Any] = {"documents": documents, "ids": ids}
@@ -700,7 +874,8 @@ class ChromaCollection(BaseCollection):
kwargs["metadatas"] = metadatas
if embeddings is not None:
kwargs["embeddings"] = embeddings
self._collection.upsert(**kwargs)
with self._write_lock():
self._collection.upsert(**kwargs)
def update(
self,
@@ -719,7 +894,8 @@ class ChromaCollection(BaseCollection):
kwargs["metadatas"] = metadatas
if embeddings is not None:
kwargs["embeddings"] = embeddings
self._collection.update(**kwargs)
with self._write_lock():
self._collection.update(**kwargs)
# ------------------------------------------------------------------
# Reads
@@ -863,7 +1039,8 @@ class ChromaCollection(BaseCollection):
kwargs["ids"] = ids
if where is not None:
kwargs["where"] = where
self._collection.delete(**kwargs)
with self._write_lock():
self._collection.delete(**kwargs)
def count(self):
return self._collection.count()
@@ -977,7 +1154,7 @@ class ChromaBackend(BaseBackend):
db_path = os.path.join(palace_path, "chroma.sqlite3")
# DB was present when cache was built but is now missing → invalidate.
if cached is not None and not os.path.isfile(db_path):
self._clients.pop(palace_path, None)
_close_client(self._clients.pop(palace_path, None))
self._freshness.pop(palace_path, None)
cached = None
cached_inode, cached_mtime = 0, 0.0
@@ -993,6 +1170,13 @@ class ChromaBackend(BaseBackend):
)
if cached is None or inode_changed or mtime_changed or mtime_appeared:
# An inode swap means we are reopening a different physical DB
# (post-restore, fresh palace at the same path, etc.); drop the
# per-process gate so the quarantine pre-checks run again
# against the new disk state instead of trusting cached "we
# already cleaned this path" credit from the prior inode.
if inode_changed:
ChromaBackend._quarantined_paths.discard(palace_path)
ChromaBackend._prepare_palace_for_open(palace_path)
cached = chromadb.PersistentClient(path=palace_path)
self._clients[palace_path] = cached
@@ -1006,26 +1190,27 @@ class ChromaBackend(BaseBackend):
# Public static helpers (legacy; prefer :meth:`get_collection`)
# ------------------------------------------------------------------
# Per-process record of palaces that have already had quarantine_stale_hnsw
# invoked at least once. The proactive drift check is a *cold-start*
# protection — it catches HNSW segments that arrived stale relative to
# ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore,
# crashed-mid-write). Once a long-running process has opened the palace
# cleanly, re-firing on every reconnect is a *runtime thrash*: the
# daemon's own writes bump sqlite mtime but HNSW flushes batch on
# chromadb's internal cadence, so the mtime gap naturally exceeds the
# threshold under steady write load even though nothing is corrupt.
# Per-process record of palaces that have already had the cold-start
# quarantine invoked at least once. The proactive HNSW checks are a
# *cold-start* protection — they catch segments that arrive stale relative
# to ``chroma.sqlite3`` or invalid on disk (e.g. cross-machine replication,
# partial restore, crashed-mid-write). Once a long-running process has
# opened the palace cleanly, re-firing the stale check on every reconnect
# is a *runtime thrash*: the daemon's own writes bump sqlite mtime but HNSW
# flushes batch on chromadb's internal cadence, so the mtime gap naturally
# exceeds the threshold under steady write load even though nothing is
# corrupt.
# Real runtime drift is still handled — palace-daemon's ``_auto_repair``
# calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
# which bypasses this gate.
#
# Thread-safety: this set is mutated without a lock. Two concurrent
# ``make_client()`` calls for the same palace can both pass the
# membership check and both invoke ``quarantine_stale_hnsw``. That's
# safe because the function is idempotent (mtime check + timestamped
# rename of distinct directories), so the worst-case race produces
# one redundant rename attempt that no-ops. Idempotency is the
# safety property; locking would add cost without correctness gain.
# membership check and both invoke the cold-start quarantine. That's
# safe because the functions are idempotent (mtime checks + timestamped
# rename of distinct directories), so the worst-case race produces one
# redundant rename attempt that no-ops. Idempotency is the safety
# property; locking would add cost without correctness gain.
_quarantined_paths: set[str] = set()
@staticmethod
@@ -1033,12 +1218,16 @@ class ChromaBackend(BaseBackend):
"""Run the pre-open safety pass shared by :meth:`make_client` and
:meth:`_client`.
Two steps, both required before constructing a ``PersistentClient``:
Three steps, all required before constructing a ``PersistentClient``:
1. ``_fix_blob_seq_ids`` — repairs the BLOB seq_id quirk that bites
certain chromadb migrations.
2. ``quarantine_stale_hnsw`` — gated by :attr:`_quarantined_paths` so
it fires once per palace per process. This is the SIGSEGV
2. ``quarantine_invalid_hnsw_metadata`` — renames aside any HNSW
``index_metadata.pickle`` that fails to load, so chromadb opens
against an empty index instead of crashing on the unloadable
pickle (#1266 / PR #1285).
3. ``quarantine_stale_hnsw`` — also gated by :attr:`_quarantined_paths`
so it fires once per palace per process. This is the SIGSEGV
prevention path for stale HNSW segments (see #1121, #1132, #1263);
wiring it through this helper means CLI mining, search, repair,
and status all benefit, not just the legacy ``make_client``
@@ -1050,6 +1239,7 @@ class ChromaBackend(BaseBackend):
"""
_fix_blob_seq_ids(palace_path)
if palace_path not in ChromaBackend._quarantined_paths:
quarantine_invalid_hnsw_metadata(palace_path)
quarantine_stale_hnsw(palace_path)
ChromaBackend._quarantined_paths.add(palace_path)
@@ -1061,7 +1251,7 @@ class ChromaBackend(BaseBackend):
own client cache. New code should obtain a collection through
:meth:`get_collection` which manages caching internally.
Quarantines stale HNSW segments **once per palace per process**. See
Quarantines HNSW segments **once per palace per process**. See
:attr:`_quarantined_paths` for the rationale (cold-start protection
vs. runtime thrash on steady-write daemons).
"""
@@ -1131,17 +1321,25 @@ class ChromaBackend(BaseBackend):
else:
collection = client.get_collection(collection_name, **ef_kwargs)
_pin_hnsw_threads(collection)
return ChromaCollection(collection)
return ChromaCollection(collection, palace_path=palace_path)
def close_palace(self, palace) -> None:
"""Drop cached handles for ``palace``. Accepts ``PalaceRef`` or legacy path str."""
"""Drop cached handles for ``palace`` and release its SQLite file lock.
Accepts ``PalaceRef`` or legacy path str. chromadb's rust-side file
lock is held until ``PersistentClient.close()`` is called, so plain
dict eviction would leave the palace path unreopenable and
unremovable in the same process.
"""
path = palace.local_path if isinstance(palace, PalaceRef) else palace
if path is None:
return
self._clients.pop(path, None)
_close_client(self._clients.pop(path, None))
self._freshness.pop(path, None)
def close(self) -> None:
for client in self._clients.values():
_close_client(client)
self._clients.clear()
self._freshness.clear()
self._closed = True
@@ -1182,7 +1380,7 @@ class ChromaBackend(BaseBackend):
},
**ef_kwargs,
)
return ChromaCollection(collection)
return ChromaCollection(collection, palace_path=palace_path)
def _normalize_get_collection_args(args, kwargs):
+130 -30
View File
@@ -655,13 +655,19 @@ def cmd_repair(args):
from .backends.chroma import ChromaBackend
from .migrate import confirm_destructive_action, contains_palace_database
from .repair import (
RebuildCollectionError,
TruncationDetected,
_close_chroma_handles,
_extract_drawers,
_rebuild_collection_via_temp,
check_extraction_safety,
maybe_repair_poisoned_max_seq_id_before_rebuild,
)
config = MempalaceConfig()
collection_name = config.collection_name
palace_path = os.path.abspath(
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
os.path.expanduser(args.palace) if args.palace else config.palace_path
)
if getattr(args, "mode", "legacy") == "max-seq-id":
@@ -677,6 +683,57 @@ def cmd_repair(args):
)
return
if getattr(args, "mode", "legacy") == "from-sqlite":
from .migrate import confirm_destructive_action
from .repair import RebuildPartialError, rebuild_from_sqlite
source_path = getattr(args, "source", None)
source_path = (
os.path.abspath(os.path.expanduser(source_path)) if source_path else palace_path
)
archive_existing = getattr(args, "archive_existing", False)
# Gate any path that touches the user's existing palace dir
# behind confirm_destructive_action. The legacy mode already
# gates; from-sqlite needs the same protection because:
# (a) --archive-existing renames the existing palace,
# (b) --source PATH writes into --palace dir which the user
# may not realize is also a palace.
# No prompt when source != dest AND dest does not exist (pure
# extract-into-fresh-dir case is non-destructive to existing
# palaces).
is_destructive_to_dest = source_path == palace_path or os.path.exists(palace_path)
if is_destructive_to_dest and not confirm_destructive_action(
"Rebuild from SQLite", palace_path, assume_yes=getattr(args, "yes", False)
):
return
try:
counts = rebuild_from_sqlite(
source_palace=source_path,
dest_palace=palace_path,
archive_existing_dest=archive_existing,
)
except RebuildPartialError as exc:
# The error itself was already printed by rebuild_from_sqlite
# with recovery instructions; surface a non-zero exit so
# scripts and CI gates see the failure.
print(
"\n Rebuild partial — see message above. "
f"Failed in collection: {exc.failed_collection}"
)
sys.exit(1)
# An empty counts dict is rebuild_from_sqlite's documented signal
# for a validation refusal (missing source, existing dest,
# in-place without --archive-existing). The library already
# printed an actionable message; exit non-zero so unattended
# scripts/CI distinguish "invalid inputs" from a successful
# rebuild that legitimately found zero rows (which still returns
# a populated dict with 0-valued counts).
if not counts:
sys.exit(1)
return
db_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.isdir(palace_path):
@@ -704,7 +761,7 @@ def cmd_repair(args):
# Try to read existing drawers
try:
col = backend.get_collection(palace_path, "mempalace_drawers")
col = backend.get_collection(palace_path, collection_name)
total = col.count()
print(f" Drawers found: {total}")
except Exception as e:
@@ -724,18 +781,7 @@ def cmd_repair(args):
# Extract all drawers in batches
print("\n Extracting drawers...")
batch_size = 5000
all_ids = []
all_docs = []
all_metas = []
offset = 0
while offset < total:
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
if not batch["ids"]:
break
all_ids.extend(batch["ids"])
all_docs.extend(batch["documents"])
all_metas.extend(batch["metadatas"])
offset += len(batch["ids"])
all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
print(f" Extracted {len(all_ids)} drawers")
# ── #1208 guard ──────────────────────────────────────────────────
@@ -750,12 +796,12 @@ def cmd_repair(args):
palace_path,
len(all_ids),
confirm_truncation_ok=getattr(args, "confirm_truncation_ok", False),
collection_name=collection_name,
)
except TruncationDetected as e:
print(e.message)
return
# Backup and rebuild
palace_path = os.path.normpath(palace_path)
backup_path = palace_path + ".backup"
if os.path.exists(backup_path):
@@ -769,18 +815,34 @@ def cmd_repair(args):
print(f" Backing up to {backup_path}...")
shutil.copytree(palace_path, backup_path)
print(" Rebuilding collection...")
backend.delete_collection(palace_path, "mempalace_drawers")
new_col = backend.create_collection(palace_path, "mempalace_drawers")
filed = 0
for i in range(0, len(all_ids), batch_size):
batch_ids = all_ids[i : i + batch_size]
batch_docs = all_docs[i : i + batch_size]
batch_metas = all_metas[i : i + batch_size]
new_col.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
filed += len(batch_ids)
print(f" Re-filed {filed}/{len(all_ids)} drawers...")
try:
filed = _rebuild_collection_via_temp(
backend,
palace_path,
all_ids,
all_docs,
all_metas,
batch_size,
collection_name=collection_name,
progress=print,
)
except RebuildCollectionError as e:
print(f" Repair failed: {e}")
if getattr(e, "live_replaced", False):
print(" Live collection was already replaced; restoring from backup...")
try:
_close_chroma_handles(palace_path, backend=backend)
if os.path.exists(palace_path):
shutil.rmtree(palace_path)
shutil.copytree(backup_path, palace_path)
print(f" Restore complete from backup: {backup_path}")
except Exception as restore_error:
print(f" Automatic restore failed: {restore_error}")
print(" Manual recovery required:")
print(f" 1. Remove or rename the broken directory: {palace_path}")
print(f" 2. Restore the backup directory to: {palace_path}")
print(f" Backup location: {backup_path}")
sys.exit(1)
print(f"\n Repair complete. {filed} drawers rebuilt.")
print(f" Backup saved at {backup_path}")
@@ -948,7 +1010,25 @@ def cmd_compress(args):
print(" (dry run -- nothing stored)")
def _reconfigure_stdio_utf8_on_windows():
"""Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI.
Thin wrapper around the shared helper in ``mempalace._stdio``. The CLI
overrides stdout/stderr to ``replace`` because ``mempalace search``
prints verbatim drawer text that may carry surrogate halves
round-tripped from filenames -- ``strict`` would crash mid-print and
lose the rest of the search result block. stdin keeps the default
``surrogateescape`` so a redirected non-UTF-8 file does not kill the
read on the first bad byte.
"""
from ._stdio import reconfigure_stdio_utf8_on_windows
reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
def main():
_reconfigure_stdio_utf8_on_windows()
version_label = f"MemPalace {__version__}"
parser = argparse.ArgumentParser(
description="MemPalace — Give your AI a memory. No API key required.",
@@ -1208,11 +1288,31 @@ def main():
)
p_repair.add_argument(
"--mode",
choices=["legacy", "max-seq-id"],
choices=["legacy", "max-seq-id", "from-sqlite"],
default="legacy",
help=(
"legacy: full-palace rebuild (default). "
"max-seq-id: un-poison max_seq_id rows corrupted by the legacy 0.6.x shim."
"legacy: full-palace rebuild via the chromadb client (default). "
"max-seq-id: un-poison max_seq_id rows corrupted by the legacy 0.6.x shim. "
"from-sqlite: rebuild by reading rows directly from chroma.sqlite3, "
"bypassing the chromadb client. Use when legacy mode bails because the "
"chromadb client cannot open the collection."
),
)
p_repair.add_argument(
"--source",
default=None,
help=(
"Source palace path for --mode from-sqlite (defaults to --palace). "
"Use when extracting from an archived corrupt palace into a new location."
),
)
p_repair.add_argument(
"--archive-existing",
action="store_true",
help=(
"For --mode from-sqlite when --source equals --palace: rename the "
"existing palace to <palace>.pre-rebuild-<timestamp> before "
"rebuilding so the corrupt copy is preserved."
),
)
p_repair.add_argument(
+31 -11
View File
@@ -40,6 +40,7 @@ import json
import os
import re
import time
import urllib.parse
import urllib.request
import urllib.error
from datetime import datetime
@@ -101,6 +102,14 @@ class LLMConfig:
self.endpoint = (endpoint or os.environ.get("LLM_ENDPOINT", "")).rstrip("/")
self.key = key or os.environ.get("LLM_KEY", "")
self.model = model or os.environ.get("LLM_MODEL", "")
if self.endpoint:
# Privacy-by-architecture: reject file:// and other non-HTTP schemes
# so a misconfigured endpoint cannot exfiltrate local files.
scheme = urllib.parse.urlparse(self.endpoint).scheme.lower()
if scheme not in ("http", "https"):
raise ValueError(
f"LLM_ENDPOINT must use http:// or https:// (got scheme {scheme!r})"
)
def missing(self) -> list:
missing = []
@@ -221,17 +230,28 @@ def regenerate_closets(
print("No drawers in palace.")
return {"processed": 0}
all_data = drawers_col.get(limit=total, include=["documents", "metadatas"])
by_source = {}
for doc_id, doc, meta in zip(all_data["ids"], all_data["documents"], all_data["metadatas"]):
source = meta.get("source_file", "unknown")
w = meta.get("wing", "")
if wing and w != wing:
continue
if source not in by_source:
by_source[source] = {"drawer_ids": [], "content": [], "meta": meta}
by_source[source]["drawer_ids"].append(doc_id)
by_source[source]["content"].append(doc)
# Paginate the fetch — a single get(limit=total, ...) blows through
# SQLite's SQLITE_MAX_VARIABLE_NUMBER (32766) on large palaces and
# crashes inside chromadb (see #802, #850, #1073).
by_source: dict = {}
batch_size = 5000
offset = 0
while offset < total:
batch = drawers_col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
ids = batch["ids"]
if not ids:
break
for doc_id, doc, meta in zip(ids, batch["documents"], batch["metadatas"]):
meta = meta or {}
source = meta.get("source_file", "unknown")
w = meta.get("wing", "")
if wing and w != wing:
continue
if source not in by_source:
by_source[source] = {"drawer_ids": [], "content": [], "meta": meta}
by_source[source]["drawer_ids"].append(doc_id)
by_source[source]["content"].append(doc)
offset += len(ids)
sources = list(by_source.keys())
if sample > 0:
+40
View File
@@ -7,6 +7,7 @@ Priority: env vars > config file (~/.mempalace/config.json) > defaults
import json
import os
import re
from functools import lru_cache
from pathlib import Path
@@ -81,6 +82,38 @@ def sanitize_kg_value(value: str, field_name: str = "value") -> str:
return value
# ISO-8601 date validator for knowledge-graph temporal parameters
# (as_of, valid_from, valid_to, ended). Parameterized queries already
# prevent SQL injection, but unvalidated date strings silently miss
# every row — callers cannot distinguish "no fact at this time" from
# "your date format was unrecognized." Require full YYYY-MM-DD: KG
# queries compare TEXT dates lexicographically, so partials like "2026"
# would re-introduce silent empty results (e.g. "2026-01-01" <= "2026"
# is False), defeating the purpose of validation.
_ISO_DATE_RE = re.compile(r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$")
def sanitize_iso_date(value, field_name: str = "date"):
"""Validate an ISO-8601 date string, accepting None or empty as-is.
Accepts only ``YYYY-MM-DD``. Raises ValueError on any other
non-empty input so the MCP layer can surface a clear error to the
caller instead of silently returning empty results. Partial dates
(``YYYY``, ``YYYY-MM``) are rejected because KG queries compare
TEXT dates lexicographically and would silently exclude valid facts.
"""
if value is None or value == "":
return value
if not isinstance(value, str):
raise ValueError(f"{field_name} must be a string")
value = value.strip()
if not _ISO_DATE_RE.match(value):
raise ValueError(
f"{field_name}={value!r} is not a valid ISO-8601 date " f"(expected YYYY-MM-DD)"
)
return value
def sanitize_content(value: str, max_length: int = 100_000) -> str:
"""Validate drawer/diary content length."""
if not isinstance(value, str) or not value.strip():
@@ -95,6 +128,13 @@ def sanitize_content(value: str, max_length: int = 100_000) -> str:
DEFAULT_PALACE_PATH = os.path.expanduser("~/.mempalace/palace")
DEFAULT_COLLECTION_NAME = "mempalace_drawers"
@lru_cache(maxsize=1)
def get_configured_collection_name() -> str:
"""Return the configured drawer collection name without repeated config-file reads."""
return MempalaceConfig().collection_name
DEFAULT_TOPIC_WINGS = [
"emotions",
"consciousness",
+4 -1
View File
@@ -11,6 +11,7 @@ Same palace as project mining. Different ingest strategy.
import os
import sys
import hashlib
import logging
from pathlib import Path
from datetime import datetime
from collections import defaultdict
@@ -24,6 +25,8 @@ from .palace import (
mine_lock,
)
logger = logging.getLogger("mempalace_mcp")
# Cached hall keywords — avoids re-reading config per drawer
_HALL_KEYWORDS_CACHE = None
@@ -331,7 +334,7 @@ def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extr
try:
collection.delete(where={"source_file": source_file})
except Exception:
pass
logger.debug("Stale-drawer purge failed for %s", source_file, exc_info=True)
# Batch chunks into bounded upserts so large transcripts keep most of
# the embedding speedup without one huge Chroma/SQLite request. Keep
+1 -1
View File
@@ -89,7 +89,7 @@ def dedup_source_group(col, drawer_ids, threshold=DEFAULT_THRESHOLD, dry_run=Tru
kept = []
to_delete = []
for did, doc, meta in items:
for did, doc, _meta in items:
if not doc or len(doc) < 20:
to_delete.append(did)
continue
+1 -1
View File
@@ -873,7 +873,7 @@ class Dialect:
for date_key in sorted(by_date.keys()):
lines.append(f"=MOMENTS[{date_key}]=")
for z, fnum in by_date[date_key]:
for z, _fnum in by_date[date_key]:
entities = []
for p in z.get("people", []):
code = self.encode_entity(p)
+27 -2
View File
@@ -16,6 +16,7 @@ Usage:
"""
import json
import os
import re
import urllib.request
import urllib.parse
@@ -320,11 +321,35 @@ class EntityRegistry:
self._path.parent.chmod(0o700)
except (OSError, NotImplementedError):
pass
self._path.write_text(json.dumps(self._data, indent=2), encoding="utf-8")
# Atomic write: serialize to a sibling temp file in the same dir
# (so os.replace stays on one filesystem), fsync, then rename over
# the target. A crash mid-write leaves the previous registry intact
# instead of a half-written file or an empty file from the truncate.
payload = json.dumps(self._data, indent=2)
tmp_path = self._path.with_name(self._path.name + ".tmp")
with open(tmp_path, "w", encoding="utf-8") as f:
f.write(payload)
f.flush()
os.fsync(f.fileno())
try:
self._path.chmod(0o600)
tmp_path.chmod(0o600)
except (OSError, NotImplementedError):
pass
os.replace(tmp_path, self._path)
# On ext4 (and similar) the rename's durability across power loss
# requires an additional fsync on the parent directory. Without it,
# the kernel can ack the rename and a crash reverts to the state
# where the temp file is present and the target is at the old version.
try:
dir_fd = os.open(str(self._path.parent), os.O_RDONLY)
try:
os.fsync(dir_fd)
finally:
os.close(dir_fd)
except OSError:
# Windows and some special filesystems reject directory fds — they
# have different durability semantics on rename anyway.
pass
@staticmethod
def _empty() -> dict:
+20
View File
@@ -27,6 +27,7 @@ Usage:
from __future__ import annotations
import logging
import os
import re
from datetime import datetime, timezone
@@ -35,6 +36,8 @@ from datetime import datetime, timezone
# ~/.mempalace/known_entities.json on every check_text call.
from .miner import _load_known_entities_raw
logger = logging.getLogger("mempalace_mcp")
# Narrow detection patterns — parse "X is Y's Z" and "X's Z is Y".
# Names are captured greedily as word sequences (letters + optional
@@ -214,6 +217,7 @@ def _check_kg_contradictions(text: str, palace_path: str) -> list:
try:
facts = kg.query_entity(subject, direction="outgoing")
except Exception:
logger.debug("KG lookup failed for subject %r", subject, exc_info=True)
continue
if not facts:
continue
@@ -303,11 +307,27 @@ def _edit_distance(s1: str, s2: str) -> int:
return prev[-1]
def _reconfigure_stdio_utf8_on_windows():
"""Decode --stdin payload as UTF-8 on Windows.
Thin wrapper around the shared helper in ``mempalace._stdio``. Mirrors
the primary CLI policy: stdout/stderr use ``replace`` because
extracted fact text can include surrogate halves round-tripped from
filenames -- ``strict`` would raise UnicodeEncodeError mid-print.
stdin keeps the default ``surrogateescape``.
"""
from ._stdio import reconfigure_stdio_utf8_on_windows
reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
if __name__ == "__main__":
import argparse
import json
import sys
_reconfigure_stdio_utf8_on_windows()
parser = argparse.ArgumentParser(
description="Check text against known facts in the MemPalace palace.",
epilog="Exits 0 when no issues found, 1 when one or more issues detected.",
+28
View File
@@ -16,6 +16,23 @@ from pathlib import Path
SAVE_INTERVAL = 15
STATE_DIR = Path.home() / ".mempalace" / "hook_state"
PALACE_ROOT = Path.home() / ".mempalace"
def _palace_root_exists() -> bool:
"""User-removable kill-switch.
If ~/.mempalace/ does not exist, the user has explicitly cleared it.
All hook side effects (logging, state dir creation, mining, ingestion)
must respect this and short-circuit BEFORE touching disk — including
before logging the short-circuit itself.
Uses ``is_dir()`` rather than ``exists()`` so a stray regular file at
``~/.mempalace`` (or a broken symlink) is treated as absent — otherwise
the kill-switch would be bypassed and ``STATE_DIR.mkdir()`` would later
crash on ``NotADirectoryError``.
"""
return PALACE_ROOT.is_dir()
def _mempalace_python() -> str:
@@ -142,6 +159,8 @@ _state_dir_initialized = False
def _log(message: str):
"""Append to hook state log file."""
if not _palace_root_exists():
return # User removed the palace; do not recreate by logging
global _state_dir_initialized
try:
if not _state_dir_initialized:
@@ -550,6 +569,9 @@ def _wing_from_transcript_path(transcript_path: str) -> str:
def hook_stop(data: dict, harness: str):
"""Stop hook: block every N messages for auto-save."""
if not _palace_root_exists():
_output({})
return
parsed = _parse_harness_input(data, harness)
session_id = parsed["session_id"]
stop_hook_active = parsed["stop_hook_active"]
@@ -659,6 +681,9 @@ def hook_stop(data: dict, harness: str):
def hook_session_start(data: dict, harness: str):
"""Session start hook: initialize session tracking state."""
if not _palace_root_exists():
_output({})
return
parsed = _parse_harness_input(data, harness)
session_id = parsed["session_id"]
@@ -673,6 +698,9 @@ def hook_session_start(data: dict, harness: str):
def hook_precompact(data: dict, harness: str):
"""Precompact hook: mine transcript synchronously, then allow compaction."""
if not _palace_root_exists():
_output({})
return
parsed = _parse_harness_input(data, harness)
session_id = parsed["session_id"]
transcript_path = parsed["transcript_path"]
+9
View File
@@ -171,6 +171,15 @@ class KnowledgeGraph:
add_triple("Max", "does", "swimming", valid_from="2025-01-01")
add_triple("Alice", "worried_about", "Max injury", valid_from="2026-01", valid_to="2026-02")
"""
# Reject inverted intervals: a triple with valid_to < valid_from
# would never satisfy `valid_from <= as_of AND valid_to >= as_of`,
# so it would be invisible to every query — silently corrupt.
if valid_from is not None and valid_to is not None and valid_to < valid_from:
raise ValueError(
f"valid_to={valid_to!r} is before valid_from={valid_from!r}; "
"an inverted interval would be invisible to every KG query"
)
sub_id = self._entity_id(subject)
obj_id = self._entity_id(obj)
pred = predicate.lower().replace(" ", "_")
+6 -2
View File
@@ -124,6 +124,8 @@ class Layer1:
# Score each drawer: prefer high importance, recent filing
scored = []
for doc, meta in zip(docs, metas):
meta = meta or {}
doc = doc or ""
importance = 3
# Try multiple metadata keys that might carry weight info
for key in ("importance", "emotional_weight", "weight"):
@@ -155,7 +157,7 @@ class Layer1:
lines.append(room_line)
total_len += len(room_line)
for imp, meta, doc in entries:
for _imp, meta, doc in entries:
source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""
# Truncate doc to keep L1 compact
@@ -222,6 +224,8 @@ class Layer2:
lines = [f"## L2 — ON-DEMAND ({len(docs)} drawers)"]
for doc, meta in zip(docs[:n_results], metas[:n_results]):
meta = meta or {}
doc = doc or ""
room_name = meta.get("room", "?")
source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""
snippet = doc.strip().replace("\n", " ")
@@ -283,7 +287,7 @@ class Layer3:
for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists), 1):
meta = meta or {}
doc = doc or ""
similarity = round(1 - dist, 3)
similarity = round(max(0.0, 1 - dist), 3)
wing_name = meta.get("wing", "?")
room_name = meta.get("room", "?")
source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""
+276 -99
View File
@@ -46,6 +46,8 @@ import argparse # noqa: E402 (deferred until after stdio protection above)
import json # noqa: E402
import logging # noqa: E402
import hashlib # noqa: E402
import sqlite3 # noqa: E402
import threading # noqa: E402
import time # noqa: E402
from datetime import date, datetime # noqa: E402
from pathlib import Path # noqa: E402
@@ -55,6 +57,7 @@ from .config import ( # noqa: E402
sanitize_kg_value,
sanitize_name,
sanitize_content,
sanitize_iso_date,
)
from .version import __version__ # noqa: E402
from chromadb.errors import NotFoundError as _ChromaNotFoundError # noqa: E402
@@ -78,7 +81,7 @@ from .palace_graph import ( # noqa: E402
follow_tunnels,
)
from .knowledge_graph import KnowledgeGraph # noqa: E402
from .knowledge_graph import KnowledgeGraph, DEFAULT_KG_PATH # noqa: E402
logging.basicConfig(level=logging.INFO, format="%(message)s", stream=sys.stderr)
logger = logging.getLogger("mempalace_mcp")
@@ -103,12 +106,61 @@ if _args.palace:
os.environ["MEMPALACE_PALACE_PATH"] = os.path.abspath(_args.palace)
_config = MempalaceConfig()
# Only override KG path when --palace is explicitly provided; otherwise use
# KnowledgeGraph's default (~/.mempalace/knowledge_graph.sqlite3).
if _args.palace:
_kg = KnowledgeGraph(db_path=os.path.join(_config.palace_path, "knowledge_graph.sqlite3"))
else:
_kg = KnowledgeGraph()
_kg_by_path: dict[str, KnowledgeGraph] = {}
_kg_cache_lock = threading.Lock()
_palace_flag_given: bool = bool(_args.palace)
def _resolve_kg_path() -> str:
if _palace_flag_given:
return os.path.join(_config.palace_path, "knowledge_graph.sqlite3")
return DEFAULT_KG_PATH
def _get_kg() -> KnowledgeGraph:
path = os.path.abspath(_resolve_kg_path())
kg = _kg_by_path.get(path)
if kg is not None:
return kg
with _kg_cache_lock:
kg = _kg_by_path.get(path)
if kg is None:
kg = KnowledgeGraph(db_path=path)
_kg_by_path[path] = kg
return kg
def _call_kg(op):
"""Run ``op(kg)`` against the cached KG with one-shot retry on close.
Race we're guarding against: a handler grabs ``kg = _get_kg()`` and is
about to call ``kg.add_triple(...)`` when ``tool_reconnect`` fires on
another thread, drains ``_kg_by_path``, and closes the underlying
sqlite3.Connection. The handler's call then raises
``sqlite3.ProgrammingError: Cannot operate on a closed database`` and
bubbles up as a -32000 to the MCP client even though the user just
asked for a reconnect.
Catch that single class of error, evict the stale entry from the
cache (only if it still points at the closed instance — another
thread may have already replaced it), and try once more with a fresh
KG. Beyond one retry give up: a second close means we're losing a
sustained race we won't win in this loop, and a hung loop is worse
than a clear failure surface.
"""
for attempt in range(2):
kg = _get_kg()
try:
return op(kg)
except sqlite3.ProgrammingError:
if attempt == 0:
path = os.path.abspath(_resolve_kg_path())
with _kg_cache_lock:
if _kg_by_path.get(path) is kg:
_kg_by_path.pop(path, None)
continue
raise
_client_cache = None
@@ -141,7 +193,7 @@ def _refresh_vector_disabled_flag() -> None:
"""
global _vector_disabled, _vector_disabled_reason, _vector_capacity_status
try:
info = hnsw_capacity_status(_config.palace_path, "mempalace_drawers")
info = hnsw_capacity_status(_config.palace_path, _config.collection_name)
except Exception:
logger.debug("HNSW capacity probe raised", exc_info=True)
return
@@ -274,68 +326,94 @@ def _get_client():
def _get_collection(create=False):
"""Return the ChromaDB collection, caching the client between calls."""
global _collection_cache, _metadata_cache, _metadata_cache_time
try:
client = _get_client()
# ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
# collection but not the EF *instance/configuration*. So a reader or
# writer that omits ``embedding_function=`` silently gets chromadb's
# built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
# one we spoof in ``mempalace.embedding`` (both report ``"default"``,
# the identity check passes), but the *provider list* is chromadb's
# default rather than the user's resolved device. On bleeding-edge
# interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
# that default provider selection can SIGSEGV the host process on
# first ``col.add()``. The miner / Stop hook ingest path avoids this
# because it routes through ``ChromaBackend.get_collection``, which
# resolves the EF via ``ChromaBackend._resolve_embedding_function``;
# the MCP server bypassed that abstraction. Resolve the EF inside the
# branches that actually open a collection so warm-cache reads stay
# zero-cost. Reuse the backend helper so the two call sites can't
# drift on logging or fallback semantics.
if create:
ef = ChromaBackend._resolve_embedding_function()
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
# hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
# HNSW insert path, which has a race in repairConnectionsForUpdate /
# addPoint (see issues #974, #965). Set via metadata on fresh
# collections and re-applied via _pin_hnsw_threads() for legacy
# palaces whose collections were created before this fix (the
# runtime config does not persist cross-process in chromadb 1.5.x,
# so the retrofit runs every time _get_collection opens a cache).
#
# ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
# is called with metadata that differs from what's stored. The split
# below skips the metadata-comparison codepath for existing
# collections, mirroring the backend-layer fix from #1262.
try:
"""Return the ChromaDB collection, caching the client between calls.
On failure, log the exception and retry once after clearing the client
and collection caches. Tools were silently returning ``None`` when a
cached client/collection went stale — typically after the chromadb
rust bindings invalidated a handle following an out-of-band write —
leaving the LLM with no diagnostic and no recovery path. The retry
forces ``_get_client()`` to rebuild from scratch (which re-runs
``quarantine_stale_hnsw`` per #1322), so the second attempt heals the
common stale-handle / stale-HNSW case automatically.
"""
global _client_cache, _collection_cache, _metadata_cache, _metadata_cache_time
for attempt in range(2):
try:
client = _get_client()
# ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
# collection but not the EF *instance/configuration*. So a reader or
# writer that omits ``embedding_function=`` silently gets chromadb's
# built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
# one we spoof in ``mempalace.embedding`` (both report ``"default"``,
# the identity check passes), but the *provider list* is chromadb's
# default rather than the user's resolved device. On bleeding-edge
# interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
# that default provider selection can SIGSEGV the host process on
# first ``col.add()``. The miner / Stop hook ingest path avoids this
# because it routes through ``ChromaBackend.get_collection``, which
# resolves the EF via ``ChromaBackend._resolve_embedding_function``;
# the MCP server bypassed that abstraction. Resolve the EF inside the
# branches that actually open a collection so warm-cache reads stay
# zero-cost. Reuse the backend helper so the two call sites can't
# drift on logging or fallback semantics.
if create:
ef = ChromaBackend._resolve_embedding_function()
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
# hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
# HNSW insert path, which has a race in repairConnectionsForUpdate /
# addPoint (see issues #974, #965). Set via metadata on fresh
# collections and re-applied via _pin_hnsw_threads() for legacy
# palaces whose collections were created before this fix (the
# runtime config does not persist cross-process in chromadb 1.5.x,
# so the retrofit runs every time _get_collection opens a cache).
#
# ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
# is called with metadata that differs from what's stored. The split
# below skips the metadata-comparison codepath for existing
# collections, mirroring the backend-layer fix from #1262.
try:
raw = client.get_collection(_config.collection_name, **ef_kwargs)
except _ChromaNotFoundError:
raw = client.create_collection(
_config.collection_name,
metadata={
"hnsw:space": "cosine",
"hnsw:num_threads": 1,
**_HNSW_BLOAT_GUARD,
},
**ef_kwargs,
)
_pin_hnsw_threads(raw)
_collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
_metadata_cache = None
_metadata_cache_time = 0
elif _collection_cache is None:
ef = ChromaBackend._resolve_embedding_function()
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
raw = client.get_collection(_config.collection_name, **ef_kwargs)
except _ChromaNotFoundError:
raw = client.create_collection(
_config.collection_name,
metadata={
"hnsw:space": "cosine",
"hnsw:num_threads": 1,
**_HNSW_BLOAT_GUARD,
},
**ef_kwargs,
)
_pin_hnsw_threads(raw)
_collection_cache = ChromaCollection(raw)
_metadata_cache = None
_metadata_cache_time = 0
elif _collection_cache is None:
ef = ChromaBackend._resolve_embedding_function()
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
raw = client.get_collection(_config.collection_name, **ef_kwargs)
_pin_hnsw_threads(raw)
_collection_cache = ChromaCollection(raw)
_metadata_cache = None
_metadata_cache_time = 0
return _collection_cache
except Exception:
return None
_pin_hnsw_threads(raw)
_collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
_metadata_cache = None
_metadata_cache_time = 0
return _collection_cache
except Exception:
logger.exception(
"_get_collection attempt %d/2 failed (palace=%s, create=%s)",
attempt + 1,
_config.palace_path,
create,
)
if attempt == 0:
# Reset all caches so the next attempt forces _get_client()
# to rebuild the chromadb client from scratch — that path
# re-runs quarantine_stale_hnsw (#1322) and reopens the
# collection cleanly, healing the common stale-handle case.
_client_cache = None
_collection_cache = None
_metadata_cache = None
_metadata_cache_time = 0
return None
def _no_palace():
@@ -412,6 +490,7 @@ def _tool_status_via_sqlite() -> dict:
db_path = os.path.join(_config.palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path):
return _no_palace()
collection_name = _config.collection_name
wings: dict = {}
rooms: dict = {}
@@ -425,8 +504,9 @@ def _tool_status_via_sqlite() -> dict:
FROM embeddings e
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = 'mempalace_drawers'
"""
WHERE c.name = ?
""",
(collection_name,),
).fetchone()
total = int(row[0]) if row and row[0] is not None else 0
for key, target in (("wing", wings), ("room", rooms)):
@@ -437,12 +517,12 @@ def _tool_status_via_sqlite() -> dict:
JOIN embeddings e ON em.id = e.id
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = 'mempalace_drawers'
WHERE c.name = ?
AND em.key = ?
AND em.string_value IS NOT NULL
GROUP BY em.string_value
""",
(key,),
(collection_name, key),
):
target[value] = count
finally:
@@ -642,6 +722,7 @@ def tool_search(
n_results=limit,
max_distance=dist,
vector_disabled=_vector_disabled,
collection_name=_config.collection_name,
)
if _vector_disabled:
result["vector_disabled"] = True
@@ -688,10 +769,12 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
if results["ids"] and results["ids"][0]:
for i, drawer_id in enumerate(results["ids"][0]):
dist = results["distances"][0][i]
similarity = round(1 - dist, 3)
similarity = round(max(0.0, 1 - dist), 3)
if similarity >= threshold:
meta = results["metadatas"][0][i]
doc = results["documents"][0][i]
# Chroma 1.5.x can return None for partially-flushed rows;
# coerce to empty sentinels so downstream .get() is safe.
meta = results["metadatas"][0][i] or {}
doc = results["documents"][0][i] or ""
duplicates.append(
{
"id": drawer_id,
@@ -842,11 +925,11 @@ def tool_add_drawer(
# Idempotency: if the deterministic ID already exists, return success as a no-op.
try:
existing = col.get(ids=[drawer_id])
if existing and existing["ids"]:
existing = col.get(ids=[drawer_id], include=[])
if existing.ids:
return {"success": True, "reason": "already_exists", "drawer_id": drawer_id}
except Exception:
pass
logger.debug("Idempotency pre-check failed for %s", drawer_id, exc_info=True)
try:
col.upsert(
@@ -863,6 +946,12 @@ def tool_add_drawer(
}
],
)
inserted = col.get(ids=[drawer_id], include=[])
if not inserted.ids:
raise RuntimeError(
"Drawer write was acknowledged but the new ID is not readable. "
"The palace index may be stale; run reconnect or repair."
)
_metadata_cache = None
logger.info(f"Filed drawer: {drawer_id}{wing}/{room}")
return {"success": True, "drawer_id": drawer_id, "wing": wing, "room": room}
@@ -961,6 +1050,13 @@ def tool_list_drawers(wing: str = None, room: str = None, limit: int = 20, offse
kwargs["where"] = where
result = col.get(**kwargs)
# Compute total matching drawers for pagination.
if where:
total_result = col.get(where=where, include=[])
total = len(total_result["ids"])
else:
total = col.count()
drawers = []
for i, did in enumerate(result["ids"]):
meta = result["metadatas"][i]
@@ -975,6 +1071,7 @@ def tool_list_drawers(wing: str = None, room: str = None, limit: int = 20, offse
)
return {
"drawers": drawers,
"total": total,
"count": len(drawers),
"offset": offset,
"limit": limit,
@@ -1059,11 +1156,12 @@ def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
"""Query the knowledge graph for an entity's relationships."""
try:
entity = sanitize_kg_value(entity, "entity")
as_of = sanitize_iso_date(as_of, "as_of")
except ValueError as e:
return {"error": str(e)}
if direction not in ("outgoing", "incoming", "both"):
return {"error": "direction must be 'outgoing', 'incoming', or 'both'"}
results = _kg.query_entity(entity, as_of=as_of, direction=direction)
results = _call_kg(lambda kg: kg.query_entity(entity, as_of=as_of, direction=direction))
return {"entity": entity, "as_of": as_of, "facts": results, "count": len(results)}
@@ -1092,6 +1190,7 @@ def tool_kg_add(
subject = sanitize_kg_value(subject, "subject")
predicate = sanitize_name(predicate, "predicate")
object = sanitize_kg_value(object, "object")
valid_from = sanitize_iso_date(valid_from, "valid_from")
except ValueError as e:
return {"success": False, "error": str(e)}
@@ -1108,15 +1207,17 @@ def tool_kg_add(
"source_drawer_id": source_drawer_id,
},
)
triple_id = _kg.add_triple(
subject,
predicate,
object,
valid_from=valid_from,
valid_to=valid_to,
source_closet=source_closet,
source_file=source_file,
source_drawer_id=source_drawer_id,
triple_id = _call_kg(
lambda kg: kg.add_triple(
subject,
predicate,
object,
valid_from=valid_from,
valid_to=valid_to,
source_closet=source_closet,
source_file=source_file,
source_drawer_id=source_drawer_id,
)
)
return {"success": True, "triple_id": triple_id, "fact": f"{subject}{predicate}{object}"}
@@ -1135,6 +1236,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
subject = sanitize_kg_value(subject, "subject")
predicate = sanitize_name(predicate, "predicate")
object = sanitize_kg_value(object, "object")
ended = sanitize_iso_date(ended, "ended")
except ValueError as e:
return {"success": False, "error": str(e)}
resolved_ended = ended or date.today().isoformat()
@@ -1147,7 +1249,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
"ended": resolved_ended,
},
)
_kg.invalidate(subject, predicate, object, ended=resolved_ended)
_call_kg(lambda kg: kg.invalidate(subject, predicate, object, ended=resolved_ended))
return {
"success": True,
"fact": f"{subject}{predicate}{object}",
@@ -1162,13 +1264,13 @@ def tool_kg_timeline(entity: str = None):
entity = sanitize_kg_value(entity, "entity")
except ValueError as e:
return {"error": str(e)}
results = _kg.timeline(entity)
results = _call_kg(lambda kg: kg.timeline(entity))
return {"entity": entity or "all", "timeline": results, "count": len(results)}
def tool_kg_stats():
"""Knowledge graph overview: entities, triples, relationship types."""
return _kg.stats()
return _call_kg(lambda kg: kg.stats())
# ==================== AGENT DIARY ====================
@@ -1351,7 +1453,7 @@ def tool_hook_settings(silent_save: bool = None, desktop_toast: bool = None):
try:
config = MempalaceConfig()
except Exception:
pass
logger.debug("Could not re-read config after update", exc_info=True)
result = {
"success": True,
@@ -1400,10 +1502,11 @@ def tool_memories_filed_away():
def tool_reconnect():
"""Force the MCP server to drop the cached ChromaDB collection and reconnect.
"""Force the MCP server to drop cached ChromaDB + KnowledgeGraph state.
Use after external scripts or CLI commands modify the palace database
directly, which can leave the in-memory HNSW index stale.
or replace ``knowledge_graph.sqlite3`` directly, which can leave the
in-memory HNSW index stale or pin a closed-on-disk SQLite connection.
"""
global \
_client_cache, \
@@ -1412,6 +1515,30 @@ def tool_reconnect():
_palace_db_mtime, \
_vector_disabled, \
_vector_disabled_reason
from . import palace as palace_module
close_errors = []
try:
palace_module._DEFAULT_BACKEND.close_palace(_config.palace_path)
except Exception as exc:
logger.debug("Failed to close shared palace backend during reconnect", exc_info=True)
close_errors.append(f"backend close_palace failed: {exc}")
try:
from chromadb.api.client import SharedSystemClient
clear_system_cache = getattr(SharedSystemClient, "clear_system_cache", None)
if callable(clear_system_cache):
clear_system_cache()
else:
logger.debug(
"SharedSystemClient.clear_system_cache is unavailable; skipping shared Chroma cache clear during reconnect"
)
except Exception as exc:
logger.debug(
"Failed to clear Chroma shared system cache during reconnect",
exc_info=True,
)
close_errors.append(f"shared Chroma cache clear failed: {exc}")
_client_cache = None
_collection_cache = None
_palace_db_inode = 0
@@ -1421,15 +1548,36 @@ def tool_reconnect():
# still applies after the reconnect.
_vector_disabled = False
_vector_disabled_reason = ""
# Drain the per-path KnowledgeGraph cache so a replaced sqlite file is
# reopened on the next tool call rather than served from a stale handle.
with _kg_cache_lock:
for kg in _kg_by_path.values():
try:
kg.close()
except Exception:
pass
_kg_by_path.clear()
try:
col = _get_collection()
if col is None:
return {
result = {
"success": False,
"message": "No palace found after reconnect",
"drawers": 0,
"vector_disabled": _vector_disabled,
}
if close_errors:
result["error"] = "; ".join(close_errors)
return result
if close_errors:
return {
"success": False,
"message": "Reconnect reopened the palace but failed to fully reset cached handles",
"drawers": col.count(),
"vector_disabled": _vector_disabled,
"vector_disabled_reason": _vector_disabled_reason,
"error": "; ".join(close_errors),
}
return {
"success": True,
"message": "Reconnected to palace",
@@ -1750,7 +1898,7 @@ TOOLS = {
"handler": tool_get_drawer,
},
"mempalace_list_drawers": {
"description": "List drawers with pagination. Optional wing/room filter. Returns IDs, wings, rooms, and content previews.",
"description": "List drawers with pagination. Optional wing/room filter. Returns IDs, wings, rooms, content previews, and total matching count for pagination.",
"input_schema": {
"type": "object",
"properties": {
@@ -1891,6 +2039,12 @@ SUPPORTED_PROTOCOL_VERSIONS = [
def handle_request(request):
if not isinstance(request, dict):
return {
"jsonrpc": "2.0",
"id": None,
"error": {"code": -32600, "message": "Invalid Request"},
}
method = request.get("method") or ""
params = request.get("params") or {}
req_id = request.get("id")
@@ -1928,6 +2082,15 @@ def handle_request(request):
},
}
elif method == "tools/call":
if not isinstance(params, dict) or "name" not in params:
return {
"jsonrpc": "2.0",
"id": req_id,
"error": {
"code": -32602,
"message": "Invalid params: 'name' is required for tools/call",
},
}
tool_name = params.get("name")
tool_args = params.get("arguments") or {}
if tool_name not in TOOLS:
@@ -1976,7 +2139,11 @@ def handle_request(request):
return {
"jsonrpc": "2.0",
"id": req_id,
"result": {"content": [{"type": "text", "text": json.dumps(result, indent=2)}]},
"result": {
"content": [
{"type": "text", "text": json.dumps(result, indent=2, ensure_ascii=False)}
]
},
}
except Exception:
logger.exception(f"Tool error in {tool_name}")
@@ -2011,6 +2178,16 @@ def _restore_stdout():
def main():
_restore_stdout()
# Force UTF-8 on stdio. MCP JSON-RPC is UTF-8, but Python on Windows
# defaults stdin/stdout to the system codepage (e.g. cp1251), which
# corrupts non-ASCII payloads and surfaces as generic -32000 errors on
# Cyrillic/CJK content. See PEP 540.
for stream in (sys.stdin, sys.stdout):
if hasattr(stream, "reconfigure"):
try:
stream.reconfigure(encoding="utf-8", errors="replace")
except (AttributeError, OSError):
pass
logger.info("MemPalace MCP Server starting...")
# Pre-flight: probe HNSW capacity before any tool call so the warning
# is visible at startup rather than on first use (#1222). Pure
@@ -2027,7 +2204,7 @@ def main():
request = json.loads(line)
response = handle_request(request)
if response is not None:
sys.stdout.write(json.dumps(response) + "\n")
sys.stdout.write(json.dumps(response, ensure_ascii=False) + "\n")
sys.stdout.flush()
except KeyboardInterrupt:
break
+67 -6
View File
@@ -22,6 +22,7 @@ import errno
import os
import shutil
import sqlite3
import uuid
from collections import defaultdict
from datetime import datetime
@@ -155,6 +156,55 @@ def confirm_destructive_action(
return True
def _result_ids(result) -> list:
"""Return ids from either the backend typed result or raw Chroma dict."""
if isinstance(result, dict):
return list(result.get("ids") or [])
return list(getattr(result, "ids", []) or [])
def collection_write_roundtrip_works(col) -> bool:
"""Return True only if the collection can upsert, read, and delete.
Some ChromaDB 0.6.x -> 1.5.x migrated collections remain readable while
writes and deletes silently no-op. A plain ``count()`` probe misses that
failure mode, so migrate must verify an actual write round-trip before
deciding that no rebuild is needed.
"""
probe_id = f"_mempalace_migrate_probe_{uuid.uuid4().hex}"
probe_doc = "mempalace migrate write round-trip probe"
probe_meta = {
"wing": "_mempalace_probe",
"room": "_mempalace_probe",
"source_file": "mempalace_migrate_probe",
"chunk_index": 0,
}
try:
col.upsert(
ids=[probe_id],
documents=[probe_doc],
metadatas=[probe_meta],
)
after_upsert = col.get(ids=[probe_id], include=[])
if probe_id not in _result_ids(after_upsert):
return False
col.delete(ids=[probe_id])
after_delete = col.get(ids=[probe_id], include=[])
if probe_id in _result_ids(after_delete):
return False
return True
except Exception:
return False
def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
"""Migrate a palace to the currently installed ChromaDB version."""
from .backends.chroma import ChromaBackend
@@ -179,16 +229,27 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
print(f" Source: ChromaDB {source_version}")
print(f" Target: ChromaDB {target_version}")
# Try reading with current chromadb first
# Try reading and writing with current chromadb first.
#
# A plain count() is not enough: some 0.6.x -> 1.5.x migrated collections
# are readable but silently drop upsert/delete operations. In that state,
# migrate must rebuild from SQLite instead of returning "No migration needed."
try:
col = ChromaBackend().get_collection(palace_path, "mempalace_drawers")
count = col.count()
print(f"\n Palace is already readable by chromadb {target_version}.")
print(f" {count} drawers found. No migration needed.")
return True
if collection_write_roundtrip_works(col):
print(f"\n Palace is already readable and writable by chromadb {target_version}.")
print(f" {count} drawers found. No migration needed.")
return True
print(
f"\n Palace is readable by chromadb {target_version}, but write/delete verification failed."
)
print(" Rebuilding from SQLite to restore native write/delete behavior...")
except Exception:
print(f"\n Palace is NOT readable by chromadb {target_version}.")
print(" Extracting from SQLite directly...")
print(f"\n Palace is NOT readable by chromadb {target_version}.")
print(" Extracting from SQLite directly...")
# Extract all drawers via raw SQL
drawers = extract_drawers_from_sqlite(db_path)
+38 -1
View File
@@ -12,6 +12,7 @@ import sys
import shlex
import hashlib
import fnmatch
import logging
from pathlib import Path
from datetime import datetime
from collections import defaultdict
@@ -31,6 +32,8 @@ from .palace import (
upsert_closet_lines,
)
logger = logging.getLogger("mempalace_mcp")
READABLE_EXTENSIONS = {
".txt",
".md",
@@ -63,6 +66,8 @@ SKIP_FILENAMES = {
"mempal.yml",
".gitignore",
"package-lock.json",
"pnpm-lock.yaml",
"yarn.lock",
}
CHUNK_SIZE = 800 # chars per drawer
@@ -70,6 +75,13 @@ CHUNK_OVERLAP = 100 # overlap between chunks
MIN_CHUNK_SIZE = 50 # skip tiny chunks
DRAWER_UPSERT_BATCH_SIZE = 1000
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
# A single file producing more chunks than this is almost always a generated
# artifact (CSV/JSON dump, lockfile not in SKIP_FILENAMES, etc.). Embedding
# thousands of chunks from one file in one batch has triggered ONNX runtime
# `bad allocation` errors on Windows (#1296). The cap is conservative: a
# 500-chunk file at CHUNK_SIZE=800 is ~400 KB of source, which covers most
# legitimate hand-written content while bounding the worst-case batch.
MAX_CHUNKS_PER_FILE = 500
# Long Claude Code sessions and large transcript exports routinely exceed
# 10 MB. The cap exists as a defensive rail against pathological binary
# files, not as a limit on legitimate text. Per-drawer size is bounded
@@ -822,6 +834,13 @@ def process_file(
room = detect_room(filepath, content, rooms, project_path)
chunks = chunk_text(content, source_file)
if len(chunks) > MAX_CHUNKS_PER_FILE:
print(
f" ! [skip] {filepath.name[:50]:50} produced {len(chunks)} chunks "
f"(> {MAX_CHUNKS_PER_FILE}); add to SKIP_FILENAMES or .gitignore"
)
return 0, room
if dry_run:
print(f" [DRY RUN] {filepath.name} -> room:{room} ({len(chunks)} drawers)")
return len(chunks), room
@@ -842,7 +861,7 @@ def process_file(
try:
collection.delete(where={"source_file": source_file})
except Exception:
pass
logger.debug("Stale-drawer purge failed for %s", source_file, exc_info=True)
# Batch chunks into bounded upserts so the embedding model sees many
# chunks per forward pass without building one huge Chroma/SQLite
@@ -1164,6 +1183,24 @@ def _mine_impl(
"already-filed drawers are\n upserted idempotently and will not duplicate.\n"
)
sys.exit(130)
except Exception as exc:
# Without this, an arbitrary exception (ONNX bad_alloc, chromadb HNSW
# error, OS fault) propagates and the process exits with no completion
# banner — the operator sees only the final progress line and assumes
# the mine succeeded (#1296). Print the partial-progress summary the
# way we do for KeyboardInterrupt, then re-raise so the original
# traceback still surfaces and the exit code is non-zero.
print("\n\n Mine aborted by exception.")
print(f" files_processed: {files_processed}/{len(files)}")
print(f" drawers_filed: {total_drawers}")
print(f" last_file: {last_file or '<none>'}")
print(f" error: {type(exc).__name__}: {exc}")
print(
f"\n Re-run `mempalace mine {shlex.quote(project_dir)}` after addressing "
"the cause — already-filed\n drawers are upserted idempotently and will "
"not duplicate.\n"
)
raise
finally:
# Clean up the hooks-side PID lock if it points at us. Stale
# entries already pass _pid_alive() == False on POSIX, but
+2 -2
View File
@@ -118,14 +118,14 @@ def normalize(filepath: str) -> str:
try:
file_size = os.path.getsize(filepath)
except OSError as e:
raise IOError(f"Could not read {filepath}: {e}")
raise IOError(f"Could not read {filepath}: {e}") from e
if file_size > 500 * 1024 * 1024: # 500 MB safety limit
raise IOError(f"File too large ({file_size // (1024 * 1024)} MB): {filepath}")
try:
with open(filepath, "r", encoding="utf-8", errors="replace") as f:
content = f.read()
except OSError as e:
raise IOError(f"Could not read {filepath}: {e}")
raise IOError(f"Could not read {filepath}: {e}") from e
if not content.strip():
return content
+69 -4
View File
@@ -6,11 +6,16 @@ Consolidates collection access patterns used by both miners and the MCP server.
import contextlib
import hashlib
import logging
import os
import re
import threading
from typing import Optional
from .backends.chroma import ChromaBackend
logger = logging.getLogger("mempalace_mcp")
SKIP_DIRS = {
".git",
"node_modules",
@@ -52,10 +57,14 @@ NORMALIZE_VERSION = 2
def get_collection(
palace_path: str,
collection_name: str = "mempalace_drawers",
collection_name: Optional[str] = None,
create: bool = True,
):
"""Get the palace collection through the backend layer."""
if collection_name is None:
from .config import get_configured_collection_name
collection_name = get_configured_collection_name()
return _DEFAULT_BACKEND.get_collection(
palace_path,
collection_name=collection_name,
@@ -228,7 +237,7 @@ def purge_file_closets(closets_col, source_file: str) -> None:
try:
closets_col.delete(where={"source_file": source_file})
except Exception:
pass
logger.debug("Closet purge failed for %s", source_file, exc_info=True)
def upsert_closet_lines(closets_col, closet_id_base, lines, metadata):
@@ -306,7 +315,7 @@ def mine_lock(source_file: str):
fcntl.flock(lf, fcntl.LOCK_UN)
except Exception:
pass
logger.debug("Mine-lock release failed", exc_info=True)
lf.close()
@@ -314,6 +323,47 @@ class MineAlreadyRunning(RuntimeError):
"""Raised when another `mempalace mine` already holds the per-palace lock."""
# Per-thread record of palaces this thread already holds the lock for. Used by
# `mine_palace_lock` to short-circuit re-entrant acquisition from the same
# thread (e.g. miner.mine() acquires the outer lock then calls
# ChromaCollection.upsert which now also tries to acquire). Without this guard
# the inner call would block on its own outer flock (Linux fcntl locks are per
# open file description, so a same-thread second open of the lock file is a
# distinct lock and self-deadlocks).
#
# The holder set is tagged with ``pid`` so that a forked child does NOT
# inherit re-entrant credit from its parent: the OS-level flock IS NOT
# inherited as a "we hold it" semantically — the child must reacquire — but
# Python's ``threading.local`` IS inherited across fork. The pid check
# clears stale state so a forked child correctly hits the fcntl path.
_palace_lock_holders = threading.local()
def _holder_state():
"""Return the per-thread (pid, keys) record, refreshing after fork."""
keys = getattr(_palace_lock_holders, "keys", None)
pid = getattr(_palace_lock_holders, "pid", None)
current_pid = os.getpid()
if keys is None or pid != current_pid:
keys = set()
_palace_lock_holders.keys = keys
_palace_lock_holders.pid = current_pid
return keys
def _held_by_this_thread(lock_key: str) -> bool:
"""Return True if this thread already holds ``mine_palace_lock`` for ``lock_key``."""
return lock_key in _holder_state()
def _mark_held(lock_key: str) -> None:
_holder_state().add(lock_key)
def _mark_released(lock_key: str) -> None:
_holder_state().discard(lock_key)
@contextlib.contextmanager
def mine_palace_lock(palace_path: str):
"""Per-palace non-blocking lock around the full `mine` pipeline.
@@ -338,6 +388,12 @@ def mine_palace_lock(palace_path: str):
Non-blocking: if another `mine` is already writing to this palace,
raise MineAlreadyRunning so the caller can exit cleanly instead of
piling up as a waiting worker.
Re-entrant: if the current thread already holds the lock for the same
palace, the context manager passes through without re-acquiring. This
lets ChromaCollection write methods (which acquire the lock themselves
to protect MCP/direct callers) compose with miner.mine() (which holds
the outer lock for the entire mine pipeline) without self-deadlock.
"""
lock_dir = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
os.makedirs(lock_dir, exist_ok=True)
@@ -346,6 +402,11 @@ def mine_palace_lock(palace_path: str):
palace_key = hashlib.sha256(lock_key_source.encode()).hexdigest()[:16]
lock_path = os.path.join(lock_dir, f"mine_palace_{palace_key}.lock")
if _held_by_this_thread(palace_key):
# Same thread already holds the lock for this palace — pass through.
yield
return
lf = open(lock_path, "w")
acquired = False
try:
@@ -369,7 +430,11 @@ def mine_palace_lock(palace_path: str):
raise MineAlreadyRunning(
f"another `mempalace mine` is already running against {resolved}"
) from exc
yield
_mark_held(palace_key)
try:
yield
finally:
_mark_released(palace_key)
finally:
if acquired:
try:
+1 -1
View File
@@ -575,7 +575,7 @@ def follow_tunnels(wing: str, room: str, col=None, config=None):
if did and did in drawer_map:
c["drawer_preview"] = drawer_map[did][:300]
except Exception:
pass
logger.debug("Drawer preview hydration failed", exc_info=True)
return connections
+596 -53
View File
@@ -34,13 +34,58 @@ import os
import shutil
import sqlite3
import time
from collections import defaultdict
from datetime import datetime
from typing import Optional
from typing import Iterator, Optional
from chromadb.errors import NotFoundError as ChromaNotFoundError
from .backends.chroma import ChromaBackend, hnsw_capacity_status
COLLECTION_NAME = "mempalace_drawers"
REPAIR_TEMP_COLLECTION = f"{COLLECTION_NAME}__repair_tmp"
# The closets collection (AAAK index layer) is intentionally fixed —
# closets reference drawer IDs by string and live alongside drawers in the
# same palace; renaming the closets collection per-deployment would break
# cross-palace AAAK lookups. Drawer collection name comes from config
# (see ``_recoverable_collections``).
CLOSETS_COLLECTION_NAME = "mempalace_closets"
def _drawers_collection_name() -> str:
"""Resolve the drawers collection name from user config, falling back
to the module default ``COLLECTION_NAME`` if config is unreadable.
Recovery flows must honor ``MempalaceConfig().collection_name`` so a
user with a non-default drawer collection (e.g. multi-palace setups)
rebuilds the right rows. Closets remain fixed — see
``CLOSETS_COLLECTION_NAME``.
"""
try:
from .config import MempalaceConfig
return MempalaceConfig().collection_name or COLLECTION_NAME
except Exception:
return COLLECTION_NAME
def _recoverable_collections() -> tuple[str, ...]:
"""Collections rebuilt by ``rebuild_from_sqlite``, in upsert order.
Drawers first (bulk data), then closets (AAAK index layer that
references drawer IDs by string in their documents — no
foreign-key validation, so ordering is informational, not
load-bearing).
"""
return (_drawers_collection_name(), CLOSETS_COLLECTION_NAME)
# Back-compat alias for callers that imported the constant. New code
# should call ``_recoverable_collections()`` so config changes are picked
# up at call time.
RECOVERABLE_COLLECTIONS = (COLLECTION_NAME, CLOSETS_COLLECTION_NAME)
def _get_palace_path():
@@ -83,7 +128,111 @@ def _paginate_ids(col, where=None):
return ids
def scan_palace(palace_path=None, only_wing=None):
def _extract_drawers(col, total: int, batch_size: int):
all_ids = []
all_docs = []
all_metas = []
offset = 0
while offset < total:
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
if not batch["ids"]:
break
all_ids.extend(batch["ids"])
all_docs.extend(batch["documents"])
all_metas.extend(batch["metadatas"])
offset += len(batch["ids"])
return all_ids, all_docs, all_metas
def _verify_collection_count(col, expected: int, label: str) -> None:
actual = col.count()
if actual != expected:
raise RuntimeError(f"{label} count mismatch: expected {expected}, got {actual}")
def _is_missing_collection_value_error(exc: ValueError) -> bool:
message = str(exc).lower()
return "does not exist" in message or "not found" in message
def _delete_collection_if_exists(backend, palace_path: str, collection_name: str) -> None:
try:
backend.delete_collection(palace_path, collection_name)
except ValueError as exc:
if _is_missing_collection_value_error(exc):
return
raise
except (FileNotFoundError, ChromaNotFoundError):
return
class RebuildCollectionError(RuntimeError):
"""Raised when temp rebuild fails, carrying whether the live swap happened."""
def __init__(self, message: str, *, live_replaced: bool):
super().__init__(message)
self.live_replaced = live_replaced
def _rebuild_collection_via_temp(
backend,
palace_path: str,
all_ids,
all_docs,
all_metas,
batch_size: int,
collection_name: Optional[str] = None,
progress=print,
) -> int:
expected = len(all_ids)
collection_name = collection_name or _drawers_collection_name()
temp_name = f"{collection_name}__repair_tmp"
live_replaced = False
try:
_delete_collection_if_exists(backend, palace_path, temp_name)
progress(f" Building temporary collection: {temp_name}")
temp_col = backend.create_collection(palace_path, temp_name)
staged = 0
for i in range(0, expected, batch_size):
batch_ids = all_ids[i : i + batch_size]
batch_docs = all_docs[i : i + batch_size]
batch_metas = all_metas[i : i + batch_size]
temp_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
staged += len(batch_ids)
progress(f" Staged {staged}/{expected} drawers...")
_verify_collection_count(temp_col, expected, "temporary rebuild")
progress(" Rebuilding live collection...")
backend.delete_collection(palace_path, collection_name)
live_replaced = True
new_col = backend.create_collection(palace_path, collection_name)
rebuilt = 0
for i in range(0, expected, batch_size):
batch_ids = all_ids[i : i + batch_size]
batch_docs = all_docs[i : i + batch_size]
batch_metas = all_metas[i : i + batch_size]
new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
rebuilt += len(batch_ids)
progress(f" Re-filed {rebuilt}/{expected} drawers...")
_verify_collection_count(new_col, expected, "rebuilt live collection")
try:
_delete_collection_if_exists(backend, palace_path, temp_name)
except Exception:
pass
return rebuilt
except Exception as exc:
try:
_delete_collection_if_exists(backend, palace_path, temp_name)
except Exception:
pass
raise RebuildCollectionError(str(exc), live_replaced=live_replaced) from exc
def scan_palace(palace_path=None, only_wing=None, collection_name: Optional[str] = None):
"""Scan the palace for corrupt/unfetchable IDs.
Probes in batches of 100, falls back to per-ID on failure.
@@ -92,14 +241,15 @@ def scan_palace(palace_path=None, only_wing=None):
Returns (good_set, bad_set).
"""
palace_path = palace_path or _get_palace_path()
collection_name = collection_name or _drawers_collection_name()
print(f"\n Palace: {palace_path}")
print(" Loading...")
col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
col = ChromaBackend().get_collection(palace_path, collection_name)
where = {"wing": only_wing} if only_wing else None
total = col.count()
print(f" Collection: {COLLECTION_NAME}, total: {total:,}")
print(f" Collection: {collection_name}, total: {total:,}")
if only_wing:
print(f" Scanning wing: {only_wing}")
@@ -160,9 +310,10 @@ def scan_palace(palace_path=None, only_wing=None):
return good_set, bad_set
def prune_corrupt(palace_path=None, confirm=False):
def prune_corrupt(palace_path=None, confirm=False, collection_name: Optional[str] = None):
"""Delete corrupt IDs listed in corrupt_ids.txt."""
palace_path = palace_path or _get_palace_path()
collection_name = collection_name or _drawers_collection_name()
bad_file = os.path.join(palace_path, "corrupt_ids.txt")
if not os.path.exists(bad_file):
@@ -178,7 +329,7 @@ def prune_corrupt(palace_path=None, confirm=False):
print(" Re-run with --confirm to actually delete.")
return
col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
col = ChromaBackend().get_collection(palace_path, collection_name)
before = col.count()
print(f" Collection size before: {before:,}")
@@ -232,7 +383,10 @@ class TruncationDetected(Exception):
def check_extraction_safety(
palace_path: str, extracted: int, confirm_truncation_ok: bool = False
palace_path: str,
extracted: int,
confirm_truncation_ok: bool = False,
collection_name: Optional[str] = None,
) -> None:
"""Cross-check that ``extracted`` matches the SQLite ground truth.
@@ -254,7 +408,8 @@ def check_extraction_safety(
if confirm_truncation_ok:
return
sqlite_count = sqlite_drawer_count(palace_path)
collection_name = collection_name or _drawers_collection_name()
sqlite_count = sqlite_drawer_count(palace_path, collection_name)
cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT
if sqlite_count is not None and sqlite_count > extracted:
@@ -290,7 +445,7 @@ def check_extraction_safety(
raise TruncationDetected(message, sqlite_count, extracted)
def sqlite_drawer_count(palace_path: str) -> "int | None":
def sqlite_drawer_count(palace_path: str, collection_name: Optional[str] = None) -> "int | None":
"""Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection.
Used as an independent ground-truth check against the chromadb
@@ -302,6 +457,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
drift, missing tables, locked file). Callers treat ``None`` as
"unknown" and fall back to the cap-detection check.
"""
collection_name = collection_name or _drawers_collection_name()
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.exists(sqlite_path):
return None
@@ -318,7 +474,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
JOIN collections c ON s.collection = c.id
WHERE c.name = ?
""",
(COLLECTION_NAME,),
(collection_name,),
).fetchone()
return int(row[0]) if row and row[0] is not None else None
finally:
@@ -361,14 +517,16 @@ def maybe_repair_poisoned_max_seq_id_before_rebuild(
if not poisoned:
return None
print("\n Detected poisoned max_seq_id rows before repair rebuild.")
print("\n Detected poisoned max_seq_id rows before repair rebuild.")
print(
" This can make writes report success while embeddings_queue grows "
" This can make writes report success while embeddings_queue grows "
"and embeddings stay static."
)
print(" Running the non-destructive max_seq_id repair instead of rebuilding " "the collection.")
print(
" Queued writes remain in chroma.sqlite3 for Chroma to drain after "
" Running the non-destructive max_seq_id repair instead of rebuilding " "the collection."
)
print(
" Queued writes remain in chroma.sqlite3 for Chroma to drain after "
"the bookmark is unpoisoned."
)
@@ -380,7 +538,11 @@ def maybe_repair_poisoned_max_seq_id_before_rebuild(
)
def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
def rebuild_index(
palace_path=None,
confirm_truncation_ok: bool = False,
collection_name: Optional[str] = None,
):
"""Rebuild the HNSW index from scratch.
1. Extract all drawers via ChromaDB get()
@@ -395,6 +557,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
(typically only a concern for palaces sized at exactly 10 000 rows).
"""
palace_path = palace_path or _get_palace_path()
collection_name = collection_name or _drawers_collection_name()
if not os.path.isdir(palace_path):
print(f"\n No palace found at {palace_path}")
@@ -414,7 +577,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
backend = ChromaBackend()
try:
col = backend.get_collection(palace_path, COLLECTION_NAME)
col = backend.get_collection(palace_path, collection_name)
total = col.count()
except Exception as e:
print(f" Error reading palace: {e}")
@@ -430,18 +593,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
# Extract all drawers in batches
print("\n Extracting drawers...")
batch_size = 5000
all_ids = []
all_docs = []
all_metas = []
offset = 0
while offset < total:
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
if not batch["ids"]:
break
all_ids.extend(batch["ids"])
all_docs.extend(batch["documents"])
all_metas.extend(batch["metadatas"])
offset += len(batch["ids"])
all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
print(f" Extracted {len(all_ids)} drawers")
# ── #1208 guard ──────────────────────────────────────────────────
@@ -449,7 +601,12 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
# short of the SQLite ground truth (or when extraction == chromadb
# default get() cap and the SQLite check couldn't run).
try:
check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok)
check_extraction_safety(
palace_path,
len(all_ids),
confirm_truncation_ok,
collection_name=collection_name,
)
except TruncationDetected as e:
print(e.message)
return
@@ -464,28 +621,34 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
# Rebuild with correct HNSW settings
print(" Rebuilding collection with hnsw:space=cosine...")
backend.delete_collection(palace_path, COLLECTION_NAME)
new_col = backend.create_collection(palace_path, COLLECTION_NAME)
filed = 0
try:
for i in range(0, len(all_ids), batch_size):
batch_ids = all_ids[i : i + batch_size]
batch_docs = all_docs[i : i + batch_size]
batch_metas = all_metas[i : i + batch_size]
new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
filed += len(batch_ids)
print(f" Re-filed {filed}/{len(all_ids)} drawers...")
except Exception as e:
filed = _rebuild_collection_via_temp(
backend,
palace_path,
all_ids,
all_docs,
all_metas,
batch_size,
collection_name=collection_name,
progress=print,
)
except RebuildCollectionError as e:
print(f"\n ERROR during rebuild: {e}")
print(f" Only {filed}/{len(all_ids)} drawers were re-filed.")
if os.path.exists(backup_path):
print(" Rebuild aborted before completion.")
if e.live_replaced and os.path.exists(backup_path):
print(f" Restoring from backup: {backup_path}")
backend.delete_collection(palace_path, COLLECTION_NAME)
shutil.copy2(backup_path, sqlite_path)
print(" Backup restored. Palace is back to pre-repair state.")
else:
try:
_close_chroma_handles(palace_path, backend=backend)
_delete_collection_if_exists(backend, palace_path, collection_name)
shutil.copy2(backup_path, sqlite_path)
print(" Backup restored. Palace is back to pre-repair state.")
except Exception as restore_error:
print(f" Backup restore failed: {restore_error}")
print(f" Manual restore required from: {backup_path}")
elif e.live_replaced:
print(" No backup available. Re-mine from source files to recover.")
else:
print(" Live collection was not replaced; leaving the original palace untouched.")
raise
print(f"\n Repair complete. {filed} drawers rebuilt.")
@@ -493,7 +656,380 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
print(f"\n{'=' * 55}\n")
def status(palace_path=None) -> dict:
class RebuildPartialError(Exception):
"""Raised when ``rebuild_from_sqlite`` fails partway through upserts.
Carries enough state for the user (or CLI) to recover: the
per-collection counts that succeeded, the collection that failed,
the dest path holding the partial palace, and the archive path
(when an in-place rebuild had moved the original aside). Re-raises
the underlying chromadb error as ``__cause__``.
"""
def __init__(
self,
message: str,
*,
partial_counts: dict[str, int],
failed_collection: str,
dest_palace: str,
archive_path: Optional[str],
):
super().__init__(message)
self.message = message
self.partial_counts = partial_counts
self.failed_collection = failed_collection
self.dest_palace = dest_palace
self.archive_path = archive_path
def _rebuild_one_collection(
*,
backend: ChromaBackend,
source_palace: str,
dest_palace: str,
collection_name: str,
batch_size: int,
archive_path: Optional[str],
counts_so_far: dict[str, int],
) -> int:
"""Stream rows for one collection from SQLite and upsert into a
freshly-created collection at ``dest_palace``. Returns rows
upserted. Raises :class:`RebuildPartialError` (with the underlying
chromadb exception as ``__cause__``) on any upsert failure so the
caller can stop the loop and print recovery instructions instead of
silently shipping a partial palace.
"""
ids: list[str] = []
docs: list[str] = []
metas: list[dict] = []
upserted = 0
col = None
def _flush() -> int:
nonlocal upserted
if not ids:
return upserted
col.upsert(ids=list(ids), documents=list(docs), metadatas=list(metas))
upserted += len(ids)
print(f" upserted {upserted}")
ids.clear()
docs.clear()
metas.clear()
return upserted
try:
# ``create_collection`` lives inside the try so a Chroma-side
# "Collection already exists" failure (which can happen when the
# process-wide System cache still holds a pre-archive schema) is
# reported as a structured ``RebuildPartialError`` carrying
# ``archive_path`` — instead of an unstructured exception that
# strands the user without recovery instructions.
col = backend.create_collection(dest_palace, collection_name)
for emb_id, doc, meta in extract_via_sqlite(source_palace, collection_name):
ids.append(emb_id)
docs.append(doc or "")
# chromadb 1.5.x rejects None entries in the metadatas list
# but accepts empty dicts. Mempalace drawers always carry at
# least wing/room, so this branch is defensive — corruption
# in embedding_metadata could yield an emb_id with no rows.
metas.append(meta if meta else {})
if len(ids) >= batch_size:
_flush()
_flush()
except Exception as exc: # noqa: BLE001 — chromadb raises many shapes
partial = dict(counts_so_far)
partial[collection_name] = upserted
msg_parts = [
f"Upsert failed in collection {collection_name!r} after {upserted} rows: {exc!r}",
f"Partial palace left at: {dest_palace}",
]
if archive_path is not None:
msg_parts.append(f"Original palace archived at: {archive_path}")
msg_parts.append(
" Recover by removing the partial dest and re-running with "
f"--source {archive_path}"
)
else:
msg_parts.append(" Source palace is unchanged. Remove the partial dest and re-run.")
message = "\n ".join(msg_parts)
print(f"\n ERROR: {message}")
raise RebuildPartialError(
message,
partial_counts=partial,
failed_collection=collection_name,
dest_palace=dest_palace,
archive_path=archive_path,
) from exc
return upserted
def extract_via_sqlite(palace_path: str, collection_name: str) -> Iterator[tuple[str, str, dict]]:
"""Yield ``(embedding_id, document, metadata)`` for every row in
``collection_name``'s metadata segment by reading ``chroma.sqlite3``
directly.
Bypasses the chromadb client entirely — never opens a
``PersistentClient``, never imports hnswlib, never invokes the
HNSW segment writer. This is the recovery path for palaces where
``Collection.count()`` / ``Collection.get()`` raise ``InternalError``
because the compactor cannot apply WAL logs to the HNSW segment
(#1308). The drawer rows are still on disk in
``embeddings`` + ``embedding_metadata``; the corruption lives in the
on-disk index files, not the SQLite tables.
Resolution rule for chromadb's typed metadata columns: each
``embedding_metadata`` row stores its value in exactly one of
``string_value`` / ``int_value`` / ``float_value`` / ``bool_value``;
we pick the first non-NULL column in that order. Rows where every
typed column is NULL are dropped (chromadb never writes that shape).
The ``chroma:document`` key is removed from the metadata dict and
returned as the document; this matches how chromadb itself stores
``add(documents=...)``.
Silent on missing palace, missing ``chroma.sqlite3``, or unknown
collection name — yields nothing. Callers that need to distinguish
"empty collection" from "collection not present" should query
:func:`sqlite_drawer_count` first.
"""
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.isfile(sqlite_path):
return
conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
try:
seg_row = conn.execute(
"""
SELECT s.id FROM segments s
JOIN collections c ON s.collection = c.id
WHERE c.name = ? AND s.scope = 'METADATA'
""",
(collection_name,),
).fetchone()
if not seg_row:
return
segment_id = seg_row[0]
per_id: dict[str, dict] = defaultdict(dict)
order: list[str] = []
for emb_id, key, sv, iv, fv, bv in conn.execute(
"""
SELECT e.embedding_id, em.key, em.string_value, em.int_value,
em.float_value, em.bool_value
FROM embedding_metadata em
JOIN embeddings e ON em.id = e.id
WHERE e.segment_id = ?
ORDER BY em.id
""",
(segment_id,),
):
if emb_id not in per_id:
order.append(emb_id)
if sv is not None:
per_id[emb_id][key] = sv
elif iv is not None:
per_id[emb_id][key] = iv
elif fv is not None:
per_id[emb_id][key] = fv
elif bv is not None:
per_id[emb_id][key] = bool(bv)
for emb_id in order:
kv = per_id[emb_id]
doc = kv.pop("chroma:document", "")
yield emb_id, doc, kv
finally:
conn.close()
def rebuild_from_sqlite(
source_palace: str,
dest_palace: str,
*,
archive_existing_dest: bool = False,
batch_size: int = 1000,
) -> dict[str, int]:
"""Rebuild a palace by reading drawers from ``source_palace``'s
``chroma.sqlite3`` and upserting them into a fresh palace at
``dest_palace``.
Recovery path for the #1308 failure mode: the chromadb client raises
``InternalError: Failed to apply logs to the hnsw segment writer``
on every operation that touches the index (``count``, ``get``,
``query``), but the underlying SQLite tables are intact. Both the
legacy ``rebuild_index`` and the inline ``cli.cmd_repair`` path call
``Collection.count()`` as their first read — exactly the call that
fails — so neither can recover this class of corruption. This
function bypasses the chromadb read path entirely via
:func:`extract_via_sqlite`.
Re-embeds documents at upsert time using the configured embedding
function; the original HNSW vectors are not preserved (they live in
the corrupt ``data_level0.bin`` / ``link_lists.bin``, not in
SQLite). Acceptable for a corruption-recovery flow because the
embedding model is deterministic — same model + same document text
yields semantically equivalent search results.
``archive_existing_dest`` controls behavior when ``dest_palace``
already exists:
* ``False`` (default) — refuse with a clear message. Callers must
manually move the existing palace aside first.
* ``True`` — rename ``dest_palace`` to
``<dest_palace>.pre-rebuild-<timestamp>`` and read from there
instead. Used by the in-place CLI flow where ``--source`` defaults
to the same path as ``--palace``.
Returns a ``{collection_name: row_count}`` dict so callers (CLI,
tests) can verify the per-collection rebuild count without parsing
stdout. A successful rebuild always returns a dict with one key per
recoverable collection (values may be ``0`` when a collection is
legitimately empty in the source). The empty dict ``{}`` is reserved
for validation refusals (missing source DB, refusing to overwrite an
existing dest, in-place mode without ``archive_existing_dest``); CLI
callers should treat ``{}`` as an error and exit non-zero so CI and
scripts can distinguish "invalid inputs" from "successful recovery
that found zero rows." Raises :class:`RebuildPartialError` if a
chromadb upsert fails partway through; the dest palace is left in
place so the user can inspect what landed, and the in-place archive
(when applicable) is reported in the error so the user can re-run
against it.
.. warning::
In-place mode (``source_palace == dest_palace`` with
``archive_existing_dest=True``) calls
``chromadb.api.client.SharedSystemClient.clear_system_cache()`` to
drop chromadb's process-wide System registry — required because
an existing cached System built against the original palace will
refuse ``create_collection`` after the dir is renamed (chromadb
still thinks the collections exist). This invalidates any
PersistentClient instances held elsewhere in the same process for
*any* palace, not just this one. Do not call this function from
inside a long-running mempalace process (MCP server, daemon)
while other callers hold live ``PersistentClient`` references —
use the CLI in a separate process instead. Cross-palace use
(``source != dest``) does not touch the cache.
Note on metadata fidelity: the resolution rule
(``string_value`` → ``int_value`` → ``float_value`` → ``bool_value``)
matches the precedent in :mod:`mempalace.migrate`. ChromaDB 0.4.x
occasionally wrote booleans as ``int_value=0/1``; those will
round-trip as ``int`` rather than ``bool`` after this rebuild. This
is a known divergence and matches the existing migrate-path
behavior.
"""
source_palace = os.path.abspath(os.path.expanduser(source_palace))
dest_palace = os.path.abspath(os.path.expanduser(dest_palace))
src_db = os.path.join(source_palace, "chroma.sqlite3")
in_place = source_palace == dest_palace
print(f"\n{'=' * 55}")
print(" MemPalace Repair — Rebuild from SQLite")
print(f"{'=' * 55}\n")
print(f" Source: {source_palace}")
print(f" Dest: {dest_palace}")
# Validate source BEFORE any destructive moves. An earlier draft
# archived the dest first and surfaced the missing-chroma.sqlite3
# error after — leaving the user with a renamed dir to manually undo
# when the archive itself was empty. Validate first so a user error
# (--source pointing at a non-palace dir) bails cleanly.
if in_place:
if not archive_existing_dest:
print(
"\n Source and dest are the same path. Pass "
"archive_existing_dest=True (CLI: --archive-existing) to move "
"the existing palace aside, or pass a different source_palace= "
"(CLI: --source)."
)
return {}
if not os.path.isfile(src_db):
print(f"\n Source palace has no chroma.sqlite3 at {src_db}")
return {}
else:
if not os.path.isfile(src_db):
print(f"\n Source palace has no chroma.sqlite3 at {src_db}")
return {}
if os.path.exists(dest_palace):
print(
f"\n Refusing to rebuild into existing path: {dest_palace}\n"
" Move it aside, pass a different dest, or set "
"archive_existing_dest=True if rebuilding in place "
"(source_palace == dest_palace)."
)
return {}
archive_path: Optional[str] = None
if in_place:
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
archive_path = f"{dest_palace}.pre-rebuild-{ts}"
print(f" Archiving {dest_palace}{archive_path}")
shutil.move(dest_palace, archive_path)
source_palace = archive_path
src_db = os.path.join(source_palace, "chroma.sqlite3")
# In-place only: drop chromadb's process-wide System registry so
# the new client at dest_palace builds a fresh System. Without
# this, ``create_collection`` raises "Collection already exists"
# because the cached System still holds the pre-rename schema.
# Cross-palace mode does not need this and would needlessly
# invalidate other callers' clients (see docstring warning).
try:
from chromadb.api.client import SharedSystemClient
SharedSystemClient.clear_system_cache()
except Exception as exc: # noqa: BLE001
print(
f" Warning: could not clear chromadb system cache ({exc!r}); "
"in-place rebuild may fail with 'Collection already exists'."
)
os.makedirs(dest_palace, exist_ok=True)
# Backend lifetime is wrapped in try/finally so the dest palace's
# PersistentClient handle (opened lazily inside ``create_collection``
# / ``get_collection``) is released on every exit path: success,
# ``RebuildPartialError``, or any unexpected exception. Without this,
# a long-running process that calls ``rebuild_from_sqlite`` would
# leak SQLite/HNSW file handles into Chroma's ``SharedSystemClient``
# cache, surfacing later as "Collection already exists" on the next
# in-place rebuild or as a Windows file-lock failure on cleanup
# (cf. #1285's lifecycle hardening for the legacy rebuild path).
backend = ChromaBackend()
counts: dict[str, int] = {}
try:
for cname in _recoverable_collections():
print(f"\n [{cname}]")
upserted = _rebuild_one_collection(
backend=backend,
source_palace=source_palace,
dest_palace=dest_palace,
collection_name=cname,
batch_size=batch_size,
archive_path=archive_path,
counts_so_far=counts,
)
counts[cname] = upserted
if upserted == 0:
print(f" no rows found for {cname} in source palace")
else:
print(f" done: {upserted} rows in {cname}")
print(f"\n Rebuild complete. {sum(counts.values())} total rows.")
if archive_path is not None:
print(f" Original palace archived at: {archive_path}")
print(f"{'=' * 55}\n")
return counts
finally:
backend.close()
def status(palace_path=None, collection_name: Optional[str] = None) -> dict:
"""Read-only health check: compare sqlite vs HNSW element counts.
Catches the #1222 failure mode where chromadb's HNSW segment freezes
@@ -511,6 +1047,7 @@ def status(palace_path=None) -> dict:
``status="unknown"`` when no palace exists at the given path.
"""
palace_path = palace_path or _get_palace_path()
collection_name = collection_name or _drawers_collection_name()
print(f"\n{'=' * 55}")
print(" MemPalace Repair — Status")
print(f"{'=' * 55}\n")
@@ -520,8 +1057,8 @@ def status(palace_path=None) -> dict:
print(" No palace found.\n")
return {"status": "unknown", "message": "no palace at path"}
drawers = hnsw_capacity_status(palace_path, "mempalace_drawers")
closets = hnsw_capacity_status(palace_path, "mempalace_closets")
drawers = hnsw_capacity_status(palace_path, collection_name)
closets = hnsw_capacity_status(palace_path, CLOSETS_COLLECTION_NAME)
for label, info in (("drawers", drawers), ("closets", closets)):
print(f"\n [{label}]")
@@ -551,12 +1088,18 @@ def status(palace_path=None) -> dict:
# ---------------------------------------------------------------------------
def _close_chroma_handles(palace_path: str) -> None:
"""Drop ChromaBackend + chromadb singleton caches so OS mmap handles release."""
def _close_chroma_handles(palace_path: str, backend: "ChromaBackend | None" = None) -> None:
"""Drop ChromaBackend + chromadb singleton caches so OS mmap handles release.
When ``backend`` is provided, close the live instance so rollback/restore
releases the handles it was already using. Otherwise fall back to a
transient backend instance for the max-seq-id repair path.
"""
import gc
try:
ChromaBackend().close_palace(palace_path)
closer = backend if backend is not None else ChromaBackend()
closer.close_palace(palace_path)
except Exception:
pass
try:
+1 -1
View File
@@ -202,7 +202,7 @@ def detect_rooms_from_files(project_dir: str) -> list:
SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv", "dist", "build"}
for root, dirs, filenames in os.walk(project_path):
for _root, dirs, filenames in os.walk(project_path):
dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
for filename in filenames:
name_lower = filename.lower().replace("-", "_").replace(" ", "_")
+80 -25
View File
@@ -245,7 +245,7 @@ def _expand_with_neighbors(drawers_col, matched_doc: str, matched_meta: dict, ra
all_meta = drawers_col.get(where={"source_file": src}, include=["metadatas"])
total_drawers = len(all_meta.ids) if all_meta.ids else None
except Exception:
pass
logger.debug("total_drawers lookup failed for %s", src, exc_info=True)
return {
"text": combined_text,
@@ -297,10 +297,10 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
"""
try:
col = get_collection(palace_path, create=False)
except Exception:
except Exception as e:
print(f"\n No palace found at {palace_path}")
print(" Run: mempalace init <dir> then mempalace mine <dir>")
raise SearchError(f"No palace found at {palace_path}")
raise SearchError(f"No palace found at {palace_path}") from e
# Alert the user if this palace predates hnsw:space=cosine being set on
# creation — their similarity scores will be junk until they run repair.
@@ -340,7 +340,7 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
# `_hybrid_rank`; do the same here so CLI results match what agents
# see via `mempalace_search`.
hits = [
{"text": doc, "distance": float(dist), "metadata": meta or {}}
{"text": doc or "", "distance": float(dist), "metadata": meta or {}}
for doc, meta, dist in zip(docs, metas, dists)
]
hits = _hybrid_rank(hits, query)
@@ -382,6 +382,7 @@ def _bm25_only_via_sqlite(
n_results: int = 5,
max_candidates: int = 500,
_include_internal: bool = False,
collection_name: str = None,
) -> dict:
"""BM25-only search reading drawers directly from chroma.sqlite3.
@@ -405,6 +406,35 @@ def _bm25_only_via_sqlite(
"error": "No palace found",
"hint": "Run: mempalace init <dir> && mempalace mine <dir>",
}
if collection_name is None:
from .config import get_configured_collection_name
collection_name = get_configured_collection_name()
def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
clauses = []
params = []
for key, value in (("wing", wing), ("room", room)):
if not value:
continue
clauses.append(
f"""
AND EXISTS (
SELECT 1
FROM embedding_metadata mf
WHERE mf.id = {row_id_expr}
AND mf.key = ?
AND COALESCE(
mf.string_value,
CAST(mf.int_value AS TEXT),
CAST(mf.float_value AS TEXT),
CAST(mf.bool_value AS TEXT)
) = ?
)
"""
)
params.extend([key, value])
return "".join(clauses), params
try:
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
@@ -416,45 +446,57 @@ def _bm25_only_via_sqlite(
# shorter than 3 chars (trigram tokenizer can't match them).
tokens = [t for t in _tokenize(query) if len(t) >= 3]
candidate_ids: list[int] = []
use_recency_fallback = not tokens
if tokens:
fts_query = " OR ".join(tokens)
filter_sql, filter_params = _metadata_filter_sql("embedding_fulltext_search.rowid")
try:
rows = conn.execute(
"""
SELECT rowid
f"""
SELECT embedding_fulltext_search.rowid
FROM embedding_fulltext_search
JOIN embeddings e ON e.id = embedding_fulltext_search.rowid
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE embedding_fulltext_search MATCH ?
AND c.name = ?
{filter_sql}
LIMIT ?
""",
(fts_query, max_candidates),
(fts_query, collection_name, *filter_params, max_candidates),
).fetchall()
candidate_ids = [r[0] for r in rows]
except sqlite3.Error:
# FTS5 tokenizer mismatch or syntax error — fall through
# to the recency-window selector below.
logger.debug("FTS5 MATCH failed; using recency fallback", exc_info=True)
use_recency_fallback = True
if not candidate_ids:
# No FTS hits (or no usable tokens) — pull the most recent
# rows for the drawers segment so we can BM25-rank something
# rather than return empty-handed. Wrapped in try/except
# because the schema may differ on legacy palaces (older
# chromadb without ``created_at``, missing ``segments``
# rows after partial restore, etc.); on schema mismatch we
# fall back to ordering by primary-key id and finally to an
# empty result rather than letting search raise.
if not candidate_ids and use_recency_fallback:
# No usable FTS tokens, or FTS itself failed — pull the most
# recent rows for the drawers segment so we can BM25-rank
# something rather than return empty-handed. A clean FTS miss
# must stay empty, especially after wing/room filtering, because
# recency fallback would return unrelated scoped drawers.
# Wrapped in try/except because the schema may differ on legacy
# palaces (older chromadb without ``created_at``, missing
# ``segments`` rows after partial restore, etc.); on schema
# mismatch we fall back to ordering by primary-key id and finally
# to an empty result rather than letting search raise.
try:
filter_sql, filter_params = _metadata_filter_sql("e.id")
rows = conn.execute(
"""
f"""
SELECT e.id
FROM embeddings e
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = 'mempalace_drawers'
WHERE c.name = ?
{filter_sql}
ORDER BY e.created_at DESC
LIMIT ?
""",
(max_candidates,),
(collection_name, *filter_params, max_candidates),
).fetchall()
candidate_ids = [r[0] for r in rows]
except sqlite3.Error:
@@ -463,17 +505,19 @@ def _bm25_only_via_sqlite(
exc_info=True,
)
try:
filter_sql, filter_params = _metadata_filter_sql("e.id")
rows = conn.execute(
"""
f"""
SELECT e.id
FROM embeddings e
JOIN segments s ON e.segment_id = s.id
JOIN collections c ON s.collection = c.id
WHERE c.name = 'mempalace_drawers'
WHERE c.name = ?
{filter_sql}
ORDER BY e.id DESC
LIMIT ?
""",
(max_candidates,),
(collection_name, *filter_params, max_candidates),
).fetchall()
candidate_ids = [r[0] for r in rows]
except sqlite3.Error:
@@ -689,6 +733,7 @@ def search_memories(
max_distance: float = 0.0,
vector_disabled: bool = False,
candidate_strategy: str = "vector",
collection_name: str = None,
) -> dict:
"""Programmatic search — returns a dict instead of printing.
@@ -739,10 +784,11 @@ def search_memories(
wing=wing,
room=room,
n_results=n_results,
collection_name=collection_name,
)
try:
drawers_col = get_collection(palace_path, create=False)
drawers_col = get_collection(palace_path, collection_name=collection_name, create=False)
except Exception as e:
logger.error("No palace found at %s: %s", palace_path, e)
return {
@@ -795,7 +841,8 @@ def search_memories(
if source and source not in closet_boost_by_source:
closet_boost_by_source[source] = (rank, cdist, cdoc[:200])
except Exception:
pass # no closets yet — hybrid degrades to pure drawer search
# No closets yet — hybrid degrades to pure drawer search.
logger.debug("Closet collection unavailable; using drawer-only search", exc_info=True)
# Rank-based boost. The ordinal signal ("which closet matched best") is
# more reliable than absolute distance on narrative content, where
@@ -809,6 +856,8 @@ def search_memories(
_first_or_empty(drawer_results, "metadatas"),
_first_or_empty(drawer_results, "distances"),
):
meta = meta or {}
doc = doc or ""
# Filter on raw distance before rounding to avoid precision loss.
if max_distance > 0.0 and dist > max_distance:
continue
@@ -825,7 +874,12 @@ def search_memories(
matched_via = "drawer+closet"
closet_preview = c_preview
effective_dist = dist - boost
# Clamp to the valid cosine-distance range [0, 2]. When a strong
# closet boost (up to 0.40) exceeds the raw distance, the subtraction
# can go negative — which (a) yields ``similarity > 1.0`` downstream
# and (b) makes the sort key land *below* ordinary positive distances,
# inverting the ranking so the best hybrid matches sort last.
effective_dist = max(0.0, min(2.0, dist - boost))
entry = {
"text": doc,
"wing": meta.get("wing", "unknown"),
@@ -870,6 +924,7 @@ def search_memories(
include=["documents", "metadatas"],
)
except Exception:
logger.debug("Neighbor fetch failed for %s", full_source, exc_info=True)
continue
docs = source_drawers.documents
metas_ = source_drawers.metadatas