merge: develop into fix/1274-missing-hnsw-metadata-gate
Two conflicts, both because #1339 (bloated link payloads) merged into develop after this branch was authored: - mempalace/backends/chroma.py: _segment_appears_healthy now stacks three checks — bloated-link from #1339 (top), missing-metadata-with- data-floor from this branch (middle), pickle format sniff (bottom). All three are complementary; #1339 catches structural payload corruption, this branch catches pickle truncation, the original catches pickle protocol-byte corruption. - tests/test_backends.py: kept both new imports (_segment_appears_healthy from this branch, quarantine_invalid_hnsw_metadata from #1285). Local: 1618 tests pass, ruff lint+format clean against 0.4.x CI pin.
This commit is contained in:
@@ -0,0 +1,71 @@
|
||||
"""Stdio UTF-8 reconfiguration helper for Windows entry points.
|
||||
|
||||
Python on Windows defaults stdio to the system ANSI codepage
|
||||
(cp1252/cp1251/cp950 depending on locale), which mojibakes UTF-8 input
|
||||
or output the moment a non-Latin character shows up. Every console
|
||||
entry point that touches stdio needs to fix this on Windows -- the MCP
|
||||
server, the CLI, the fact_checker `--stdin` mode -- so the
|
||||
reconfigure code lives here in one place to keep the per-stream
|
||||
errors policies aligned across them.
|
||||
|
||||
Per-stream errors policy is caller-chosen:
|
||||
|
||||
* MCP server uses ``strict`` on stdout/stderr because everything written
|
||||
there is server-controlled JSON-RPC; any encode failure is a real bug
|
||||
the operator wants loud.
|
||||
* CLI / fact_checker use ``replace`` on stdout/stderr because they print
|
||||
verbatim drawer text that may contain surrogate halves round-tripped
|
||||
from filenames -- ``strict`` would crash mid-print.
|
||||
* All callers use ``surrogateescape`` on stdin so a malformed byte from
|
||||
a redirected file or a misbehaving client survives as a lone surrogate
|
||||
the consumer's parser surfaces, instead of ``UnicodeDecodeError``
|
||||
killing the read loop on the first bad byte.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from typing import Callable, Optional
|
||||
|
||||
|
||||
def reconfigure_stdio_utf8_on_windows(
|
||||
*,
|
||||
stdin_errors: str = "surrogateescape",
|
||||
stdout_errors: str = "strict",
|
||||
stderr_errors: str = "strict",
|
||||
on_failure: Optional[Callable[[str, BaseException], None]] = None,
|
||||
) -> None:
|
||||
"""Reconfigure stdio to UTF-8 on Windows. No-op elsewhere.
|
||||
|
||||
Args:
|
||||
stdin_errors: errors= policy for stdin.reconfigure().
|
||||
stdout_errors: errors= policy for stdout.reconfigure().
|
||||
stderr_errors: errors= policy for stderr.reconfigure().
|
||||
on_failure: optional ``(stream_name, exc) -> None`` callback for
|
||||
streams whose ``reconfigure`` raises (e.g. Jupyter-replaced
|
||||
streams that lack the method-shape we expect). Defaults to a
|
||||
``WARNING:`` line on the original sys.stderr.
|
||||
"""
|
||||
if sys.platform != "win32":
|
||||
return
|
||||
|
||||
policies = (
|
||||
("stdin", stdin_errors),
|
||||
("stdout", stdout_errors),
|
||||
("stderr", stderr_errors),
|
||||
)
|
||||
for name, errors in policies:
|
||||
stream = getattr(sys, name, None)
|
||||
reconfigure = getattr(stream, "reconfigure", None)
|
||||
if reconfigure is None:
|
||||
continue
|
||||
try:
|
||||
reconfigure(encoding="utf-8", errors=errors)
|
||||
except Exception as exc: # noqa: BLE001 -- last-resort guard
|
||||
if on_failure is not None:
|
||||
on_failure(name, exc)
|
||||
else:
|
||||
print(
|
||||
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
+298
-101
@@ -1,9 +1,12 @@
|
||||
"""ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation)."""
|
||||
|
||||
import contextlib
|
||||
import datetime as _dt
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import sqlite3
|
||||
from numbers import Integral
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
@@ -29,6 +32,51 @@ _REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$c
|
||||
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
|
||||
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
|
||||
|
||||
# A healthy HNSW payload should keep link_lists.bin proportional to
|
||||
# data_level0.bin. When link_lists.bin grows orders of magnitude larger than
|
||||
# data_level0.bin, Chroma/HNSW can segfault while opening the segment even if
|
||||
# index_metadata.pickle is structurally valid.
|
||||
#
|
||||
# The report in #1218 showed ratios above 300x, while healthy snapshots were far below 1x.
|
||||
# Treat only >10x as corruption so normal flush lag or small segments do not get
|
||||
# quarantined.
|
||||
_HNSW_LINK_TO_DATA_MAX_RATIO = 10.0
|
||||
|
||||
|
||||
def _hnsw_link_to_data_ratio(seg_dir: str) -> Optional[float]:
|
||||
"""Return link_lists.bin / data_level0.bin size ratio for a segment.
|
||||
|
||||
``None`` means the ratio is not meaningful, usually because one file is
|
||||
missing or data_level0.bin is empty. ``float("inf")`` means the files were
|
||||
present but could not be statted safely, which should be treated as
|
||||
suspicious by callers.
|
||||
"""
|
||||
|
||||
link_path = os.path.join(seg_dir, "link_lists.bin")
|
||||
data_path = os.path.join(seg_dir, "data_level0.bin")
|
||||
|
||||
if not (os.path.isfile(link_path) and os.path.isfile(data_path)):
|
||||
return None
|
||||
|
||||
try:
|
||||
data_size = os.path.getsize(data_path)
|
||||
link_size = os.path.getsize(link_path)
|
||||
except OSError:
|
||||
return float("inf")
|
||||
|
||||
if data_size <= 0:
|
||||
return None
|
||||
|
||||
return link_size / data_size
|
||||
|
||||
|
||||
def _hnsw_payload_appears_sane(seg_dir: str) -> bool:
|
||||
"""Return False when HNSW payload files are structurally implausible."""
|
||||
|
||||
ratio = _hnsw_link_to_data_ratio(seg_dir)
|
||||
return ratio is None or ratio <= _HNSW_LINK_TO_DATA_MAX_RATIO
|
||||
|
||||
|
||||
# HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
|
||||
#
|
||||
# With default params (batch_size=100, sync_threshold=1000, initial capacity
|
||||
@@ -110,6 +158,8 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
|
||||
files and quarantine_stale_hnsw would conservatively rename them
|
||||
out of the way.
|
||||
"""
|
||||
if not _hnsw_payload_appears_sane(seg_dir):
|
||||
return False
|
||||
|
||||
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
|
||||
if not os.path.isfile(meta_path):
|
||||
@@ -142,64 +192,35 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
|
||||
|
||||
|
||||
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
|
||||
"""Rename HNSW segment dirs that are both stale-by-mtime AND fail an
|
||||
integrity sniff-test.
|
||||
"""Rename HNSW segment dirs that look unsafe to open.
|
||||
|
||||
Catches the segfault failure mode from #823 (semantic search stale
|
||||
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
|
||||
``count()`` with chromadb 1.5.5), and acknowledged as by-design at
|
||||
chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
|
||||
rebuild lazily on next open instead of segfaulting.
|
||||
This catches two classes of HNSW corruption before ChromaDB opens the
|
||||
native segment reader:
|
||||
|
||||
Two-stage check:
|
||||
1. stale-by-mtime segments whose ``index_metadata.pickle`` fails the
|
||||
existing format sniff-test;
|
||||
2. structurally impossible HNSW payloads where ``link_lists.bin`` is much
|
||||
larger than ``data_level0.bin``.
|
||||
|
||||
1. **mtime gate.** If ``chroma.sqlite3`` is less than
|
||||
``stale_seconds`` newer than the segment's ``data_level0.bin``,
|
||||
skip — chromadb is in normal write-path territory.
|
||||
The second check is intentionally not gated by mtime. A segment with a
|
||||
300x link/data ratio is unsafe regardless of whether its mtime is recent;
|
||||
letting Chroma open it can SIGSEGV before Python fallback code runs.
|
||||
|
||||
2. **Integrity gate** (``_segment_appears_healthy``). Even when the
|
||||
mtime gap exceeds the threshold, a segment whose
|
||||
``index_metadata.pickle`` passes a format sniff-test is healthy:
|
||||
chromadb 1.5.x flushes HNSW state asynchronously and a clean
|
||||
shutdown does NOT force-flush, so the on-disk HNSW is *always*
|
||||
somewhat older than ``chroma.sqlite3``. Production observation
|
||||
(2026-04-26 disks daemon): three of three segments quarantined
|
||||
on every cold start, with 538-557s gaps, leaving the 151K-drawer
|
||||
palace with vector_ranked=0 until rebuild. Renaming a healthy
|
||||
segment based on mtime alone destroys a valid index — chromadb
|
||||
creates an empty replacement, orphaning every drawer in sqlite
|
||||
from vector recall until the operator runs ``mempalace repair
|
||||
--mode rebuild`` (15+ min on a 151K palace).
|
||||
|
||||
Only segments that pass stage 1 (suspiciously stale) AND fail stage
|
||||
2 (metadata file truncated, zero-filled, or absent-with-data) are
|
||||
renamed to ``<uuid>.drift-<timestamp>``. The original directory is
|
||||
renamed, not deleted, so recovery remains possible if the heuristic
|
||||
misfires.
|
||||
|
||||
The default threshold (5 min) is advisory under daemon-strict; the
|
||||
integrity gate is what actually distinguishes corruption from flush
|
||||
lag. The threshold still matters for the cross-machine replication
|
||||
case (#823), where it bounds how stale a Syncthing-replicated
|
||||
segment can be before we look harder at it.
|
||||
|
||||
Args:
|
||||
palace_path: path to the palace directory containing ``chroma.sqlite3``
|
||||
stale_seconds: minimum mtime gap to *consider* a segment for quarantine
|
||||
|
||||
Returns:
|
||||
List of paths that were quarantined (empty if nothing actually
|
||||
looked corrupt).
|
||||
The original directory is renamed, not deleted, so recovery remains
|
||||
possible if the heuristic ever misfires.
|
||||
"""
|
||||
|
||||
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||
if not os.path.isfile(db_path):
|
||||
return []
|
||||
|
||||
try:
|
||||
sqlite_mtime = os.path.getmtime(db_path)
|
||||
except OSError:
|
||||
return []
|
||||
|
||||
moved: list[str] = []
|
||||
|
||||
try:
|
||||
entries = os.listdir(palace_path)
|
||||
except OSError:
|
||||
@@ -208,29 +229,34 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
|
||||
for name in entries:
|
||||
if "-" not in name or name.startswith(".") or ".drift-" in name:
|
||||
continue
|
||||
|
||||
seg_dir = os.path.join(palace_path, name)
|
||||
if not os.path.isdir(seg_dir):
|
||||
continue
|
||||
|
||||
hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
|
||||
if not os.path.isfile(hnsw_bin):
|
||||
continue
|
||||
|
||||
try:
|
||||
hnsw_mtime = os.path.getmtime(hnsw_bin)
|
||||
except OSError:
|
||||
continue
|
||||
if sqlite_mtime - hnsw_mtime < stale_seconds:
|
||||
|
||||
payload_ratio = _hnsw_link_to_data_ratio(seg_dir)
|
||||
payload_corrupt = payload_ratio is not None and payload_ratio > _HNSW_LINK_TO_DATA_MAX_RATIO
|
||||
|
||||
if not payload_corrupt and sqlite_mtime - hnsw_mtime < stale_seconds:
|
||||
continue
|
||||
|
||||
# Stage 2: integrity gate. mtime drift is necessary but not
|
||||
# sufficient — chromadb's async flush makes drift the steady-
|
||||
# state condition. A healthy segment metadata file proves
|
||||
# chromadb can open the segment without segfault; don't
|
||||
# quarantine a healthy index.
|
||||
if _segment_appears_healthy(seg_dir):
|
||||
# Stage 2: integrity gate. Mtime drift alone is not corruption because
|
||||
# Chroma flushes HNSW asynchronously. A healthy metadata file proves the
|
||||
# ordinary stale-by-mtime case is just flush lag.
|
||||
if not payload_corrupt and _segment_appears_healthy(seg_dir):
|
||||
logger.info(
|
||||
"HNSW mtime gap %.0fs on %s exceeds threshold but segment "
|
||||
"metadata file is intact — flush-lag, not corruption. "
|
||||
"Leaving in place.",
|
||||
"metadata and payload size are intact — flush-lag, not "
|
||||
"corruption. Leaving in place.",
|
||||
sqlite_mtime - hnsw_mtime,
|
||||
seg_dir,
|
||||
)
|
||||
@@ -238,17 +264,30 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
|
||||
|
||||
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
target = f"{seg_dir}.drift-{stamp}"
|
||||
|
||||
if payload_corrupt:
|
||||
reason = (
|
||||
f"link_lists.bin/data_level0.bin ratio {payload_ratio:.1f}x "
|
||||
f"exceeds {_HNSW_LINK_TO_DATA_MAX_RATIO:.1f}x"
|
||||
)
|
||||
else:
|
||||
reason = (
|
||||
f"sqlite {sqlite_mtime - hnsw_mtime:.0f}s newer than HNSW "
|
||||
"and integrity check failed"
|
||||
)
|
||||
|
||||
try:
|
||||
os.rename(seg_dir, target)
|
||||
moved.append(target)
|
||||
logger.warning(
|
||||
"Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
|
||||
"Quarantined corrupt HNSW segment %s (%s); renamed to %s",
|
||||
seg_dir,
|
||||
sqlite_mtime - hnsw_mtime,
|
||||
reason,
|
||||
target,
|
||||
)
|
||||
except OSError:
|
||||
logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
|
||||
|
||||
return moved
|
||||
|
||||
|
||||
@@ -504,22 +543,17 @@ def hnsw_capacity_status(palace_path: str, collection_name: str = "mempalace_dra
|
||||
divergence_floor = max(_HNSW_DIVERGENCE_FALLBACK_FLOOR, 2 * sync_threshold)
|
||||
|
||||
if hnsw_count is None:
|
||||
# No pickle yet — segment hasn't persisted metadata. Could be
|
||||
# fresh-but-unflushed (normal) or interrupted-mid-flush (bad).
|
||||
# We can't distinguish without the pickle, so only flag
|
||||
# divergence when sqlite holds clearly more than two flush
|
||||
# windows worth — same threshold as the with-pickle path.
|
||||
if sqlite_count > divergence_floor:
|
||||
out["status"] = "diverged"
|
||||
out["diverged"] = True
|
||||
out["divergence"] = sqlite_count
|
||||
out["message"] = (
|
||||
f"sqlite holds {sqlite_count:,} embeddings but the HNSW segment "
|
||||
"has never flushed metadata — vector search will return nothing "
|
||||
"until the segment is rebuilt. Run `mempalace repair`."
|
||||
)
|
||||
else:
|
||||
out["message"] = "HNSW segment metadata not yet flushed; skipping"
|
||||
# No pickle yet, so this probe cannot measure HNSW capacity.
|
||||
# Chroma 1.5.x can have binary HNSW files without a flushed
|
||||
# metadata pickle; absence of the pickle alone is not proof that
|
||||
# vector search is unusable or dangerous. Keep the status unknown
|
||||
# so MCP does not globally disable vectors on an inconclusive
|
||||
# signal. Corrupt/invalid metadata, when present, is handled by
|
||||
# quarantine_invalid_hnsw_metadata before Chroma opens.
|
||||
out["message"] = (
|
||||
"HNSW capacity unavailable: metadata has not been flushed; "
|
||||
"leaving vector search enabled"
|
||||
)
|
||||
return out
|
||||
|
||||
divergence = sqlite_count - hnsw_count
|
||||
@@ -606,6 +640,97 @@ def _pin_hnsw_threads(collection) -> None:
|
||||
_BLOB_FIX_MARKER = ".blob_seq_ids_migrated"
|
||||
|
||||
|
||||
def _valid_dimensionality(value: object) -> bool:
|
||||
return isinstance(value, Integral) and not isinstance(value, bool) and int(value) > 0
|
||||
|
||||
|
||||
def _persisted_metadata_fields(obj: object) -> tuple[object, object]:
|
||||
if isinstance(obj, dict):
|
||||
return obj.get("dimensionality"), obj.get("id_to_label")
|
||||
return getattr(obj, "dimensionality", None), getattr(obj, "id_to_label", None)
|
||||
|
||||
|
||||
def quarantine_invalid_hnsw_metadata(palace_path: str) -> list[str]:
|
||||
"""Quarantine segment dirs whose ``index_metadata.pickle`` is unreadable or invalid.
|
||||
|
||||
Chroma's persisted HNSW metadata is untrusted disk state. If a segment has
|
||||
labels but no valid positive dimensionality, current Chroma versions can
|
||||
accept the pickle and crash later in the Rust loader. We rename the entire
|
||||
segment out of the way before ``PersistentClient`` opens so Chroma can
|
||||
rebuild cleanly instead of touching known-bad metadata.
|
||||
"""
|
||||
try:
|
||||
entries = os.listdir(palace_path)
|
||||
except OSError:
|
||||
return []
|
||||
|
||||
moved: list[str] = []
|
||||
for name in entries:
|
||||
if "-" not in name or name.startswith(".") or ".drift-" in name or ".corrupt-" in name:
|
||||
continue
|
||||
seg_dir = os.path.join(palace_path, name)
|
||||
if not os.path.isdir(seg_dir):
|
||||
continue
|
||||
|
||||
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
|
||||
if not os.path.isfile(meta_path):
|
||||
continue
|
||||
|
||||
reason = None
|
||||
try:
|
||||
persisted = _SafePersistentDataUnpickler.load(meta_path)
|
||||
except (EOFError, OSError):
|
||||
logger.debug(
|
||||
"Skipping invalid-HNSW quarantine for transient metadata read in %s",
|
||||
meta_path,
|
||||
exc_info=True,
|
||||
)
|
||||
continue
|
||||
except pickle.UnpicklingError as exc:
|
||||
if "truncated" in str(exc).lower() or "ran out of input" in str(exc).lower():
|
||||
logger.debug(
|
||||
"Skipping invalid-HNSW quarantine for transient metadata read in %s",
|
||||
meta_path,
|
||||
exc_info=True,
|
||||
)
|
||||
continue
|
||||
reason = f"invalid index_metadata.pickle: {exc}"
|
||||
except Exception as exc:
|
||||
reason = f"invalid index_metadata.pickle: {exc}"
|
||||
else:
|
||||
if not isinstance(persisted, dict) and not (
|
||||
hasattr(persisted, "dimensionality") or hasattr(persisted, "id_to_label")
|
||||
):
|
||||
reason = f"unrecognized index_metadata.pickle payload: {type(persisted).__name__}"
|
||||
else:
|
||||
dimensionality, id_to_label = _persisted_metadata_fields(persisted)
|
||||
if id_to_label is not None and not isinstance(id_to_label, dict):
|
||||
reason = f"invalid id_to_label type {type(id_to_label).__name__}"
|
||||
else:
|
||||
has_labels = bool(id_to_label)
|
||||
if has_labels and not _valid_dimensionality(dimensionality):
|
||||
reason = (
|
||||
"labels present but dimensionality is missing or invalid "
|
||||
f"({dimensionality!r})"
|
||||
)
|
||||
elif dimensionality is not None and not _valid_dimensionality(dimensionality):
|
||||
reason = f"invalid dimensionality {dimensionality!r}"
|
||||
|
||||
if reason is None:
|
||||
continue
|
||||
|
||||
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
target = f"{seg_dir}.corrupt-{stamp}"
|
||||
try:
|
||||
os.rename(seg_dir, target)
|
||||
moved.append(target)
|
||||
logger.warning("Quarantined invalid HNSW metadata in %s: %s", seg_dir, reason)
|
||||
except OSError:
|
||||
logger.exception("Failed to quarantine invalid HNSW metadata in %s", seg_dir)
|
||||
|
||||
return moved
|
||||
|
||||
|
||||
def _fix_blob_seq_ids(palace_path: str) -> None:
|
||||
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
|
||||
|
||||
@@ -691,11 +816,58 @@ def _as_list(v: Any) -> list:
|
||||
return [v]
|
||||
|
||||
|
||||
class ChromaCollection(BaseCollection):
|
||||
"""Thin adapter translating ChromaDB dict returns into typed results."""
|
||||
def _close_client(client) -> None:
|
||||
"""Call ``PersistentClient.close()`` if available, swallow otherwise.
|
||||
|
||||
def __init__(self, collection):
|
||||
chromadb 1.5.x exposes ``Client.close()`` to release rust-side SQLite
|
||||
file locks; older versions relied on GC. Try/except keeps forward-compat.
|
||||
"""
|
||||
if client is None:
|
||||
return
|
||||
try:
|
||||
client.close()
|
||||
except Exception:
|
||||
logger.debug("client.close() unavailable or failed", exc_info=True)
|
||||
|
||||
|
||||
class ChromaCollection(BaseCollection):
|
||||
"""Thin adapter translating ChromaDB dict returns into typed results.
|
||||
|
||||
When ``palace_path`` is set, all write methods (``add``, ``upsert``,
|
||||
``update``, ``delete``) acquire ``mine_palace_lock(palace_path)`` for the
|
||||
duration of the underlying chromadb call. This serializes MCP and other
|
||||
direct-backend writers against ``mempalace mine`` and against each other,
|
||||
closing the race between concurrent writers that triggers ChromaDB's
|
||||
multi-threaded HNSW corruption (#974/#965).
|
||||
|
||||
The lock is the same primitive used by ``miner.mine()`` so re-entrant
|
||||
acquisition from inside the mine pipeline (mine -> _mine_body ->
|
||||
collection.upsert) is short-circuited by the per-thread guard inside
|
||||
``mine_palace_lock`` — no self-deadlock.
|
||||
|
||||
``palace_path=None`` disables the wrapping, preserving the legacy
|
||||
no-lock behaviour for callers that construct a ``ChromaCollection``
|
||||
directly without going through ``ChromaBackend``.
|
||||
"""
|
||||
|
||||
def __init__(self, collection, palace_path: Optional[str] = None):
|
||||
self._collection = collection
|
||||
self._palace_path = palace_path
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _write_lock(self):
|
||||
"""Acquire ``mine_palace_lock`` for the configured palace, if any.
|
||||
|
||||
No-op (yields immediately) when ``self._palace_path`` is None.
|
||||
"""
|
||||
if self._palace_path is None:
|
||||
yield
|
||||
return
|
||||
# Late import — palace.py imports ChromaBackend from this module.
|
||||
from ..palace import mine_palace_lock
|
||||
|
||||
with mine_palace_lock(self._palace_path):
|
||||
yield
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Writes
|
||||
@@ -707,7 +879,8 @@ class ChromaCollection(BaseCollection):
|
||||
kwargs["metadatas"] = metadatas
|
||||
if embeddings is not None:
|
||||
kwargs["embeddings"] = embeddings
|
||||
self._collection.add(**kwargs)
|
||||
with self._write_lock():
|
||||
self._collection.add(**kwargs)
|
||||
|
||||
def upsert(self, *, documents, ids, metadatas=None, embeddings=None):
|
||||
kwargs: dict[str, Any] = {"documents": documents, "ids": ids}
|
||||
@@ -715,7 +888,8 @@ class ChromaCollection(BaseCollection):
|
||||
kwargs["metadatas"] = metadatas
|
||||
if embeddings is not None:
|
||||
kwargs["embeddings"] = embeddings
|
||||
self._collection.upsert(**kwargs)
|
||||
with self._write_lock():
|
||||
self._collection.upsert(**kwargs)
|
||||
|
||||
def update(
|
||||
self,
|
||||
@@ -734,7 +908,8 @@ class ChromaCollection(BaseCollection):
|
||||
kwargs["metadatas"] = metadatas
|
||||
if embeddings is not None:
|
||||
kwargs["embeddings"] = embeddings
|
||||
self._collection.update(**kwargs)
|
||||
with self._write_lock():
|
||||
self._collection.update(**kwargs)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Reads
|
||||
@@ -878,7 +1053,8 @@ class ChromaCollection(BaseCollection):
|
||||
kwargs["ids"] = ids
|
||||
if where is not None:
|
||||
kwargs["where"] = where
|
||||
self._collection.delete(**kwargs)
|
||||
with self._write_lock():
|
||||
self._collection.delete(**kwargs)
|
||||
|
||||
def count(self):
|
||||
return self._collection.count()
|
||||
@@ -992,7 +1168,7 @@ class ChromaBackend(BaseBackend):
|
||||
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||
# DB was present when cache was built but is now missing → invalidate.
|
||||
if cached is not None and not os.path.isfile(db_path):
|
||||
self._clients.pop(palace_path, None)
|
||||
_close_client(self._clients.pop(palace_path, None))
|
||||
self._freshness.pop(palace_path, None)
|
||||
cached = None
|
||||
cached_inode, cached_mtime = 0, 0.0
|
||||
@@ -1008,6 +1184,13 @@ class ChromaBackend(BaseBackend):
|
||||
)
|
||||
|
||||
if cached is None or inode_changed or mtime_changed or mtime_appeared:
|
||||
# An inode swap means we are reopening a different physical DB
|
||||
# (post-restore, fresh palace at the same path, etc.); drop the
|
||||
# per-process gate so the quarantine pre-checks run again
|
||||
# against the new disk state instead of trusting cached "we
|
||||
# already cleaned this path" credit from the prior inode.
|
||||
if inode_changed:
|
||||
ChromaBackend._quarantined_paths.discard(palace_path)
|
||||
ChromaBackend._prepare_palace_for_open(palace_path)
|
||||
cached = chromadb.PersistentClient(path=palace_path)
|
||||
self._clients[palace_path] = cached
|
||||
@@ -1021,26 +1204,27 @@ class ChromaBackend(BaseBackend):
|
||||
# Public static helpers (legacy; prefer :meth:`get_collection`)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
# Per-process record of palaces that have already had quarantine_stale_hnsw
|
||||
# invoked at least once. The proactive drift check is a *cold-start*
|
||||
# protection — it catches HNSW segments that arrived stale relative to
|
||||
# ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore,
|
||||
# crashed-mid-write). Once a long-running process has opened the palace
|
||||
# cleanly, re-firing on every reconnect is a *runtime thrash*: the
|
||||
# daemon's own writes bump sqlite mtime but HNSW flushes batch on
|
||||
# chromadb's internal cadence, so the mtime gap naturally exceeds the
|
||||
# threshold under steady write load even though nothing is corrupt.
|
||||
# Per-process record of palaces that have already had the cold-start
|
||||
# quarantine invoked at least once. The proactive HNSW checks are a
|
||||
# *cold-start* protection — they catch segments that arrive stale relative
|
||||
# to ``chroma.sqlite3`` or invalid on disk (e.g. cross-machine replication,
|
||||
# partial restore, crashed-mid-write). Once a long-running process has
|
||||
# opened the palace cleanly, re-firing the stale check on every reconnect
|
||||
# is a *runtime thrash*: the daemon's own writes bump sqlite mtime but HNSW
|
||||
# flushes batch on chromadb's internal cadence, so the mtime gap naturally
|
||||
# exceeds the threshold under steady write load even though nothing is
|
||||
# corrupt.
|
||||
# Real runtime drift is still handled — palace-daemon's ``_auto_repair``
|
||||
# calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
|
||||
# which bypasses this gate.
|
||||
#
|
||||
# Thread-safety: this set is mutated without a lock. Two concurrent
|
||||
# ``make_client()`` calls for the same palace can both pass the
|
||||
# membership check and both invoke ``quarantine_stale_hnsw``. That's
|
||||
# safe because the function is idempotent (mtime check + timestamped
|
||||
# rename of distinct directories), so the worst-case race produces
|
||||
# one redundant rename attempt that no-ops. Idempotency is the
|
||||
# safety property; locking would add cost without correctness gain.
|
||||
# membership check and both invoke the cold-start quarantine. That's
|
||||
# safe because the functions are idempotent (mtime checks + timestamped
|
||||
# rename of distinct directories), so the worst-case race produces one
|
||||
# redundant rename attempt that no-ops. Idempotency is the safety
|
||||
# property; locking would add cost without correctness gain.
|
||||
_quarantined_paths: set[str] = set()
|
||||
|
||||
@staticmethod
|
||||
@@ -1048,12 +1232,16 @@ class ChromaBackend(BaseBackend):
|
||||
"""Run the pre-open safety pass shared by :meth:`make_client` and
|
||||
:meth:`_client`.
|
||||
|
||||
Two steps, both required before constructing a ``PersistentClient``:
|
||||
Three steps, all required before constructing a ``PersistentClient``:
|
||||
|
||||
1. ``_fix_blob_seq_ids`` — repairs the BLOB seq_id quirk that bites
|
||||
certain chromadb migrations.
|
||||
2. ``quarantine_stale_hnsw`` — gated by :attr:`_quarantined_paths` so
|
||||
it fires once per palace per process. This is the SIGSEGV
|
||||
2. ``quarantine_invalid_hnsw_metadata`` — renames aside any HNSW
|
||||
``index_metadata.pickle`` that fails to load, so chromadb opens
|
||||
against an empty index instead of crashing on the unloadable
|
||||
pickle (#1266 / PR #1285).
|
||||
3. ``quarantine_stale_hnsw`` — also gated by :attr:`_quarantined_paths`
|
||||
so it fires once per palace per process. This is the SIGSEGV
|
||||
prevention path for stale HNSW segments (see #1121, #1132, #1263);
|
||||
wiring it through this helper means CLI mining, search, repair,
|
||||
and status all benefit, not just the legacy ``make_client``
|
||||
@@ -1065,6 +1253,7 @@ class ChromaBackend(BaseBackend):
|
||||
"""
|
||||
_fix_blob_seq_ids(palace_path)
|
||||
if palace_path not in ChromaBackend._quarantined_paths:
|
||||
quarantine_invalid_hnsw_metadata(palace_path)
|
||||
quarantine_stale_hnsw(palace_path)
|
||||
ChromaBackend._quarantined_paths.add(palace_path)
|
||||
|
||||
@@ -1076,7 +1265,7 @@ class ChromaBackend(BaseBackend):
|
||||
own client cache. New code should obtain a collection through
|
||||
:meth:`get_collection` which manages caching internally.
|
||||
|
||||
Quarantines stale HNSW segments **once per palace per process**. See
|
||||
Quarantines HNSW segments **once per palace per process**. See
|
||||
:attr:`_quarantined_paths` for the rationale (cold-start protection
|
||||
vs. runtime thrash on steady-write daemons).
|
||||
"""
|
||||
@@ -1146,17 +1335,25 @@ class ChromaBackend(BaseBackend):
|
||||
else:
|
||||
collection = client.get_collection(collection_name, **ef_kwargs)
|
||||
_pin_hnsw_threads(collection)
|
||||
return ChromaCollection(collection)
|
||||
return ChromaCollection(collection, palace_path=palace_path)
|
||||
|
||||
def close_palace(self, palace) -> None:
|
||||
"""Drop cached handles for ``palace``. Accepts ``PalaceRef`` or legacy path str."""
|
||||
"""Drop cached handles for ``palace`` and release its SQLite file lock.
|
||||
|
||||
Accepts ``PalaceRef`` or legacy path str. chromadb's rust-side file
|
||||
lock is held until ``PersistentClient.close()`` is called, so plain
|
||||
dict eviction would leave the palace path unreopenable and
|
||||
unremovable in the same process.
|
||||
"""
|
||||
path = palace.local_path if isinstance(palace, PalaceRef) else palace
|
||||
if path is None:
|
||||
return
|
||||
self._clients.pop(path, None)
|
||||
_close_client(self._clients.pop(path, None))
|
||||
self._freshness.pop(path, None)
|
||||
|
||||
def close(self) -> None:
|
||||
for client in self._clients.values():
|
||||
_close_client(client)
|
||||
self._clients.clear()
|
||||
self._freshness.clear()
|
||||
self._closed = True
|
||||
@@ -1197,7 +1394,7 @@ class ChromaBackend(BaseBackend):
|
||||
},
|
||||
**ef_kwargs,
|
||||
)
|
||||
return ChromaCollection(collection)
|
||||
return ChromaCollection(collection, palace_path=palace_path)
|
||||
|
||||
|
||||
def _normalize_get_collection_args(args, kwargs):
|
||||
|
||||
+134
-31
@@ -654,10 +654,19 @@ def cmd_repair(args):
|
||||
import shutil
|
||||
from .backends.chroma import ChromaBackend
|
||||
from .migrate import confirm_destructive_action, contains_palace_database
|
||||
from .repair import TruncationDetected, check_extraction_safety
|
||||
from .repair import (
|
||||
RebuildCollectionError,
|
||||
TruncationDetected,
|
||||
_close_chroma_handles,
|
||||
_extract_drawers,
|
||||
_rebuild_collection_via_temp,
|
||||
check_extraction_safety,
|
||||
)
|
||||
|
||||
config = MempalaceConfig()
|
||||
collection_name = config.collection_name
|
||||
palace_path = os.path.abspath(
|
||||
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
|
||||
os.path.expanduser(args.palace) if args.palace else config.palace_path
|
||||
)
|
||||
|
||||
if getattr(args, "mode", "legacy") == "max-seq-id":
|
||||
@@ -673,6 +682,57 @@ def cmd_repair(args):
|
||||
)
|
||||
return
|
||||
|
||||
if getattr(args, "mode", "legacy") == "from-sqlite":
|
||||
from .migrate import confirm_destructive_action
|
||||
from .repair import RebuildPartialError, rebuild_from_sqlite
|
||||
|
||||
source_path = getattr(args, "source", None)
|
||||
source_path = (
|
||||
os.path.abspath(os.path.expanduser(source_path)) if source_path else palace_path
|
||||
)
|
||||
archive_existing = getattr(args, "archive_existing", False)
|
||||
|
||||
# Gate any path that touches the user's existing palace dir
|
||||
# behind confirm_destructive_action. The legacy mode already
|
||||
# gates; from-sqlite needs the same protection because:
|
||||
# (a) --archive-existing renames the existing palace,
|
||||
# (b) --source PATH writes into --palace dir which the user
|
||||
# may not realize is also a palace.
|
||||
# No prompt when source != dest AND dest does not exist (pure
|
||||
# extract-into-fresh-dir case is non-destructive to existing
|
||||
# palaces).
|
||||
is_destructive_to_dest = source_path == palace_path or os.path.exists(palace_path)
|
||||
if is_destructive_to_dest and not confirm_destructive_action(
|
||||
"Rebuild from SQLite", palace_path, assume_yes=getattr(args, "yes", False)
|
||||
):
|
||||
return
|
||||
|
||||
try:
|
||||
counts = rebuild_from_sqlite(
|
||||
source_palace=source_path,
|
||||
dest_palace=palace_path,
|
||||
archive_existing_dest=archive_existing,
|
||||
)
|
||||
except RebuildPartialError as exc:
|
||||
# The error itself was already printed by rebuild_from_sqlite
|
||||
# with recovery instructions; surface a non-zero exit so
|
||||
# scripts and CI gates see the failure.
|
||||
print(
|
||||
"\n Rebuild partial — see message above. "
|
||||
f"Failed in collection: {exc.failed_collection}"
|
||||
)
|
||||
sys.exit(1)
|
||||
# An empty counts dict is rebuild_from_sqlite's documented signal
|
||||
# for a validation refusal (missing source, existing dest,
|
||||
# in-place without --archive-existing). The library already
|
||||
# printed an actionable message; exit non-zero so unattended
|
||||
# scripts/CI distinguish "invalid inputs" from a successful
|
||||
# rebuild that legitimately found zero rows (which still returns
|
||||
# a populated dict with 0-valued counts).
|
||||
if not counts:
|
||||
sys.exit(1)
|
||||
return
|
||||
|
||||
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||
|
||||
if not os.path.isdir(palace_path):
|
||||
@@ -691,7 +751,7 @@ def cmd_repair(args):
|
||||
|
||||
# Try to read existing drawers
|
||||
try:
|
||||
col = backend.get_collection(palace_path, "mempalace_drawers")
|
||||
col = backend.get_collection(palace_path, collection_name)
|
||||
total = col.count()
|
||||
print(f" Drawers found: {total}")
|
||||
except Exception as e:
|
||||
@@ -711,18 +771,7 @@ def cmd_repair(args):
|
||||
# Extract all drawers in batches
|
||||
print("\n Extracting drawers...")
|
||||
batch_size = 5000
|
||||
all_ids = []
|
||||
all_docs = []
|
||||
all_metas = []
|
||||
offset = 0
|
||||
while offset < total:
|
||||
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
|
||||
if not batch["ids"]:
|
||||
break
|
||||
all_ids.extend(batch["ids"])
|
||||
all_docs.extend(batch["documents"])
|
||||
all_metas.extend(batch["metadatas"])
|
||||
offset += len(batch["ids"])
|
||||
all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
|
||||
print(f" Extracted {len(all_ids)} drawers")
|
||||
|
||||
# ── #1208 guard ──────────────────────────────────────────────────
|
||||
@@ -737,12 +786,12 @@ def cmd_repair(args):
|
||||
palace_path,
|
||||
len(all_ids),
|
||||
confirm_truncation_ok=getattr(args, "confirm_truncation_ok", False),
|
||||
collection_name=collection_name,
|
||||
)
|
||||
except TruncationDetected as e:
|
||||
print(e.message)
|
||||
return
|
||||
|
||||
# Backup and rebuild
|
||||
palace_path = os.path.normpath(palace_path)
|
||||
backup_path = palace_path + ".backup"
|
||||
if os.path.exists(backup_path):
|
||||
@@ -756,18 +805,34 @@ def cmd_repair(args):
|
||||
print(f" Backing up to {backup_path}...")
|
||||
shutil.copytree(palace_path, backup_path)
|
||||
|
||||
print(" Rebuilding collection...")
|
||||
backend.delete_collection(palace_path, "mempalace_drawers")
|
||||
new_col = backend.create_collection(palace_path, "mempalace_drawers")
|
||||
|
||||
filed = 0
|
||||
for i in range(0, len(all_ids), batch_size):
|
||||
batch_ids = all_ids[i : i + batch_size]
|
||||
batch_docs = all_docs[i : i + batch_size]
|
||||
batch_metas = all_metas[i : i + batch_size]
|
||||
new_col.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
|
||||
filed += len(batch_ids)
|
||||
print(f" Re-filed {filed}/{len(all_ids)} drawers...")
|
||||
try:
|
||||
filed = _rebuild_collection_via_temp(
|
||||
backend,
|
||||
palace_path,
|
||||
all_ids,
|
||||
all_docs,
|
||||
all_metas,
|
||||
batch_size,
|
||||
collection_name=collection_name,
|
||||
progress=print,
|
||||
)
|
||||
except RebuildCollectionError as e:
|
||||
print(f" Repair failed: {e}")
|
||||
if getattr(e, "live_replaced", False):
|
||||
print(" Live collection was already replaced; restoring from backup...")
|
||||
try:
|
||||
_close_chroma_handles(palace_path, backend=backend)
|
||||
if os.path.exists(palace_path):
|
||||
shutil.rmtree(palace_path)
|
||||
shutil.copytree(backup_path, palace_path)
|
||||
print(f" Restore complete from backup: {backup_path}")
|
||||
except Exception as restore_error:
|
||||
print(f" Automatic restore failed: {restore_error}")
|
||||
print(" Manual recovery required:")
|
||||
print(f" 1. Remove or rename the broken directory: {palace_path}")
|
||||
print(f" 2. Restore the backup directory to: {palace_path}")
|
||||
print(f" Backup location: {backup_path}")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"\n Repair complete. {filed} drawers rebuilt.")
|
||||
print(f" Backup saved at {backup_path}")
|
||||
@@ -935,7 +1000,25 @@ def cmd_compress(args):
|
||||
print(" (dry run -- nothing stored)")
|
||||
|
||||
|
||||
def _reconfigure_stdio_utf8_on_windows():
|
||||
"""Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI.
|
||||
|
||||
Thin wrapper around the shared helper in ``mempalace._stdio``. The CLI
|
||||
overrides stdout/stderr to ``replace`` because ``mempalace search``
|
||||
prints verbatim drawer text that may carry surrogate halves
|
||||
round-tripped from filenames -- ``strict`` would crash mid-print and
|
||||
lose the rest of the search result block. stdin keeps the default
|
||||
``surrogateescape`` so a redirected non-UTF-8 file does not kill the
|
||||
read on the first bad byte.
|
||||
"""
|
||||
from ._stdio import reconfigure_stdio_utf8_on_windows
|
||||
|
||||
reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
|
||||
|
||||
|
||||
def main():
|
||||
_reconfigure_stdio_utf8_on_windows()
|
||||
|
||||
version_label = f"MemPalace {__version__}"
|
||||
parser = argparse.ArgumentParser(
|
||||
description="MemPalace — Give your AI a memory. No API key required.",
|
||||
@@ -1195,11 +1278,31 @@ def main():
|
||||
)
|
||||
p_repair.add_argument(
|
||||
"--mode",
|
||||
choices=["legacy", "max-seq-id"],
|
||||
choices=["legacy", "max-seq-id", "from-sqlite"],
|
||||
default="legacy",
|
||||
help=(
|
||||
"legacy: full-palace rebuild (default). "
|
||||
"max-seq-id: un-poison max_seq_id rows corrupted by the legacy 0.6.x shim."
|
||||
"legacy: full-palace rebuild via the chromadb client (default). "
|
||||
"max-seq-id: un-poison max_seq_id rows corrupted by the legacy 0.6.x shim. "
|
||||
"from-sqlite: rebuild by reading rows directly from chroma.sqlite3, "
|
||||
"bypassing the chromadb client. Use when legacy mode bails because the "
|
||||
"chromadb client cannot open the collection."
|
||||
),
|
||||
)
|
||||
p_repair.add_argument(
|
||||
"--source",
|
||||
default=None,
|
||||
help=(
|
||||
"Source palace path for --mode from-sqlite (defaults to --palace). "
|
||||
"Use when extracting from an archived corrupt palace into a new location."
|
||||
),
|
||||
)
|
||||
p_repair.add_argument(
|
||||
"--archive-existing",
|
||||
action="store_true",
|
||||
help=(
|
||||
"For --mode from-sqlite when --source equals --palace: rename the "
|
||||
"existing palace to <palace>.pre-rebuild-<timestamp> before "
|
||||
"rebuilding so the corrupt copy is preserved."
|
||||
),
|
||||
)
|
||||
p_repair.add_argument(
|
||||
|
||||
+31
-11
@@ -40,6 +40,7 @@ import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from datetime import datetime
|
||||
@@ -101,6 +102,14 @@ class LLMConfig:
|
||||
self.endpoint = (endpoint or os.environ.get("LLM_ENDPOINT", "")).rstrip("/")
|
||||
self.key = key or os.environ.get("LLM_KEY", "")
|
||||
self.model = model or os.environ.get("LLM_MODEL", "")
|
||||
if self.endpoint:
|
||||
# Privacy-by-architecture: reject file:// and other non-HTTP schemes
|
||||
# so a misconfigured endpoint cannot exfiltrate local files.
|
||||
scheme = urllib.parse.urlparse(self.endpoint).scheme.lower()
|
||||
if scheme not in ("http", "https"):
|
||||
raise ValueError(
|
||||
f"LLM_ENDPOINT must use http:// or https:// (got scheme {scheme!r})"
|
||||
)
|
||||
|
||||
def missing(self) -> list:
|
||||
missing = []
|
||||
@@ -221,17 +230,28 @@ def regenerate_closets(
|
||||
print("No drawers in palace.")
|
||||
return {"processed": 0}
|
||||
|
||||
all_data = drawers_col.get(limit=total, include=["documents", "metadatas"])
|
||||
by_source = {}
|
||||
for doc_id, doc, meta in zip(all_data["ids"], all_data["documents"], all_data["metadatas"]):
|
||||
source = meta.get("source_file", "unknown")
|
||||
w = meta.get("wing", "")
|
||||
if wing and w != wing:
|
||||
continue
|
||||
if source not in by_source:
|
||||
by_source[source] = {"drawer_ids": [], "content": [], "meta": meta}
|
||||
by_source[source]["drawer_ids"].append(doc_id)
|
||||
by_source[source]["content"].append(doc)
|
||||
# Paginate the fetch — a single get(limit=total, ...) blows through
|
||||
# SQLite's SQLITE_MAX_VARIABLE_NUMBER (32766) on large palaces and
|
||||
# crashes inside chromadb (see #802, #850, #1073).
|
||||
by_source: dict = {}
|
||||
batch_size = 5000
|
||||
offset = 0
|
||||
while offset < total:
|
||||
batch = drawers_col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
|
||||
ids = batch["ids"]
|
||||
if not ids:
|
||||
break
|
||||
for doc_id, doc, meta in zip(ids, batch["documents"], batch["metadatas"]):
|
||||
meta = meta or {}
|
||||
source = meta.get("source_file", "unknown")
|
||||
w = meta.get("wing", "")
|
||||
if wing and w != wing:
|
||||
continue
|
||||
if source not in by_source:
|
||||
by_source[source] = {"drawer_ids": [], "content": [], "meta": meta}
|
||||
by_source[source]["drawer_ids"].append(doc_id)
|
||||
by_source[source]["content"].append(doc)
|
||||
offset += len(ids)
|
||||
|
||||
sources = list(by_source.keys())
|
||||
if sample > 0:
|
||||
|
||||
@@ -7,6 +7,7 @@ Priority: env vars > config file (~/.mempalace/config.json) > defaults
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@@ -81,6 +82,38 @@ def sanitize_kg_value(value: str, field_name: str = "value") -> str:
|
||||
return value
|
||||
|
||||
|
||||
# ISO-8601 date validator for knowledge-graph temporal parameters
|
||||
# (as_of, valid_from, valid_to, ended). Parameterized queries already
|
||||
# prevent SQL injection, but unvalidated date strings silently miss
|
||||
# every row — callers cannot distinguish "no fact at this time" from
|
||||
# "your date format was unrecognized." Require full YYYY-MM-DD: KG
|
||||
# queries compare TEXT dates lexicographically, so partials like "2026"
|
||||
# would re-introduce silent empty results (e.g. "2026-01-01" <= "2026"
|
||||
# is False), defeating the purpose of validation.
|
||||
_ISO_DATE_RE = re.compile(r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$")
|
||||
|
||||
|
||||
def sanitize_iso_date(value, field_name: str = "date"):
|
||||
"""Validate an ISO-8601 date string, accepting None or empty as-is.
|
||||
|
||||
Accepts only ``YYYY-MM-DD``. Raises ValueError on any other
|
||||
non-empty input so the MCP layer can surface a clear error to the
|
||||
caller instead of silently returning empty results. Partial dates
|
||||
(``YYYY``, ``YYYY-MM``) are rejected because KG queries compare
|
||||
TEXT dates lexicographically and would silently exclude valid facts.
|
||||
"""
|
||||
if value is None or value == "":
|
||||
return value
|
||||
if not isinstance(value, str):
|
||||
raise ValueError(f"{field_name} must be a string")
|
||||
value = value.strip()
|
||||
if not _ISO_DATE_RE.match(value):
|
||||
raise ValueError(
|
||||
f"{field_name}={value!r} is not a valid ISO-8601 date " f"(expected YYYY-MM-DD)"
|
||||
)
|
||||
return value
|
||||
|
||||
|
||||
def sanitize_content(value: str, max_length: int = 100_000) -> str:
|
||||
"""Validate drawer/diary content length."""
|
||||
if not isinstance(value, str) or not value.strip():
|
||||
@@ -95,6 +128,13 @@ def sanitize_content(value: str, max_length: int = 100_000) -> str:
|
||||
DEFAULT_PALACE_PATH = os.path.expanduser("~/.mempalace/palace")
|
||||
DEFAULT_COLLECTION_NAME = "mempalace_drawers"
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def get_configured_collection_name() -> str:
|
||||
"""Return the configured drawer collection name without repeated config-file reads."""
|
||||
return MempalaceConfig().collection_name
|
||||
|
||||
|
||||
DEFAULT_TOPIC_WINGS = [
|
||||
"emotions",
|
||||
"consciousness",
|
||||
|
||||
@@ -11,6 +11,7 @@ Same palace as project mining. Different ingest strategy.
|
||||
import os
|
||||
import sys
|
||||
import hashlib
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
@@ -24,6 +25,8 @@ from .palace import (
|
||||
mine_lock,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("mempalace_mcp")
|
||||
|
||||
|
||||
# Cached hall keywords — avoids re-reading config per drawer
|
||||
_HALL_KEYWORDS_CACHE = None
|
||||
@@ -331,7 +334,7 @@ def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extr
|
||||
try:
|
||||
collection.delete(where={"source_file": source_file})
|
||||
except Exception:
|
||||
pass
|
||||
logger.debug("Stale-drawer purge failed for %s", source_file, exc_info=True)
|
||||
|
||||
# Batch chunks into bounded upserts so large transcripts keep most of
|
||||
# the embedding speedup without one huge Chroma/SQLite request. Keep
|
||||
|
||||
+1
-1
@@ -89,7 +89,7 @@ def dedup_source_group(col, drawer_ids, threshold=DEFAULT_THRESHOLD, dry_run=Tru
|
||||
kept = []
|
||||
to_delete = []
|
||||
|
||||
for did, doc, meta in items:
|
||||
for did, doc, _meta in items:
|
||||
if not doc or len(doc) < 20:
|
||||
to_delete.append(did)
|
||||
continue
|
||||
|
||||
@@ -873,7 +873,7 @@ class Dialect:
|
||||
|
||||
for date_key in sorted(by_date.keys()):
|
||||
lines.append(f"=MOMENTS[{date_key}]=")
|
||||
for z, fnum in by_date[date_key]:
|
||||
for z, _fnum in by_date[date_key]:
|
||||
entities = []
|
||||
for p in z.get("people", []):
|
||||
code = self.encode_entity(p)
|
||||
|
||||
@@ -16,6 +16,7 @@ Usage:
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
@@ -320,11 +321,35 @@ class EntityRegistry:
|
||||
self._path.parent.chmod(0o700)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
self._path.write_text(json.dumps(self._data, indent=2), encoding="utf-8")
|
||||
# Atomic write: serialize to a sibling temp file in the same dir
|
||||
# (so os.replace stays on one filesystem), fsync, then rename over
|
||||
# the target. A crash mid-write leaves the previous registry intact
|
||||
# instead of a half-written file or an empty file from the truncate.
|
||||
payload = json.dumps(self._data, indent=2)
|
||||
tmp_path = self._path.with_name(self._path.name + ".tmp")
|
||||
with open(tmp_path, "w", encoding="utf-8") as f:
|
||||
f.write(payload)
|
||||
f.flush()
|
||||
os.fsync(f.fileno())
|
||||
try:
|
||||
self._path.chmod(0o600)
|
||||
tmp_path.chmod(0o600)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
os.replace(tmp_path, self._path)
|
||||
# On ext4 (and similar) the rename's durability across power loss
|
||||
# requires an additional fsync on the parent directory. Without it,
|
||||
# the kernel can ack the rename and a crash reverts to the state
|
||||
# where the temp file is present and the target is at the old version.
|
||||
try:
|
||||
dir_fd = os.open(str(self._path.parent), os.O_RDONLY)
|
||||
try:
|
||||
os.fsync(dir_fd)
|
||||
finally:
|
||||
os.close(dir_fd)
|
||||
except OSError:
|
||||
# Windows and some special filesystems reject directory fds — they
|
||||
# have different durability semantics on rename anyway.
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def _empty() -> dict:
|
||||
|
||||
@@ -27,6 +27,7 @@ Usage:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
@@ -35,6 +36,8 @@ from datetime import datetime, timezone
|
||||
# ~/.mempalace/known_entities.json on every check_text call.
|
||||
from .miner import _load_known_entities_raw
|
||||
|
||||
logger = logging.getLogger("mempalace_mcp")
|
||||
|
||||
|
||||
# Narrow detection patterns — parse "X is Y's Z" and "X's Z is Y".
|
||||
# Names are captured greedily as word sequences (letters + optional
|
||||
@@ -214,6 +217,7 @@ def _check_kg_contradictions(text: str, palace_path: str) -> list:
|
||||
try:
|
||||
facts = kg.query_entity(subject, direction="outgoing")
|
||||
except Exception:
|
||||
logger.debug("KG lookup failed for subject %r", subject, exc_info=True)
|
||||
continue
|
||||
if not facts:
|
||||
continue
|
||||
@@ -303,11 +307,27 @@ def _edit_distance(s1: str, s2: str) -> int:
|
||||
return prev[-1]
|
||||
|
||||
|
||||
def _reconfigure_stdio_utf8_on_windows():
|
||||
"""Decode --stdin payload as UTF-8 on Windows.
|
||||
|
||||
Thin wrapper around the shared helper in ``mempalace._stdio``. Mirrors
|
||||
the primary CLI policy: stdout/stderr use ``replace`` because
|
||||
extracted fact text can include surrogate halves round-tripped from
|
||||
filenames -- ``strict`` would raise UnicodeEncodeError mid-print.
|
||||
stdin keeps the default ``surrogateescape``.
|
||||
"""
|
||||
from ._stdio import reconfigure_stdio_utf8_on_windows
|
||||
|
||||
reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
_reconfigure_stdio_utf8_on_windows()
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Check text against known facts in the MemPalace palace.",
|
||||
epilog="Exits 0 when no issues found, 1 when one or more issues detected.",
|
||||
|
||||
@@ -16,6 +16,23 @@ from pathlib import Path
|
||||
|
||||
SAVE_INTERVAL = 15
|
||||
STATE_DIR = Path.home() / ".mempalace" / "hook_state"
|
||||
PALACE_ROOT = Path.home() / ".mempalace"
|
||||
|
||||
|
||||
def _palace_root_exists() -> bool:
|
||||
"""User-removable kill-switch.
|
||||
|
||||
If ~/.mempalace/ does not exist, the user has explicitly cleared it.
|
||||
All hook side effects (logging, state dir creation, mining, ingestion)
|
||||
must respect this and short-circuit BEFORE touching disk — including
|
||||
before logging the short-circuit itself.
|
||||
|
||||
Uses ``is_dir()`` rather than ``exists()`` so a stray regular file at
|
||||
``~/.mempalace`` (or a broken symlink) is treated as absent — otherwise
|
||||
the kill-switch would be bypassed and ``STATE_DIR.mkdir()`` would later
|
||||
crash on ``NotADirectoryError``.
|
||||
"""
|
||||
return PALACE_ROOT.is_dir()
|
||||
|
||||
|
||||
def _mempalace_python() -> str:
|
||||
@@ -142,6 +159,8 @@ _state_dir_initialized = False
|
||||
|
||||
def _log(message: str):
|
||||
"""Append to hook state log file."""
|
||||
if not _palace_root_exists():
|
||||
return # User removed the palace; do not recreate by logging
|
||||
global _state_dir_initialized
|
||||
try:
|
||||
if not _state_dir_initialized:
|
||||
@@ -550,6 +569,9 @@ def _wing_from_transcript_path(transcript_path: str) -> str:
|
||||
|
||||
def hook_stop(data: dict, harness: str):
|
||||
"""Stop hook: block every N messages for auto-save."""
|
||||
if not _palace_root_exists():
|
||||
_output({})
|
||||
return
|
||||
parsed = _parse_harness_input(data, harness)
|
||||
session_id = parsed["session_id"]
|
||||
stop_hook_active = parsed["stop_hook_active"]
|
||||
@@ -659,6 +681,9 @@ def hook_stop(data: dict, harness: str):
|
||||
|
||||
def hook_session_start(data: dict, harness: str):
|
||||
"""Session start hook: initialize session tracking state."""
|
||||
if not _palace_root_exists():
|
||||
_output({})
|
||||
return
|
||||
parsed = _parse_harness_input(data, harness)
|
||||
session_id = parsed["session_id"]
|
||||
|
||||
@@ -673,6 +698,9 @@ def hook_session_start(data: dict, harness: str):
|
||||
|
||||
def hook_precompact(data: dict, harness: str):
|
||||
"""Precompact hook: mine transcript synchronously, then allow compaction."""
|
||||
if not _palace_root_exists():
|
||||
_output({})
|
||||
return
|
||||
parsed = _parse_harness_input(data, harness)
|
||||
session_id = parsed["session_id"]
|
||||
transcript_path = parsed["transcript_path"]
|
||||
|
||||
@@ -171,6 +171,15 @@ class KnowledgeGraph:
|
||||
add_triple("Max", "does", "swimming", valid_from="2025-01-01")
|
||||
add_triple("Alice", "worried_about", "Max injury", valid_from="2026-01", valid_to="2026-02")
|
||||
"""
|
||||
# Reject inverted intervals: a triple with valid_to < valid_from
|
||||
# would never satisfy `valid_from <= as_of AND valid_to >= as_of`,
|
||||
# so it would be invisible to every query — silently corrupt.
|
||||
if valid_from is not None and valid_to is not None and valid_to < valid_from:
|
||||
raise ValueError(
|
||||
f"valid_to={valid_to!r} is before valid_from={valid_from!r}; "
|
||||
"an inverted interval would be invisible to every KG query"
|
||||
)
|
||||
|
||||
sub_id = self._entity_id(subject)
|
||||
obj_id = self._entity_id(obj)
|
||||
pred = predicate.lower().replace(" ", "_")
|
||||
|
||||
+6
-2
@@ -124,6 +124,8 @@ class Layer1:
|
||||
# Score each drawer: prefer high importance, recent filing
|
||||
scored = []
|
||||
for doc, meta in zip(docs, metas):
|
||||
meta = meta or {}
|
||||
doc = doc or ""
|
||||
importance = 3
|
||||
# Try multiple metadata keys that might carry weight info
|
||||
for key in ("importance", "emotional_weight", "weight"):
|
||||
@@ -155,7 +157,7 @@ class Layer1:
|
||||
lines.append(room_line)
|
||||
total_len += len(room_line)
|
||||
|
||||
for imp, meta, doc in entries:
|
||||
for _imp, meta, doc in entries:
|
||||
source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""
|
||||
|
||||
# Truncate doc to keep L1 compact
|
||||
@@ -222,6 +224,8 @@ class Layer2:
|
||||
|
||||
lines = [f"## L2 — ON-DEMAND ({len(docs)} drawers)"]
|
||||
for doc, meta in zip(docs[:n_results], metas[:n_results]):
|
||||
meta = meta or {}
|
||||
doc = doc or ""
|
||||
room_name = meta.get("room", "?")
|
||||
source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""
|
||||
snippet = doc.strip().replace("\n", " ")
|
||||
@@ -283,7 +287,7 @@ class Layer3:
|
||||
for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists), 1):
|
||||
meta = meta or {}
|
||||
doc = doc or ""
|
||||
similarity = round(1 - dist, 3)
|
||||
similarity = round(max(0.0, 1 - dist), 3)
|
||||
wing_name = meta.get("wing", "?")
|
||||
room_name = meta.get("room", "?")
|
||||
source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""
|
||||
|
||||
+276
-99
@@ -46,6 +46,8 @@ import argparse # noqa: E402 (deferred until after stdio protection above)
|
||||
import json # noqa: E402
|
||||
import logging # noqa: E402
|
||||
import hashlib # noqa: E402
|
||||
import sqlite3 # noqa: E402
|
||||
import threading # noqa: E402
|
||||
import time # noqa: E402
|
||||
from datetime import date, datetime # noqa: E402
|
||||
from pathlib import Path # noqa: E402
|
||||
@@ -55,6 +57,7 @@ from .config import ( # noqa: E402
|
||||
sanitize_kg_value,
|
||||
sanitize_name,
|
||||
sanitize_content,
|
||||
sanitize_iso_date,
|
||||
)
|
||||
from .version import __version__ # noqa: E402
|
||||
from chromadb.errors import NotFoundError as _ChromaNotFoundError # noqa: E402
|
||||
@@ -78,7 +81,7 @@ from .palace_graph import ( # noqa: E402
|
||||
follow_tunnels,
|
||||
)
|
||||
|
||||
from .knowledge_graph import KnowledgeGraph # noqa: E402
|
||||
from .knowledge_graph import KnowledgeGraph, DEFAULT_KG_PATH # noqa: E402
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(message)s", stream=sys.stderr)
|
||||
logger = logging.getLogger("mempalace_mcp")
|
||||
@@ -103,12 +106,61 @@ if _args.palace:
|
||||
os.environ["MEMPALACE_PALACE_PATH"] = os.path.abspath(_args.palace)
|
||||
|
||||
_config = MempalaceConfig()
|
||||
# Only override KG path when --palace is explicitly provided; otherwise use
|
||||
# KnowledgeGraph's default (~/.mempalace/knowledge_graph.sqlite3).
|
||||
if _args.palace:
|
||||
_kg = KnowledgeGraph(db_path=os.path.join(_config.palace_path, "knowledge_graph.sqlite3"))
|
||||
else:
|
||||
_kg = KnowledgeGraph()
|
||||
|
||||
_kg_by_path: dict[str, KnowledgeGraph] = {}
|
||||
_kg_cache_lock = threading.Lock()
|
||||
_palace_flag_given: bool = bool(_args.palace)
|
||||
|
||||
|
||||
def _resolve_kg_path() -> str:
|
||||
if _palace_flag_given:
|
||||
return os.path.join(_config.palace_path, "knowledge_graph.sqlite3")
|
||||
return DEFAULT_KG_PATH
|
||||
|
||||
|
||||
def _get_kg() -> KnowledgeGraph:
|
||||
path = os.path.abspath(_resolve_kg_path())
|
||||
kg = _kg_by_path.get(path)
|
||||
if kg is not None:
|
||||
return kg
|
||||
with _kg_cache_lock:
|
||||
kg = _kg_by_path.get(path)
|
||||
if kg is None:
|
||||
kg = KnowledgeGraph(db_path=path)
|
||||
_kg_by_path[path] = kg
|
||||
return kg
|
||||
|
||||
|
||||
def _call_kg(op):
|
||||
"""Run ``op(kg)`` against the cached KG with one-shot retry on close.
|
||||
|
||||
Race we're guarding against: a handler grabs ``kg = _get_kg()`` and is
|
||||
about to call ``kg.add_triple(...)`` when ``tool_reconnect`` fires on
|
||||
another thread, drains ``_kg_by_path``, and closes the underlying
|
||||
sqlite3.Connection. The handler's call then raises
|
||||
``sqlite3.ProgrammingError: Cannot operate on a closed database`` and
|
||||
bubbles up as a -32000 to the MCP client even though the user just
|
||||
asked for a reconnect.
|
||||
|
||||
Catch that single class of error, evict the stale entry from the
|
||||
cache (only if it still points at the closed instance — another
|
||||
thread may have already replaced it), and try once more with a fresh
|
||||
KG. Beyond one retry give up: a second close means we're losing a
|
||||
sustained race we won't win in this loop, and a hung loop is worse
|
||||
than a clear failure surface.
|
||||
"""
|
||||
for attempt in range(2):
|
||||
kg = _get_kg()
|
||||
try:
|
||||
return op(kg)
|
||||
except sqlite3.ProgrammingError:
|
||||
if attempt == 0:
|
||||
path = os.path.abspath(_resolve_kg_path())
|
||||
with _kg_cache_lock:
|
||||
if _kg_by_path.get(path) is kg:
|
||||
_kg_by_path.pop(path, None)
|
||||
continue
|
||||
raise
|
||||
|
||||
|
||||
_client_cache = None
|
||||
@@ -141,7 +193,7 @@ def _refresh_vector_disabled_flag() -> None:
|
||||
"""
|
||||
global _vector_disabled, _vector_disabled_reason, _vector_capacity_status
|
||||
try:
|
||||
info = hnsw_capacity_status(_config.palace_path, "mempalace_drawers")
|
||||
info = hnsw_capacity_status(_config.palace_path, _config.collection_name)
|
||||
except Exception:
|
||||
logger.debug("HNSW capacity probe raised", exc_info=True)
|
||||
return
|
||||
@@ -274,68 +326,94 @@ def _get_client():
|
||||
|
||||
|
||||
def _get_collection(create=False):
|
||||
"""Return the ChromaDB collection, caching the client between calls."""
|
||||
global _collection_cache, _metadata_cache, _metadata_cache_time
|
||||
try:
|
||||
client = _get_client()
|
||||
# ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
|
||||
# collection but not the EF *instance/configuration*. So a reader or
|
||||
# writer that omits ``embedding_function=`` silently gets chromadb's
|
||||
# built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
|
||||
# one we spoof in ``mempalace.embedding`` (both report ``"default"``,
|
||||
# the identity check passes), but the *provider list* is chromadb's
|
||||
# default rather than the user's resolved device. On bleeding-edge
|
||||
# interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
|
||||
# that default provider selection can SIGSEGV the host process on
|
||||
# first ``col.add()``. The miner / Stop hook ingest path avoids this
|
||||
# because it routes through ``ChromaBackend.get_collection``, which
|
||||
# resolves the EF via ``ChromaBackend._resolve_embedding_function``;
|
||||
# the MCP server bypassed that abstraction. Resolve the EF inside the
|
||||
# branches that actually open a collection so warm-cache reads stay
|
||||
# zero-cost. Reuse the backend helper so the two call sites can't
|
||||
# drift on logging or fallback semantics.
|
||||
if create:
|
||||
ef = ChromaBackend._resolve_embedding_function()
|
||||
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
|
||||
# hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
|
||||
# HNSW insert path, which has a race in repairConnectionsForUpdate /
|
||||
# addPoint (see issues #974, #965). Set via metadata on fresh
|
||||
# collections and re-applied via _pin_hnsw_threads() for legacy
|
||||
# palaces whose collections were created before this fix (the
|
||||
# runtime config does not persist cross-process in chromadb 1.5.x,
|
||||
# so the retrofit runs every time _get_collection opens a cache).
|
||||
#
|
||||
# ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
|
||||
# is called with metadata that differs from what's stored. The split
|
||||
# below skips the metadata-comparison codepath for existing
|
||||
# collections, mirroring the backend-layer fix from #1262.
|
||||
try:
|
||||
"""Return the ChromaDB collection, caching the client between calls.
|
||||
|
||||
On failure, log the exception and retry once after clearing the client
|
||||
and collection caches. Tools were silently returning ``None`` when a
|
||||
cached client/collection went stale — typically after the chromadb
|
||||
rust bindings invalidated a handle following an out-of-band write —
|
||||
leaving the LLM with no diagnostic and no recovery path. The retry
|
||||
forces ``_get_client()`` to rebuild from scratch (which re-runs
|
||||
``quarantine_stale_hnsw`` per #1322), so the second attempt heals the
|
||||
common stale-handle / stale-HNSW case automatically.
|
||||
"""
|
||||
global _client_cache, _collection_cache, _metadata_cache, _metadata_cache_time
|
||||
for attempt in range(2):
|
||||
try:
|
||||
client = _get_client()
|
||||
# ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
|
||||
# collection but not the EF *instance/configuration*. So a reader or
|
||||
# writer that omits ``embedding_function=`` silently gets chromadb's
|
||||
# built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
|
||||
# one we spoof in ``mempalace.embedding`` (both report ``"default"``,
|
||||
# the identity check passes), but the *provider list* is chromadb's
|
||||
# default rather than the user's resolved device. On bleeding-edge
|
||||
# interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
|
||||
# that default provider selection can SIGSEGV the host process on
|
||||
# first ``col.add()``. The miner / Stop hook ingest path avoids this
|
||||
# because it routes through ``ChromaBackend.get_collection``, which
|
||||
# resolves the EF via ``ChromaBackend._resolve_embedding_function``;
|
||||
# the MCP server bypassed that abstraction. Resolve the EF inside the
|
||||
# branches that actually open a collection so warm-cache reads stay
|
||||
# zero-cost. Reuse the backend helper so the two call sites can't
|
||||
# drift on logging or fallback semantics.
|
||||
if create:
|
||||
ef = ChromaBackend._resolve_embedding_function()
|
||||
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
|
||||
# hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
|
||||
# HNSW insert path, which has a race in repairConnectionsForUpdate /
|
||||
# addPoint (see issues #974, #965). Set via metadata on fresh
|
||||
# collections and re-applied via _pin_hnsw_threads() for legacy
|
||||
# palaces whose collections were created before this fix (the
|
||||
# runtime config does not persist cross-process in chromadb 1.5.x,
|
||||
# so the retrofit runs every time _get_collection opens a cache).
|
||||
#
|
||||
# ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
|
||||
# is called with metadata that differs from what's stored. The split
|
||||
# below skips the metadata-comparison codepath for existing
|
||||
# collections, mirroring the backend-layer fix from #1262.
|
||||
try:
|
||||
raw = client.get_collection(_config.collection_name, **ef_kwargs)
|
||||
except _ChromaNotFoundError:
|
||||
raw = client.create_collection(
|
||||
_config.collection_name,
|
||||
metadata={
|
||||
"hnsw:space": "cosine",
|
||||
"hnsw:num_threads": 1,
|
||||
**_HNSW_BLOAT_GUARD,
|
||||
},
|
||||
**ef_kwargs,
|
||||
)
|
||||
_pin_hnsw_threads(raw)
|
||||
_collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
elif _collection_cache is None:
|
||||
ef = ChromaBackend._resolve_embedding_function()
|
||||
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
|
||||
raw = client.get_collection(_config.collection_name, **ef_kwargs)
|
||||
except _ChromaNotFoundError:
|
||||
raw = client.create_collection(
|
||||
_config.collection_name,
|
||||
metadata={
|
||||
"hnsw:space": "cosine",
|
||||
"hnsw:num_threads": 1,
|
||||
**_HNSW_BLOAT_GUARD,
|
||||
},
|
||||
**ef_kwargs,
|
||||
)
|
||||
_pin_hnsw_threads(raw)
|
||||
_collection_cache = ChromaCollection(raw)
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
elif _collection_cache is None:
|
||||
ef = ChromaBackend._resolve_embedding_function()
|
||||
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
|
||||
raw = client.get_collection(_config.collection_name, **ef_kwargs)
|
||||
_pin_hnsw_threads(raw)
|
||||
_collection_cache = ChromaCollection(raw)
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
return _collection_cache
|
||||
except Exception:
|
||||
return None
|
||||
_pin_hnsw_threads(raw)
|
||||
_collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
return _collection_cache
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"_get_collection attempt %d/2 failed (palace=%s, create=%s)",
|
||||
attempt + 1,
|
||||
_config.palace_path,
|
||||
create,
|
||||
)
|
||||
if attempt == 0:
|
||||
# Reset all caches so the next attempt forces _get_client()
|
||||
# to rebuild the chromadb client from scratch — that path
|
||||
# re-runs quarantine_stale_hnsw (#1322) and reopens the
|
||||
# collection cleanly, healing the common stale-handle case.
|
||||
_client_cache = None
|
||||
_collection_cache = None
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
return None
|
||||
|
||||
|
||||
def _no_palace():
|
||||
@@ -412,6 +490,7 @@ def _tool_status_via_sqlite() -> dict:
|
||||
db_path = os.path.join(_config.palace_path, "chroma.sqlite3")
|
||||
if not os.path.isfile(db_path):
|
||||
return _no_palace()
|
||||
collection_name = _config.collection_name
|
||||
|
||||
wings: dict = {}
|
||||
rooms: dict = {}
|
||||
@@ -425,8 +504,9 @@ def _tool_status_via_sqlite() -> dict:
|
||||
FROM embeddings e
|
||||
JOIN segments s ON e.segment_id = s.id
|
||||
JOIN collections c ON s.collection = c.id
|
||||
WHERE c.name = 'mempalace_drawers'
|
||||
"""
|
||||
WHERE c.name = ?
|
||||
""",
|
||||
(collection_name,),
|
||||
).fetchone()
|
||||
total = int(row[0]) if row and row[0] is not None else 0
|
||||
for key, target in (("wing", wings), ("room", rooms)):
|
||||
@@ -437,12 +517,12 @@ def _tool_status_via_sqlite() -> dict:
|
||||
JOIN embeddings e ON em.id = e.id
|
||||
JOIN segments s ON e.segment_id = s.id
|
||||
JOIN collections c ON s.collection = c.id
|
||||
WHERE c.name = 'mempalace_drawers'
|
||||
WHERE c.name = ?
|
||||
AND em.key = ?
|
||||
AND em.string_value IS NOT NULL
|
||||
GROUP BY em.string_value
|
||||
""",
|
||||
(key,),
|
||||
(collection_name, key),
|
||||
):
|
||||
target[value] = count
|
||||
finally:
|
||||
@@ -642,6 +722,7 @@ def tool_search(
|
||||
n_results=limit,
|
||||
max_distance=dist,
|
||||
vector_disabled=_vector_disabled,
|
||||
collection_name=_config.collection_name,
|
||||
)
|
||||
if _vector_disabled:
|
||||
result["vector_disabled"] = True
|
||||
@@ -688,10 +769,12 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
|
||||
if results["ids"] and results["ids"][0]:
|
||||
for i, drawer_id in enumerate(results["ids"][0]):
|
||||
dist = results["distances"][0][i]
|
||||
similarity = round(1 - dist, 3)
|
||||
similarity = round(max(0.0, 1 - dist), 3)
|
||||
if similarity >= threshold:
|
||||
meta = results["metadatas"][0][i]
|
||||
doc = results["documents"][0][i]
|
||||
# Chroma 1.5.x can return None for partially-flushed rows;
|
||||
# coerce to empty sentinels so downstream .get() is safe.
|
||||
meta = results["metadatas"][0][i] or {}
|
||||
doc = results["documents"][0][i] or ""
|
||||
duplicates.append(
|
||||
{
|
||||
"id": drawer_id,
|
||||
@@ -842,11 +925,11 @@ def tool_add_drawer(
|
||||
|
||||
# Idempotency: if the deterministic ID already exists, return success as a no-op.
|
||||
try:
|
||||
existing = col.get(ids=[drawer_id])
|
||||
if existing and existing["ids"]:
|
||||
existing = col.get(ids=[drawer_id], include=[])
|
||||
if existing.ids:
|
||||
return {"success": True, "reason": "already_exists", "drawer_id": drawer_id}
|
||||
except Exception:
|
||||
pass
|
||||
logger.debug("Idempotency pre-check failed for %s", drawer_id, exc_info=True)
|
||||
|
||||
try:
|
||||
col.upsert(
|
||||
@@ -863,6 +946,12 @@ def tool_add_drawer(
|
||||
}
|
||||
],
|
||||
)
|
||||
inserted = col.get(ids=[drawer_id], include=[])
|
||||
if not inserted.ids:
|
||||
raise RuntimeError(
|
||||
"Drawer write was acknowledged but the new ID is not readable. "
|
||||
"The palace index may be stale; run reconnect or repair."
|
||||
)
|
||||
_metadata_cache = None
|
||||
logger.info(f"Filed drawer: {drawer_id} → {wing}/{room}")
|
||||
return {"success": True, "drawer_id": drawer_id, "wing": wing, "room": room}
|
||||
@@ -961,6 +1050,13 @@ def tool_list_drawers(wing: str = None, room: str = None, limit: int = 20, offse
|
||||
kwargs["where"] = where
|
||||
result = col.get(**kwargs)
|
||||
|
||||
# Compute total matching drawers for pagination.
|
||||
if where:
|
||||
total_result = col.get(where=where, include=[])
|
||||
total = len(total_result["ids"])
|
||||
else:
|
||||
total = col.count()
|
||||
|
||||
drawers = []
|
||||
for i, did in enumerate(result["ids"]):
|
||||
meta = result["metadatas"][i]
|
||||
@@ -975,6 +1071,7 @@ def tool_list_drawers(wing: str = None, room: str = None, limit: int = 20, offse
|
||||
)
|
||||
return {
|
||||
"drawers": drawers,
|
||||
"total": total,
|
||||
"count": len(drawers),
|
||||
"offset": offset,
|
||||
"limit": limit,
|
||||
@@ -1059,11 +1156,12 @@ def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
|
||||
"""Query the knowledge graph for an entity's relationships."""
|
||||
try:
|
||||
entity = sanitize_kg_value(entity, "entity")
|
||||
as_of = sanitize_iso_date(as_of, "as_of")
|
||||
except ValueError as e:
|
||||
return {"error": str(e)}
|
||||
if direction not in ("outgoing", "incoming", "both"):
|
||||
return {"error": "direction must be 'outgoing', 'incoming', or 'both'"}
|
||||
results = _kg.query_entity(entity, as_of=as_of, direction=direction)
|
||||
results = _call_kg(lambda kg: kg.query_entity(entity, as_of=as_of, direction=direction))
|
||||
return {"entity": entity, "as_of": as_of, "facts": results, "count": len(results)}
|
||||
|
||||
|
||||
@@ -1092,6 +1190,7 @@ def tool_kg_add(
|
||||
subject = sanitize_kg_value(subject, "subject")
|
||||
predicate = sanitize_name(predicate, "predicate")
|
||||
object = sanitize_kg_value(object, "object")
|
||||
valid_from = sanitize_iso_date(valid_from, "valid_from")
|
||||
except ValueError as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
@@ -1108,15 +1207,17 @@ def tool_kg_add(
|
||||
"source_drawer_id": source_drawer_id,
|
||||
},
|
||||
)
|
||||
triple_id = _kg.add_triple(
|
||||
subject,
|
||||
predicate,
|
||||
object,
|
||||
valid_from=valid_from,
|
||||
valid_to=valid_to,
|
||||
source_closet=source_closet,
|
||||
source_file=source_file,
|
||||
source_drawer_id=source_drawer_id,
|
||||
triple_id = _call_kg(
|
||||
lambda kg: kg.add_triple(
|
||||
subject,
|
||||
predicate,
|
||||
object,
|
||||
valid_from=valid_from,
|
||||
valid_to=valid_to,
|
||||
source_closet=source_closet,
|
||||
source_file=source_file,
|
||||
source_drawer_id=source_drawer_id,
|
||||
)
|
||||
)
|
||||
return {"success": True, "triple_id": triple_id, "fact": f"{subject} → {predicate} → {object}"}
|
||||
|
||||
@@ -1135,6 +1236,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
|
||||
subject = sanitize_kg_value(subject, "subject")
|
||||
predicate = sanitize_name(predicate, "predicate")
|
||||
object = sanitize_kg_value(object, "object")
|
||||
ended = sanitize_iso_date(ended, "ended")
|
||||
except ValueError as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
resolved_ended = ended or date.today().isoformat()
|
||||
@@ -1147,7 +1249,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
|
||||
"ended": resolved_ended,
|
||||
},
|
||||
)
|
||||
_kg.invalidate(subject, predicate, object, ended=resolved_ended)
|
||||
_call_kg(lambda kg: kg.invalidate(subject, predicate, object, ended=resolved_ended))
|
||||
return {
|
||||
"success": True,
|
||||
"fact": f"{subject} → {predicate} → {object}",
|
||||
@@ -1162,13 +1264,13 @@ def tool_kg_timeline(entity: str = None):
|
||||
entity = sanitize_kg_value(entity, "entity")
|
||||
except ValueError as e:
|
||||
return {"error": str(e)}
|
||||
results = _kg.timeline(entity)
|
||||
results = _call_kg(lambda kg: kg.timeline(entity))
|
||||
return {"entity": entity or "all", "timeline": results, "count": len(results)}
|
||||
|
||||
|
||||
def tool_kg_stats():
|
||||
"""Knowledge graph overview: entities, triples, relationship types."""
|
||||
return _kg.stats()
|
||||
return _call_kg(lambda kg: kg.stats())
|
||||
|
||||
|
||||
# ==================== AGENT DIARY ====================
|
||||
@@ -1351,7 +1453,7 @@ def tool_hook_settings(silent_save: bool = None, desktop_toast: bool = None):
|
||||
try:
|
||||
config = MempalaceConfig()
|
||||
except Exception:
|
||||
pass
|
||||
logger.debug("Could not re-read config after update", exc_info=True)
|
||||
|
||||
result = {
|
||||
"success": True,
|
||||
@@ -1400,10 +1502,11 @@ def tool_memories_filed_away():
|
||||
|
||||
|
||||
def tool_reconnect():
|
||||
"""Force the MCP server to drop the cached ChromaDB collection and reconnect.
|
||||
"""Force the MCP server to drop cached ChromaDB + KnowledgeGraph state.
|
||||
|
||||
Use after external scripts or CLI commands modify the palace database
|
||||
directly, which can leave the in-memory HNSW index stale.
|
||||
or replace ``knowledge_graph.sqlite3`` directly, which can leave the
|
||||
in-memory HNSW index stale or pin a closed-on-disk SQLite connection.
|
||||
"""
|
||||
global \
|
||||
_client_cache, \
|
||||
@@ -1412,6 +1515,30 @@ def tool_reconnect():
|
||||
_palace_db_mtime, \
|
||||
_vector_disabled, \
|
||||
_vector_disabled_reason
|
||||
from . import palace as palace_module
|
||||
|
||||
close_errors = []
|
||||
try:
|
||||
palace_module._DEFAULT_BACKEND.close_palace(_config.palace_path)
|
||||
except Exception as exc:
|
||||
logger.debug("Failed to close shared palace backend during reconnect", exc_info=True)
|
||||
close_errors.append(f"backend close_palace failed: {exc}")
|
||||
try:
|
||||
from chromadb.api.client import SharedSystemClient
|
||||
|
||||
clear_system_cache = getattr(SharedSystemClient, "clear_system_cache", None)
|
||||
if callable(clear_system_cache):
|
||||
clear_system_cache()
|
||||
else:
|
||||
logger.debug(
|
||||
"SharedSystemClient.clear_system_cache is unavailable; skipping shared Chroma cache clear during reconnect"
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.debug(
|
||||
"Failed to clear Chroma shared system cache during reconnect",
|
||||
exc_info=True,
|
||||
)
|
||||
close_errors.append(f"shared Chroma cache clear failed: {exc}")
|
||||
_client_cache = None
|
||||
_collection_cache = None
|
||||
_palace_db_inode = 0
|
||||
@@ -1421,15 +1548,36 @@ def tool_reconnect():
|
||||
# still applies after the reconnect.
|
||||
_vector_disabled = False
|
||||
_vector_disabled_reason = ""
|
||||
# Drain the per-path KnowledgeGraph cache so a replaced sqlite file is
|
||||
# reopened on the next tool call rather than served from a stale handle.
|
||||
with _kg_cache_lock:
|
||||
for kg in _kg_by_path.values():
|
||||
try:
|
||||
kg.close()
|
||||
except Exception:
|
||||
pass
|
||||
_kg_by_path.clear()
|
||||
try:
|
||||
col = _get_collection()
|
||||
if col is None:
|
||||
return {
|
||||
result = {
|
||||
"success": False,
|
||||
"message": "No palace found after reconnect",
|
||||
"drawers": 0,
|
||||
"vector_disabled": _vector_disabled,
|
||||
}
|
||||
if close_errors:
|
||||
result["error"] = "; ".join(close_errors)
|
||||
return result
|
||||
if close_errors:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "Reconnect reopened the palace but failed to fully reset cached handles",
|
||||
"drawers": col.count(),
|
||||
"vector_disabled": _vector_disabled,
|
||||
"vector_disabled_reason": _vector_disabled_reason,
|
||||
"error": "; ".join(close_errors),
|
||||
}
|
||||
return {
|
||||
"success": True,
|
||||
"message": "Reconnected to palace",
|
||||
@@ -1750,7 +1898,7 @@ TOOLS = {
|
||||
"handler": tool_get_drawer,
|
||||
},
|
||||
"mempalace_list_drawers": {
|
||||
"description": "List drawers with pagination. Optional wing/room filter. Returns IDs, wings, rooms, and content previews.",
|
||||
"description": "List drawers with pagination. Optional wing/room filter. Returns IDs, wings, rooms, content previews, and total matching count for pagination.",
|
||||
"input_schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@@ -1891,6 +2039,12 @@ SUPPORTED_PROTOCOL_VERSIONS = [
|
||||
|
||||
|
||||
def handle_request(request):
|
||||
if not isinstance(request, dict):
|
||||
return {
|
||||
"jsonrpc": "2.0",
|
||||
"id": None,
|
||||
"error": {"code": -32600, "message": "Invalid Request"},
|
||||
}
|
||||
method = request.get("method") or ""
|
||||
params = request.get("params") or {}
|
||||
req_id = request.get("id")
|
||||
@@ -1928,6 +2082,15 @@ def handle_request(request):
|
||||
},
|
||||
}
|
||||
elif method == "tools/call":
|
||||
if not isinstance(params, dict) or "name" not in params:
|
||||
return {
|
||||
"jsonrpc": "2.0",
|
||||
"id": req_id,
|
||||
"error": {
|
||||
"code": -32602,
|
||||
"message": "Invalid params: 'name' is required for tools/call",
|
||||
},
|
||||
}
|
||||
tool_name = params.get("name")
|
||||
tool_args = params.get("arguments") or {}
|
||||
if tool_name not in TOOLS:
|
||||
@@ -1976,7 +2139,11 @@ def handle_request(request):
|
||||
return {
|
||||
"jsonrpc": "2.0",
|
||||
"id": req_id,
|
||||
"result": {"content": [{"type": "text", "text": json.dumps(result, indent=2)}]},
|
||||
"result": {
|
||||
"content": [
|
||||
{"type": "text", "text": json.dumps(result, indent=2, ensure_ascii=False)}
|
||||
]
|
||||
},
|
||||
}
|
||||
except Exception:
|
||||
logger.exception(f"Tool error in {tool_name}")
|
||||
@@ -2011,6 +2178,16 @@ def _restore_stdout():
|
||||
|
||||
def main():
|
||||
_restore_stdout()
|
||||
# Force UTF-8 on stdio. MCP JSON-RPC is UTF-8, but Python on Windows
|
||||
# defaults stdin/stdout to the system codepage (e.g. cp1251), which
|
||||
# corrupts non-ASCII payloads and surfaces as generic -32000 errors on
|
||||
# Cyrillic/CJK content. See PEP 540.
|
||||
for stream in (sys.stdin, sys.stdout):
|
||||
if hasattr(stream, "reconfigure"):
|
||||
try:
|
||||
stream.reconfigure(encoding="utf-8", errors="replace")
|
||||
except (AttributeError, OSError):
|
||||
pass
|
||||
logger.info("MemPalace MCP Server starting...")
|
||||
# Pre-flight: probe HNSW capacity before any tool call so the warning
|
||||
# is visible at startup rather than on first use (#1222). Pure
|
||||
@@ -2027,7 +2204,7 @@ def main():
|
||||
request = json.loads(line)
|
||||
response = handle_request(request)
|
||||
if response is not None:
|
||||
sys.stdout.write(json.dumps(response) + "\n")
|
||||
sys.stdout.write(json.dumps(response, ensure_ascii=False) + "\n")
|
||||
sys.stdout.flush()
|
||||
except KeyboardInterrupt:
|
||||
break
|
||||
|
||||
+67
-6
@@ -22,6 +22,7 @@ import errno
|
||||
import os
|
||||
import shutil
|
||||
import sqlite3
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
@@ -155,6 +156,55 @@ def confirm_destructive_action(
|
||||
return True
|
||||
|
||||
|
||||
def _result_ids(result) -> list:
|
||||
"""Return ids from either the backend typed result or raw Chroma dict."""
|
||||
|
||||
if isinstance(result, dict):
|
||||
return list(result.get("ids") or [])
|
||||
|
||||
return list(getattr(result, "ids", []) or [])
|
||||
|
||||
|
||||
def collection_write_roundtrip_works(col) -> bool:
|
||||
"""Return True only if the collection can upsert, read, and delete.
|
||||
|
||||
Some ChromaDB 0.6.x -> 1.5.x migrated collections remain readable while
|
||||
writes and deletes silently no-op. A plain ``count()`` probe misses that
|
||||
failure mode, so migrate must verify an actual write round-trip before
|
||||
deciding that no rebuild is needed.
|
||||
"""
|
||||
|
||||
probe_id = f"_mempalace_migrate_probe_{uuid.uuid4().hex}"
|
||||
probe_doc = "mempalace migrate write round-trip probe"
|
||||
probe_meta = {
|
||||
"wing": "_mempalace_probe",
|
||||
"room": "_mempalace_probe",
|
||||
"source_file": "mempalace_migrate_probe",
|
||||
"chunk_index": 0,
|
||||
}
|
||||
|
||||
try:
|
||||
col.upsert(
|
||||
ids=[probe_id],
|
||||
documents=[probe_doc],
|
||||
metadatas=[probe_meta],
|
||||
)
|
||||
|
||||
after_upsert = col.get(ids=[probe_id], include=[])
|
||||
if probe_id not in _result_ids(after_upsert):
|
||||
return False
|
||||
|
||||
col.delete(ids=[probe_id])
|
||||
|
||||
after_delete = col.get(ids=[probe_id], include=[])
|
||||
if probe_id in _result_ids(after_delete):
|
||||
return False
|
||||
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
|
||||
"""Migrate a palace to the currently installed ChromaDB version."""
|
||||
from .backends.chroma import ChromaBackend
|
||||
@@ -179,16 +229,27 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
|
||||
print(f" Source: ChromaDB {source_version}")
|
||||
print(f" Target: ChromaDB {target_version}")
|
||||
|
||||
# Try reading with current chromadb first
|
||||
# Try reading and writing with current chromadb first.
|
||||
#
|
||||
# A plain count() is not enough: some 0.6.x -> 1.5.x migrated collections
|
||||
# are readable but silently drop upsert/delete operations. In that state,
|
||||
# migrate must rebuild from SQLite instead of returning "No migration needed."
|
||||
try:
|
||||
col = ChromaBackend().get_collection(palace_path, "mempalace_drawers")
|
||||
count = col.count()
|
||||
print(f"\n Palace is already readable by chromadb {target_version}.")
|
||||
print(f" {count} drawers found. No migration needed.")
|
||||
return True
|
||||
|
||||
if collection_write_roundtrip_works(col):
|
||||
print(f"\n Palace is already readable and writable by chromadb {target_version}.")
|
||||
print(f" {count} drawers found. No migration needed.")
|
||||
return True
|
||||
|
||||
print(
|
||||
f"\n Palace is readable by chromadb {target_version}, but write/delete verification failed."
|
||||
)
|
||||
print(" Rebuilding from SQLite to restore native write/delete behavior...")
|
||||
except Exception:
|
||||
print(f"\n Palace is NOT readable by chromadb {target_version}.")
|
||||
print(" Extracting from SQLite directly...")
|
||||
print(f"\n Palace is NOT readable by chromadb {target_version}.")
|
||||
print(" Extracting from SQLite directly...")
|
||||
|
||||
# Extract all drawers via raw SQL
|
||||
drawers = extract_drawers_from_sqlite(db_path)
|
||||
|
||||
+38
-1
@@ -12,6 +12,7 @@ import sys
|
||||
import shlex
|
||||
import hashlib
|
||||
import fnmatch
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
@@ -31,6 +32,8 @@ from .palace import (
|
||||
upsert_closet_lines,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("mempalace_mcp")
|
||||
|
||||
READABLE_EXTENSIONS = {
|
||||
".txt",
|
||||
".md",
|
||||
@@ -63,6 +66,8 @@ SKIP_FILENAMES = {
|
||||
"mempal.yml",
|
||||
".gitignore",
|
||||
"package-lock.json",
|
||||
"pnpm-lock.yaml",
|
||||
"yarn.lock",
|
||||
}
|
||||
|
||||
CHUNK_SIZE = 800 # chars per drawer
|
||||
@@ -70,6 +75,13 @@ CHUNK_OVERLAP = 100 # overlap between chunks
|
||||
MIN_CHUNK_SIZE = 50 # skip tiny chunks
|
||||
DRAWER_UPSERT_BATCH_SIZE = 1000
|
||||
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
|
||||
# A single file producing more chunks than this is almost always a generated
|
||||
# artifact (CSV/JSON dump, lockfile not in SKIP_FILENAMES, etc.). Embedding
|
||||
# thousands of chunks from one file in one batch has triggered ONNX runtime
|
||||
# `bad allocation` errors on Windows (#1296). The cap is conservative: a
|
||||
# 500-chunk file at CHUNK_SIZE=800 is ~400 KB of source, which covers most
|
||||
# legitimate hand-written content while bounding the worst-case batch.
|
||||
MAX_CHUNKS_PER_FILE = 500
|
||||
# Long Claude Code sessions and large transcript exports routinely exceed
|
||||
# 10 MB. The cap exists as a defensive rail against pathological binary
|
||||
# files, not as a limit on legitimate text. Per-drawer size is bounded
|
||||
@@ -822,6 +834,13 @@ def process_file(
|
||||
room = detect_room(filepath, content, rooms, project_path)
|
||||
chunks = chunk_text(content, source_file)
|
||||
|
||||
if len(chunks) > MAX_CHUNKS_PER_FILE:
|
||||
print(
|
||||
f" ! [skip] {filepath.name[:50]:50} produced {len(chunks)} chunks "
|
||||
f"(> {MAX_CHUNKS_PER_FILE}); add to SKIP_FILENAMES or .gitignore"
|
||||
)
|
||||
return 0, room
|
||||
|
||||
if dry_run:
|
||||
print(f" [DRY RUN] {filepath.name} -> room:{room} ({len(chunks)} drawers)")
|
||||
return len(chunks), room
|
||||
@@ -842,7 +861,7 @@ def process_file(
|
||||
try:
|
||||
collection.delete(where={"source_file": source_file})
|
||||
except Exception:
|
||||
pass
|
||||
logger.debug("Stale-drawer purge failed for %s", source_file, exc_info=True)
|
||||
|
||||
# Batch chunks into bounded upserts so the embedding model sees many
|
||||
# chunks per forward pass without building one huge Chroma/SQLite
|
||||
@@ -1164,6 +1183,24 @@ def _mine_impl(
|
||||
"already-filed drawers are\n upserted idempotently and will not duplicate.\n"
|
||||
)
|
||||
sys.exit(130)
|
||||
except Exception as exc:
|
||||
# Without this, an arbitrary exception (ONNX bad_alloc, chromadb HNSW
|
||||
# error, OS fault) propagates and the process exits with no completion
|
||||
# banner — the operator sees only the final progress line and assumes
|
||||
# the mine succeeded (#1296). Print the partial-progress summary the
|
||||
# way we do for KeyboardInterrupt, then re-raise so the original
|
||||
# traceback still surfaces and the exit code is non-zero.
|
||||
print("\n\n Mine aborted by exception.")
|
||||
print(f" files_processed: {files_processed}/{len(files)}")
|
||||
print(f" drawers_filed: {total_drawers}")
|
||||
print(f" last_file: {last_file or '<none>'}")
|
||||
print(f" error: {type(exc).__name__}: {exc}")
|
||||
print(
|
||||
f"\n Re-run `mempalace mine {shlex.quote(project_dir)}` after addressing "
|
||||
"the cause — already-filed\n drawers are upserted idempotently and will "
|
||||
"not duplicate.\n"
|
||||
)
|
||||
raise
|
||||
finally:
|
||||
# Clean up the hooks-side PID lock if it points at us. Stale
|
||||
# entries already pass _pid_alive() == False on POSIX, but
|
||||
|
||||
@@ -118,14 +118,14 @@ def normalize(filepath: str) -> str:
|
||||
try:
|
||||
file_size = os.path.getsize(filepath)
|
||||
except OSError as e:
|
||||
raise IOError(f"Could not read {filepath}: {e}")
|
||||
raise IOError(f"Could not read {filepath}: {e}") from e
|
||||
if file_size > 500 * 1024 * 1024: # 500 MB safety limit
|
||||
raise IOError(f"File too large ({file_size // (1024 * 1024)} MB): {filepath}")
|
||||
try:
|
||||
with open(filepath, "r", encoding="utf-8", errors="replace") as f:
|
||||
content = f.read()
|
||||
except OSError as e:
|
||||
raise IOError(f"Could not read {filepath}: {e}")
|
||||
raise IOError(f"Could not read {filepath}: {e}") from e
|
||||
|
||||
if not content.strip():
|
||||
return content
|
||||
|
||||
+69
-4
@@ -6,11 +6,16 @@ Consolidates collection access patterns used by both miners and the MCP server.
|
||||
|
||||
import contextlib
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
from typing import Optional
|
||||
|
||||
from .backends.chroma import ChromaBackend
|
||||
|
||||
logger = logging.getLogger("mempalace_mcp")
|
||||
|
||||
SKIP_DIRS = {
|
||||
".git",
|
||||
"node_modules",
|
||||
@@ -52,10 +57,14 @@ NORMALIZE_VERSION = 2
|
||||
|
||||
def get_collection(
|
||||
palace_path: str,
|
||||
collection_name: str = "mempalace_drawers",
|
||||
collection_name: Optional[str] = None,
|
||||
create: bool = True,
|
||||
):
|
||||
"""Get the palace collection through the backend layer."""
|
||||
if collection_name is None:
|
||||
from .config import get_configured_collection_name
|
||||
|
||||
collection_name = get_configured_collection_name()
|
||||
return _DEFAULT_BACKEND.get_collection(
|
||||
palace_path,
|
||||
collection_name=collection_name,
|
||||
@@ -228,7 +237,7 @@ def purge_file_closets(closets_col, source_file: str) -> None:
|
||||
try:
|
||||
closets_col.delete(where={"source_file": source_file})
|
||||
except Exception:
|
||||
pass
|
||||
logger.debug("Closet purge failed for %s", source_file, exc_info=True)
|
||||
|
||||
|
||||
def upsert_closet_lines(closets_col, closet_id_base, lines, metadata):
|
||||
@@ -306,7 +315,7 @@ def mine_lock(source_file: str):
|
||||
|
||||
fcntl.flock(lf, fcntl.LOCK_UN)
|
||||
except Exception:
|
||||
pass
|
||||
logger.debug("Mine-lock release failed", exc_info=True)
|
||||
lf.close()
|
||||
|
||||
|
||||
@@ -314,6 +323,47 @@ class MineAlreadyRunning(RuntimeError):
|
||||
"""Raised when another `mempalace mine` already holds the per-palace lock."""
|
||||
|
||||
|
||||
# Per-thread record of palaces this thread already holds the lock for. Used by
|
||||
# `mine_palace_lock` to short-circuit re-entrant acquisition from the same
|
||||
# thread (e.g. miner.mine() acquires the outer lock then calls
|
||||
# ChromaCollection.upsert which now also tries to acquire). Without this guard
|
||||
# the inner call would block on its own outer flock (Linux fcntl locks are per
|
||||
# open file description, so a same-thread second open of the lock file is a
|
||||
# distinct lock and self-deadlocks).
|
||||
#
|
||||
# The holder set is tagged with ``pid`` so that a forked child does NOT
|
||||
# inherit re-entrant credit from its parent: the OS-level flock IS NOT
|
||||
# inherited as a "we hold it" semantically — the child must reacquire — but
|
||||
# Python's ``threading.local`` IS inherited across fork. The pid check
|
||||
# clears stale state so a forked child correctly hits the fcntl path.
|
||||
_palace_lock_holders = threading.local()
|
||||
|
||||
|
||||
def _holder_state():
|
||||
"""Return the per-thread (pid, keys) record, refreshing after fork."""
|
||||
keys = getattr(_palace_lock_holders, "keys", None)
|
||||
pid = getattr(_palace_lock_holders, "pid", None)
|
||||
current_pid = os.getpid()
|
||||
if keys is None or pid != current_pid:
|
||||
keys = set()
|
||||
_palace_lock_holders.keys = keys
|
||||
_palace_lock_holders.pid = current_pid
|
||||
return keys
|
||||
|
||||
|
||||
def _held_by_this_thread(lock_key: str) -> bool:
|
||||
"""Return True if this thread already holds ``mine_palace_lock`` for ``lock_key``."""
|
||||
return lock_key in _holder_state()
|
||||
|
||||
|
||||
def _mark_held(lock_key: str) -> None:
|
||||
_holder_state().add(lock_key)
|
||||
|
||||
|
||||
def _mark_released(lock_key: str) -> None:
|
||||
_holder_state().discard(lock_key)
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def mine_palace_lock(palace_path: str):
|
||||
"""Per-palace non-blocking lock around the full `mine` pipeline.
|
||||
@@ -338,6 +388,12 @@ def mine_palace_lock(palace_path: str):
|
||||
Non-blocking: if another `mine` is already writing to this palace,
|
||||
raise MineAlreadyRunning so the caller can exit cleanly instead of
|
||||
piling up as a waiting worker.
|
||||
|
||||
Re-entrant: if the current thread already holds the lock for the same
|
||||
palace, the context manager passes through without re-acquiring. This
|
||||
lets ChromaCollection write methods (which acquire the lock themselves
|
||||
to protect MCP/direct callers) compose with miner.mine() (which holds
|
||||
the outer lock for the entire mine pipeline) without self-deadlock.
|
||||
"""
|
||||
lock_dir = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
|
||||
os.makedirs(lock_dir, exist_ok=True)
|
||||
@@ -346,6 +402,11 @@ def mine_palace_lock(palace_path: str):
|
||||
palace_key = hashlib.sha256(lock_key_source.encode()).hexdigest()[:16]
|
||||
lock_path = os.path.join(lock_dir, f"mine_palace_{palace_key}.lock")
|
||||
|
||||
if _held_by_this_thread(palace_key):
|
||||
# Same thread already holds the lock for this palace — pass through.
|
||||
yield
|
||||
return
|
||||
|
||||
lf = open(lock_path, "w")
|
||||
acquired = False
|
||||
try:
|
||||
@@ -369,7 +430,11 @@ def mine_palace_lock(palace_path: str):
|
||||
raise MineAlreadyRunning(
|
||||
f"another `mempalace mine` is already running against {resolved}"
|
||||
) from exc
|
||||
yield
|
||||
_mark_held(palace_key)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
_mark_released(palace_key)
|
||||
finally:
|
||||
if acquired:
|
||||
try:
|
||||
|
||||
@@ -575,7 +575,7 @@ def follow_tunnels(wing: str, room: str, col=None, config=None):
|
||||
if did and did in drawer_map:
|
||||
c["drawer_preview"] = drawer_map[did][:300]
|
||||
except Exception:
|
||||
pass
|
||||
logger.debug("Drawer preview hydration failed", exc_info=True)
|
||||
|
||||
return connections
|
||||
|
||||
|
||||
+590
-49
@@ -34,13 +34,58 @@ import os
|
||||
import shutil
|
||||
import sqlite3
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from typing import Iterator, Optional
|
||||
|
||||
from chromadb.errors import NotFoundError as ChromaNotFoundError
|
||||
|
||||
from .backends.chroma import ChromaBackend, hnsw_capacity_status
|
||||
|
||||
|
||||
COLLECTION_NAME = "mempalace_drawers"
|
||||
REPAIR_TEMP_COLLECTION = f"{COLLECTION_NAME}__repair_tmp"
|
||||
|
||||
# The closets collection (AAAK index layer) is intentionally fixed —
|
||||
# closets reference drawer IDs by string and live alongside drawers in the
|
||||
# same palace; renaming the closets collection per-deployment would break
|
||||
# cross-palace AAAK lookups. Drawer collection name comes from config
|
||||
# (see ``_recoverable_collections``).
|
||||
CLOSETS_COLLECTION_NAME = "mempalace_closets"
|
||||
|
||||
|
||||
def _drawers_collection_name() -> str:
|
||||
"""Resolve the drawers collection name from user config, falling back
|
||||
to the module default ``COLLECTION_NAME`` if config is unreadable.
|
||||
|
||||
Recovery flows must honor ``MempalaceConfig().collection_name`` so a
|
||||
user with a non-default drawer collection (e.g. multi-palace setups)
|
||||
rebuilds the right rows. Closets remain fixed — see
|
||||
``CLOSETS_COLLECTION_NAME``.
|
||||
"""
|
||||
try:
|
||||
from .config import MempalaceConfig
|
||||
|
||||
return MempalaceConfig().collection_name or COLLECTION_NAME
|
||||
except Exception:
|
||||
return COLLECTION_NAME
|
||||
|
||||
|
||||
def _recoverable_collections() -> tuple[str, ...]:
|
||||
"""Collections rebuilt by ``rebuild_from_sqlite``, in upsert order.
|
||||
|
||||
Drawers first (bulk data), then closets (AAAK index layer that
|
||||
references drawer IDs by string in their documents — no
|
||||
foreign-key validation, so ordering is informational, not
|
||||
load-bearing).
|
||||
"""
|
||||
return (_drawers_collection_name(), CLOSETS_COLLECTION_NAME)
|
||||
|
||||
|
||||
# Back-compat alias for callers that imported the constant. New code
|
||||
# should call ``_recoverable_collections()`` so config changes are picked
|
||||
# up at call time.
|
||||
RECOVERABLE_COLLECTIONS = (COLLECTION_NAME, CLOSETS_COLLECTION_NAME)
|
||||
|
||||
|
||||
def _get_palace_path():
|
||||
@@ -83,7 +128,111 @@ def _paginate_ids(col, where=None):
|
||||
return ids
|
||||
|
||||
|
||||
def scan_palace(palace_path=None, only_wing=None):
|
||||
def _extract_drawers(col, total: int, batch_size: int):
|
||||
all_ids = []
|
||||
all_docs = []
|
||||
all_metas = []
|
||||
offset = 0
|
||||
while offset < total:
|
||||
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
|
||||
if not batch["ids"]:
|
||||
break
|
||||
all_ids.extend(batch["ids"])
|
||||
all_docs.extend(batch["documents"])
|
||||
all_metas.extend(batch["metadatas"])
|
||||
offset += len(batch["ids"])
|
||||
return all_ids, all_docs, all_metas
|
||||
|
||||
|
||||
def _verify_collection_count(col, expected: int, label: str) -> None:
|
||||
actual = col.count()
|
||||
if actual != expected:
|
||||
raise RuntimeError(f"{label} count mismatch: expected {expected}, got {actual}")
|
||||
|
||||
|
||||
def _is_missing_collection_value_error(exc: ValueError) -> bool:
|
||||
message = str(exc).lower()
|
||||
return "does not exist" in message or "not found" in message
|
||||
|
||||
|
||||
def _delete_collection_if_exists(backend, palace_path: str, collection_name: str) -> None:
|
||||
try:
|
||||
backend.delete_collection(palace_path, collection_name)
|
||||
except ValueError as exc:
|
||||
if _is_missing_collection_value_error(exc):
|
||||
return
|
||||
raise
|
||||
except (FileNotFoundError, ChromaNotFoundError):
|
||||
return
|
||||
|
||||
|
||||
class RebuildCollectionError(RuntimeError):
|
||||
"""Raised when temp rebuild fails, carrying whether the live swap happened."""
|
||||
|
||||
def __init__(self, message: str, *, live_replaced: bool):
|
||||
super().__init__(message)
|
||||
self.live_replaced = live_replaced
|
||||
|
||||
|
||||
def _rebuild_collection_via_temp(
|
||||
backend,
|
||||
palace_path: str,
|
||||
all_ids,
|
||||
all_docs,
|
||||
all_metas,
|
||||
batch_size: int,
|
||||
collection_name: Optional[str] = None,
|
||||
progress=print,
|
||||
) -> int:
|
||||
expected = len(all_ids)
|
||||
collection_name = collection_name or _drawers_collection_name()
|
||||
temp_name = f"{collection_name}__repair_tmp"
|
||||
live_replaced = False
|
||||
|
||||
try:
|
||||
_delete_collection_if_exists(backend, palace_path, temp_name)
|
||||
|
||||
progress(f" Building temporary collection: {temp_name}")
|
||||
temp_col = backend.create_collection(palace_path, temp_name)
|
||||
staged = 0
|
||||
for i in range(0, expected, batch_size):
|
||||
batch_ids = all_ids[i : i + batch_size]
|
||||
batch_docs = all_docs[i : i + batch_size]
|
||||
batch_metas = all_metas[i : i + batch_size]
|
||||
temp_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
|
||||
staged += len(batch_ids)
|
||||
progress(f" Staged {staged}/{expected} drawers...")
|
||||
_verify_collection_count(temp_col, expected, "temporary rebuild")
|
||||
|
||||
progress(" Rebuilding live collection...")
|
||||
backend.delete_collection(palace_path, collection_name)
|
||||
live_replaced = True
|
||||
new_col = backend.create_collection(palace_path, collection_name)
|
||||
|
||||
rebuilt = 0
|
||||
for i in range(0, expected, batch_size):
|
||||
batch_ids = all_ids[i : i + batch_size]
|
||||
batch_docs = all_docs[i : i + batch_size]
|
||||
batch_metas = all_metas[i : i + batch_size]
|
||||
new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
|
||||
rebuilt += len(batch_ids)
|
||||
progress(f" Re-filed {rebuilt}/{expected} drawers...")
|
||||
_verify_collection_count(new_col, expected, "rebuilt live collection")
|
||||
|
||||
try:
|
||||
_delete_collection_if_exists(backend, palace_path, temp_name)
|
||||
except Exception:
|
||||
pass
|
||||
return rebuilt
|
||||
except Exception as exc:
|
||||
try:
|
||||
_delete_collection_if_exists(backend, palace_path, temp_name)
|
||||
except Exception:
|
||||
pass
|
||||
raise RebuildCollectionError(str(exc), live_replaced=live_replaced) from exc
|
||||
|
||||
|
||||
def scan_palace(palace_path=None, only_wing=None, collection_name: Optional[str] = None):
|
||||
"""Scan the palace for corrupt/unfetchable IDs.
|
||||
|
||||
Probes in batches of 100, falls back to per-ID on failure.
|
||||
@@ -92,14 +241,15 @@ def scan_palace(palace_path=None, only_wing=None):
|
||||
Returns (good_set, bad_set).
|
||||
"""
|
||||
palace_path = palace_path or _get_palace_path()
|
||||
collection_name = collection_name or _drawers_collection_name()
|
||||
print(f"\n Palace: {palace_path}")
|
||||
print(" Loading...")
|
||||
|
||||
col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
|
||||
col = ChromaBackend().get_collection(palace_path, collection_name)
|
||||
|
||||
where = {"wing": only_wing} if only_wing else None
|
||||
total = col.count()
|
||||
print(f" Collection: {COLLECTION_NAME}, total: {total:,}")
|
||||
print(f" Collection: {collection_name}, total: {total:,}")
|
||||
if only_wing:
|
||||
print(f" Scanning wing: {only_wing}")
|
||||
|
||||
@@ -160,9 +310,10 @@ def scan_palace(palace_path=None, only_wing=None):
|
||||
return good_set, bad_set
|
||||
|
||||
|
||||
def prune_corrupt(palace_path=None, confirm=False):
|
||||
def prune_corrupt(palace_path=None, confirm=False, collection_name: Optional[str] = None):
|
||||
"""Delete corrupt IDs listed in corrupt_ids.txt."""
|
||||
palace_path = palace_path or _get_palace_path()
|
||||
collection_name = collection_name or _drawers_collection_name()
|
||||
bad_file = os.path.join(palace_path, "corrupt_ids.txt")
|
||||
|
||||
if not os.path.exists(bad_file):
|
||||
@@ -178,7 +329,7 @@ def prune_corrupt(palace_path=None, confirm=False):
|
||||
print(" Re-run with --confirm to actually delete.")
|
||||
return
|
||||
|
||||
col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
|
||||
col = ChromaBackend().get_collection(palace_path, collection_name)
|
||||
before = col.count()
|
||||
print(f" Collection size before: {before:,}")
|
||||
|
||||
@@ -232,7 +383,10 @@ class TruncationDetected(Exception):
|
||||
|
||||
|
||||
def check_extraction_safety(
|
||||
palace_path: str, extracted: int, confirm_truncation_ok: bool = False
|
||||
palace_path: str,
|
||||
extracted: int,
|
||||
confirm_truncation_ok: bool = False,
|
||||
collection_name: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Cross-check that ``extracted`` matches the SQLite ground truth.
|
||||
|
||||
@@ -254,7 +408,8 @@ def check_extraction_safety(
|
||||
if confirm_truncation_ok:
|
||||
return
|
||||
|
||||
sqlite_count = sqlite_drawer_count(palace_path)
|
||||
collection_name = collection_name or _drawers_collection_name()
|
||||
sqlite_count = sqlite_drawer_count(palace_path, collection_name)
|
||||
cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT
|
||||
|
||||
if sqlite_count is not None and sqlite_count > extracted:
|
||||
@@ -290,7 +445,7 @@ def check_extraction_safety(
|
||||
raise TruncationDetected(message, sqlite_count, extracted)
|
||||
|
||||
|
||||
def sqlite_drawer_count(palace_path: str) -> "int | None":
|
||||
def sqlite_drawer_count(palace_path: str, collection_name: Optional[str] = None) -> "int | None":
|
||||
"""Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection.
|
||||
|
||||
Used as an independent ground-truth check against the chromadb
|
||||
@@ -302,6 +457,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
|
||||
drift, missing tables, locked file). Callers treat ``None`` as
|
||||
"unknown" and fall back to the cap-detection check.
|
||||
"""
|
||||
collection_name = collection_name or _drawers_collection_name()
|
||||
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||
if not os.path.exists(sqlite_path):
|
||||
return None
|
||||
@@ -318,7 +474,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
|
||||
JOIN collections c ON s.collection = c.id
|
||||
WHERE c.name = ?
|
||||
""",
|
||||
(COLLECTION_NAME,),
|
||||
(collection_name,),
|
||||
).fetchone()
|
||||
return int(row[0]) if row and row[0] is not None else None
|
||||
finally:
|
||||
@@ -330,7 +486,11 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
|
||||
return None
|
||||
|
||||
|
||||
def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
||||
def rebuild_index(
|
||||
palace_path=None,
|
||||
confirm_truncation_ok: bool = False,
|
||||
collection_name: Optional[str] = None,
|
||||
):
|
||||
"""Rebuild the HNSW index from scratch.
|
||||
|
||||
1. Extract all drawers via ChromaDB get()
|
||||
@@ -345,6 +505,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
||||
(typically only a concern for palaces sized at exactly 10 000 rows).
|
||||
"""
|
||||
palace_path = palace_path or _get_palace_path()
|
||||
collection_name = collection_name or _drawers_collection_name()
|
||||
|
||||
if not os.path.isdir(palace_path):
|
||||
print(f"\n No palace found at {palace_path}")
|
||||
@@ -357,7 +518,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
||||
|
||||
backend = ChromaBackend()
|
||||
try:
|
||||
col = backend.get_collection(palace_path, COLLECTION_NAME)
|
||||
col = backend.get_collection(palace_path, collection_name)
|
||||
total = col.count()
|
||||
except Exception as e:
|
||||
print(f" Error reading palace: {e}")
|
||||
@@ -373,18 +534,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
||||
# Extract all drawers in batches
|
||||
print("\n Extracting drawers...")
|
||||
batch_size = 5000
|
||||
all_ids = []
|
||||
all_docs = []
|
||||
all_metas = []
|
||||
offset = 0
|
||||
while offset < total:
|
||||
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
|
||||
if not batch["ids"]:
|
||||
break
|
||||
all_ids.extend(batch["ids"])
|
||||
all_docs.extend(batch["documents"])
|
||||
all_metas.extend(batch["metadatas"])
|
||||
offset += len(batch["ids"])
|
||||
all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
|
||||
print(f" Extracted {len(all_ids)} drawers")
|
||||
|
||||
# ── #1208 guard ──────────────────────────────────────────────────
|
||||
@@ -392,7 +542,12 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
||||
# short of the SQLite ground truth (or when extraction == chromadb
|
||||
# default get() cap and the SQLite check couldn't run).
|
||||
try:
|
||||
check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok)
|
||||
check_extraction_safety(
|
||||
palace_path,
|
||||
len(all_ids),
|
||||
confirm_truncation_ok,
|
||||
collection_name=collection_name,
|
||||
)
|
||||
except TruncationDetected as e:
|
||||
print(e.message)
|
||||
return
|
||||
@@ -407,28 +562,34 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
||||
|
||||
# Rebuild with correct HNSW settings
|
||||
print(" Rebuilding collection with hnsw:space=cosine...")
|
||||
backend.delete_collection(palace_path, COLLECTION_NAME)
|
||||
new_col = backend.create_collection(palace_path, COLLECTION_NAME)
|
||||
|
||||
filed = 0
|
||||
try:
|
||||
for i in range(0, len(all_ids), batch_size):
|
||||
batch_ids = all_ids[i : i + batch_size]
|
||||
batch_docs = all_docs[i : i + batch_size]
|
||||
batch_metas = all_metas[i : i + batch_size]
|
||||
new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
|
||||
filed += len(batch_ids)
|
||||
print(f" Re-filed {filed}/{len(all_ids)} drawers...")
|
||||
except Exception as e:
|
||||
filed = _rebuild_collection_via_temp(
|
||||
backend,
|
||||
palace_path,
|
||||
all_ids,
|
||||
all_docs,
|
||||
all_metas,
|
||||
batch_size,
|
||||
collection_name=collection_name,
|
||||
progress=print,
|
||||
)
|
||||
except RebuildCollectionError as e:
|
||||
print(f"\n ERROR during rebuild: {e}")
|
||||
print(f" Only {filed}/{len(all_ids)} drawers were re-filed.")
|
||||
if os.path.exists(backup_path):
|
||||
print(" Rebuild aborted before completion.")
|
||||
if e.live_replaced and os.path.exists(backup_path):
|
||||
print(f" Restoring from backup: {backup_path}")
|
||||
backend.delete_collection(palace_path, COLLECTION_NAME)
|
||||
shutil.copy2(backup_path, sqlite_path)
|
||||
print(" Backup restored. Palace is back to pre-repair state.")
|
||||
else:
|
||||
try:
|
||||
_close_chroma_handles(palace_path, backend=backend)
|
||||
_delete_collection_if_exists(backend, palace_path, collection_name)
|
||||
shutil.copy2(backup_path, sqlite_path)
|
||||
print(" Backup restored. Palace is back to pre-repair state.")
|
||||
except Exception as restore_error:
|
||||
print(f" Backup restore failed: {restore_error}")
|
||||
print(f" Manual restore required from: {backup_path}")
|
||||
elif e.live_replaced:
|
||||
print(" No backup available. Re-mine from source files to recover.")
|
||||
else:
|
||||
print(" Live collection was not replaced; leaving the original palace untouched.")
|
||||
raise
|
||||
|
||||
print(f"\n Repair complete. {filed} drawers rebuilt.")
|
||||
@@ -436,7 +597,380 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
||||
print(f"\n{'=' * 55}\n")
|
||||
|
||||
|
||||
def status(palace_path=None) -> dict:
|
||||
class RebuildPartialError(Exception):
|
||||
"""Raised when ``rebuild_from_sqlite`` fails partway through upserts.
|
||||
|
||||
Carries enough state for the user (or CLI) to recover: the
|
||||
per-collection counts that succeeded, the collection that failed,
|
||||
the dest path holding the partial palace, and the archive path
|
||||
(when an in-place rebuild had moved the original aside). Re-raises
|
||||
the underlying chromadb error as ``__cause__``.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
message: str,
|
||||
*,
|
||||
partial_counts: dict[str, int],
|
||||
failed_collection: str,
|
||||
dest_palace: str,
|
||||
archive_path: Optional[str],
|
||||
):
|
||||
super().__init__(message)
|
||||
self.message = message
|
||||
self.partial_counts = partial_counts
|
||||
self.failed_collection = failed_collection
|
||||
self.dest_palace = dest_palace
|
||||
self.archive_path = archive_path
|
||||
|
||||
|
||||
def _rebuild_one_collection(
|
||||
*,
|
||||
backend: ChromaBackend,
|
||||
source_palace: str,
|
||||
dest_palace: str,
|
||||
collection_name: str,
|
||||
batch_size: int,
|
||||
archive_path: Optional[str],
|
||||
counts_so_far: dict[str, int],
|
||||
) -> int:
|
||||
"""Stream rows for one collection from SQLite and upsert into a
|
||||
freshly-created collection at ``dest_palace``. Returns rows
|
||||
upserted. Raises :class:`RebuildPartialError` (with the underlying
|
||||
chromadb exception as ``__cause__``) on any upsert failure so the
|
||||
caller can stop the loop and print recovery instructions instead of
|
||||
silently shipping a partial palace.
|
||||
"""
|
||||
ids: list[str] = []
|
||||
docs: list[str] = []
|
||||
metas: list[dict] = []
|
||||
upserted = 0
|
||||
col = None
|
||||
|
||||
def _flush() -> int:
|
||||
nonlocal upserted
|
||||
if not ids:
|
||||
return upserted
|
||||
col.upsert(ids=list(ids), documents=list(docs), metadatas=list(metas))
|
||||
upserted += len(ids)
|
||||
print(f" upserted {upserted}")
|
||||
ids.clear()
|
||||
docs.clear()
|
||||
metas.clear()
|
||||
return upserted
|
||||
|
||||
try:
|
||||
# ``create_collection`` lives inside the try so a Chroma-side
|
||||
# "Collection already exists" failure (which can happen when the
|
||||
# process-wide System cache still holds a pre-archive schema) is
|
||||
# reported as a structured ``RebuildPartialError`` carrying
|
||||
# ``archive_path`` — instead of an unstructured exception that
|
||||
# strands the user without recovery instructions.
|
||||
col = backend.create_collection(dest_palace, collection_name)
|
||||
|
||||
for emb_id, doc, meta in extract_via_sqlite(source_palace, collection_name):
|
||||
ids.append(emb_id)
|
||||
docs.append(doc or "")
|
||||
# chromadb 1.5.x rejects None entries in the metadatas list
|
||||
# but accepts empty dicts. Mempalace drawers always carry at
|
||||
# least wing/room, so this branch is defensive — corruption
|
||||
# in embedding_metadata could yield an emb_id with no rows.
|
||||
metas.append(meta if meta else {})
|
||||
if len(ids) >= batch_size:
|
||||
_flush()
|
||||
_flush()
|
||||
except Exception as exc: # noqa: BLE001 — chromadb raises many shapes
|
||||
partial = dict(counts_so_far)
|
||||
partial[collection_name] = upserted
|
||||
msg_parts = [
|
||||
f"Upsert failed in collection {collection_name!r} after {upserted} rows: {exc!r}",
|
||||
f"Partial palace left at: {dest_palace}",
|
||||
]
|
||||
if archive_path is not None:
|
||||
msg_parts.append(f"Original palace archived at: {archive_path}")
|
||||
msg_parts.append(
|
||||
" Recover by removing the partial dest and re-running with "
|
||||
f"--source {archive_path}"
|
||||
)
|
||||
else:
|
||||
msg_parts.append(" Source palace is unchanged. Remove the partial dest and re-run.")
|
||||
message = "\n ".join(msg_parts)
|
||||
print(f"\n ERROR: {message}")
|
||||
raise RebuildPartialError(
|
||||
message,
|
||||
partial_counts=partial,
|
||||
failed_collection=collection_name,
|
||||
dest_palace=dest_palace,
|
||||
archive_path=archive_path,
|
||||
) from exc
|
||||
|
||||
return upserted
|
||||
|
||||
|
||||
def extract_via_sqlite(palace_path: str, collection_name: str) -> Iterator[tuple[str, str, dict]]:
|
||||
"""Yield ``(embedding_id, document, metadata)`` for every row in
|
||||
``collection_name``'s metadata segment by reading ``chroma.sqlite3``
|
||||
directly.
|
||||
|
||||
Bypasses the chromadb client entirely — never opens a
|
||||
``PersistentClient``, never imports hnswlib, never invokes the
|
||||
HNSW segment writer. This is the recovery path for palaces where
|
||||
``Collection.count()`` / ``Collection.get()`` raise ``InternalError``
|
||||
because the compactor cannot apply WAL logs to the HNSW segment
|
||||
(#1308). The drawer rows are still on disk in
|
||||
``embeddings`` + ``embedding_metadata``; the corruption lives in the
|
||||
on-disk index files, not the SQLite tables.
|
||||
|
||||
Resolution rule for chromadb's typed metadata columns: each
|
||||
``embedding_metadata`` row stores its value in exactly one of
|
||||
``string_value`` / ``int_value`` / ``float_value`` / ``bool_value``;
|
||||
we pick the first non-NULL column in that order. Rows where every
|
||||
typed column is NULL are dropped (chromadb never writes that shape).
|
||||
The ``chroma:document`` key is removed from the metadata dict and
|
||||
returned as the document; this matches how chromadb itself stores
|
||||
``add(documents=...)``.
|
||||
|
||||
Silent on missing palace, missing ``chroma.sqlite3``, or unknown
|
||||
collection name — yields nothing. Callers that need to distinguish
|
||||
"empty collection" from "collection not present" should query
|
||||
:func:`sqlite_drawer_count` first.
|
||||
"""
|
||||
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||
if not os.path.isfile(sqlite_path):
|
||||
return
|
||||
|
||||
conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
|
||||
try:
|
||||
seg_row = conn.execute(
|
||||
"""
|
||||
SELECT s.id FROM segments s
|
||||
JOIN collections c ON s.collection = c.id
|
||||
WHERE c.name = ? AND s.scope = 'METADATA'
|
||||
""",
|
||||
(collection_name,),
|
||||
).fetchone()
|
||||
if not seg_row:
|
||||
return
|
||||
segment_id = seg_row[0]
|
||||
|
||||
per_id: dict[str, dict] = defaultdict(dict)
|
||||
order: list[str] = []
|
||||
for emb_id, key, sv, iv, fv, bv in conn.execute(
|
||||
"""
|
||||
SELECT e.embedding_id, em.key, em.string_value, em.int_value,
|
||||
em.float_value, em.bool_value
|
||||
FROM embedding_metadata em
|
||||
JOIN embeddings e ON em.id = e.id
|
||||
WHERE e.segment_id = ?
|
||||
ORDER BY em.id
|
||||
""",
|
||||
(segment_id,),
|
||||
):
|
||||
if emb_id not in per_id:
|
||||
order.append(emb_id)
|
||||
if sv is not None:
|
||||
per_id[emb_id][key] = sv
|
||||
elif iv is not None:
|
||||
per_id[emb_id][key] = iv
|
||||
elif fv is not None:
|
||||
per_id[emb_id][key] = fv
|
||||
elif bv is not None:
|
||||
per_id[emb_id][key] = bool(bv)
|
||||
|
||||
for emb_id in order:
|
||||
kv = per_id[emb_id]
|
||||
doc = kv.pop("chroma:document", "")
|
||||
yield emb_id, doc, kv
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def rebuild_from_sqlite(
|
||||
source_palace: str,
|
||||
dest_palace: str,
|
||||
*,
|
||||
archive_existing_dest: bool = False,
|
||||
batch_size: int = 1000,
|
||||
) -> dict[str, int]:
|
||||
"""Rebuild a palace by reading drawers from ``source_palace``'s
|
||||
``chroma.sqlite3`` and upserting them into a fresh palace at
|
||||
``dest_palace``.
|
||||
|
||||
Recovery path for the #1308 failure mode: the chromadb client raises
|
||||
``InternalError: Failed to apply logs to the hnsw segment writer``
|
||||
on every operation that touches the index (``count``, ``get``,
|
||||
``query``), but the underlying SQLite tables are intact. Both the
|
||||
legacy ``rebuild_index`` and the inline ``cli.cmd_repair`` path call
|
||||
``Collection.count()`` as their first read — exactly the call that
|
||||
fails — so neither can recover this class of corruption. This
|
||||
function bypasses the chromadb read path entirely via
|
||||
:func:`extract_via_sqlite`.
|
||||
|
||||
Re-embeds documents at upsert time using the configured embedding
|
||||
function; the original HNSW vectors are not preserved (they live in
|
||||
the corrupt ``data_level0.bin`` / ``link_lists.bin``, not in
|
||||
SQLite). Acceptable for a corruption-recovery flow because the
|
||||
embedding model is deterministic — same model + same document text
|
||||
yields semantically equivalent search results.
|
||||
|
||||
``archive_existing_dest`` controls behavior when ``dest_palace``
|
||||
already exists:
|
||||
|
||||
* ``False`` (default) — refuse with a clear message. Callers must
|
||||
manually move the existing palace aside first.
|
||||
* ``True`` — rename ``dest_palace`` to
|
||||
``<dest_palace>.pre-rebuild-<timestamp>`` and read from there
|
||||
instead. Used by the in-place CLI flow where ``--source`` defaults
|
||||
to the same path as ``--palace``.
|
||||
|
||||
Returns a ``{collection_name: row_count}`` dict so callers (CLI,
|
||||
tests) can verify the per-collection rebuild count without parsing
|
||||
stdout. A successful rebuild always returns a dict with one key per
|
||||
recoverable collection (values may be ``0`` when a collection is
|
||||
legitimately empty in the source). The empty dict ``{}`` is reserved
|
||||
for validation refusals (missing source DB, refusing to overwrite an
|
||||
existing dest, in-place mode without ``archive_existing_dest``); CLI
|
||||
callers should treat ``{}`` as an error and exit non-zero so CI and
|
||||
scripts can distinguish "invalid inputs" from "successful recovery
|
||||
that found zero rows." Raises :class:`RebuildPartialError` if a
|
||||
chromadb upsert fails partway through; the dest palace is left in
|
||||
place so the user can inspect what landed, and the in-place archive
|
||||
(when applicable) is reported in the error so the user can re-run
|
||||
against it.
|
||||
|
||||
.. warning::
|
||||
|
||||
In-place mode (``source_palace == dest_palace`` with
|
||||
``archive_existing_dest=True``) calls
|
||||
``chromadb.api.client.SharedSystemClient.clear_system_cache()`` to
|
||||
drop chromadb's process-wide System registry — required because
|
||||
an existing cached System built against the original palace will
|
||||
refuse ``create_collection`` after the dir is renamed (chromadb
|
||||
still thinks the collections exist). This invalidates any
|
||||
PersistentClient instances held elsewhere in the same process for
|
||||
*any* palace, not just this one. Do not call this function from
|
||||
inside a long-running mempalace process (MCP server, daemon)
|
||||
while other callers hold live ``PersistentClient`` references —
|
||||
use the CLI in a separate process instead. Cross-palace use
|
||||
(``source != dest``) does not touch the cache.
|
||||
|
||||
Note on metadata fidelity: the resolution rule
|
||||
(``string_value`` → ``int_value`` → ``float_value`` → ``bool_value``)
|
||||
matches the precedent in :mod:`mempalace.migrate`. ChromaDB 0.4.x
|
||||
occasionally wrote booleans as ``int_value=0/1``; those will
|
||||
round-trip as ``int`` rather than ``bool`` after this rebuild. This
|
||||
is a known divergence and matches the existing migrate-path
|
||||
behavior.
|
||||
"""
|
||||
source_palace = os.path.abspath(os.path.expanduser(source_palace))
|
||||
dest_palace = os.path.abspath(os.path.expanduser(dest_palace))
|
||||
|
||||
src_db = os.path.join(source_palace, "chroma.sqlite3")
|
||||
|
||||
in_place = source_palace == dest_palace
|
||||
|
||||
print(f"\n{'=' * 55}")
|
||||
print(" MemPalace Repair — Rebuild from SQLite")
|
||||
print(f"{'=' * 55}\n")
|
||||
print(f" Source: {source_palace}")
|
||||
print(f" Dest: {dest_palace}")
|
||||
|
||||
# Validate source BEFORE any destructive moves. An earlier draft
|
||||
# archived the dest first and surfaced the missing-chroma.sqlite3
|
||||
# error after — leaving the user with a renamed dir to manually undo
|
||||
# when the archive itself was empty. Validate first so a user error
|
||||
# (--source pointing at a non-palace dir) bails cleanly.
|
||||
if in_place:
|
||||
if not archive_existing_dest:
|
||||
print(
|
||||
"\n Source and dest are the same path. Pass "
|
||||
"archive_existing_dest=True (CLI: --archive-existing) to move "
|
||||
"the existing palace aside, or pass a different source_palace= "
|
||||
"(CLI: --source)."
|
||||
)
|
||||
return {}
|
||||
if not os.path.isfile(src_db):
|
||||
print(f"\n Source palace has no chroma.sqlite3 at {src_db}")
|
||||
return {}
|
||||
else:
|
||||
if not os.path.isfile(src_db):
|
||||
print(f"\n Source palace has no chroma.sqlite3 at {src_db}")
|
||||
return {}
|
||||
if os.path.exists(dest_palace):
|
||||
print(
|
||||
f"\n Refusing to rebuild into existing path: {dest_palace}\n"
|
||||
" Move it aside, pass a different dest, or set "
|
||||
"archive_existing_dest=True if rebuilding in place "
|
||||
"(source_palace == dest_palace)."
|
||||
)
|
||||
return {}
|
||||
|
||||
archive_path: Optional[str] = None
|
||||
if in_place:
|
||||
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
archive_path = f"{dest_palace}.pre-rebuild-{ts}"
|
||||
print(f" Archiving {dest_palace} → {archive_path}")
|
||||
shutil.move(dest_palace, archive_path)
|
||||
source_palace = archive_path
|
||||
src_db = os.path.join(source_palace, "chroma.sqlite3")
|
||||
|
||||
# In-place only: drop chromadb's process-wide System registry so
|
||||
# the new client at dest_palace builds a fresh System. Without
|
||||
# this, ``create_collection`` raises "Collection already exists"
|
||||
# because the cached System still holds the pre-rename schema.
|
||||
# Cross-palace mode does not need this and would needlessly
|
||||
# invalidate other callers' clients (see docstring warning).
|
||||
try:
|
||||
from chromadb.api.client import SharedSystemClient
|
||||
|
||||
SharedSystemClient.clear_system_cache()
|
||||
except Exception as exc: # noqa: BLE001
|
||||
print(
|
||||
f" Warning: could not clear chromadb system cache ({exc!r}); "
|
||||
"in-place rebuild may fail with 'Collection already exists'."
|
||||
)
|
||||
|
||||
os.makedirs(dest_palace, exist_ok=True)
|
||||
|
||||
# Backend lifetime is wrapped in try/finally so the dest palace's
|
||||
# PersistentClient handle (opened lazily inside ``create_collection``
|
||||
# / ``get_collection``) is released on every exit path: success,
|
||||
# ``RebuildPartialError``, or any unexpected exception. Without this,
|
||||
# a long-running process that calls ``rebuild_from_sqlite`` would
|
||||
# leak SQLite/HNSW file handles into Chroma's ``SharedSystemClient``
|
||||
# cache, surfacing later as "Collection already exists" on the next
|
||||
# in-place rebuild or as a Windows file-lock failure on cleanup
|
||||
# (cf. #1285's lifecycle hardening for the legacy rebuild path).
|
||||
backend = ChromaBackend()
|
||||
counts: dict[str, int] = {}
|
||||
try:
|
||||
for cname in _recoverable_collections():
|
||||
print(f"\n [{cname}]")
|
||||
upserted = _rebuild_one_collection(
|
||||
backend=backend,
|
||||
source_palace=source_palace,
|
||||
dest_palace=dest_palace,
|
||||
collection_name=cname,
|
||||
batch_size=batch_size,
|
||||
archive_path=archive_path,
|
||||
counts_so_far=counts,
|
||||
)
|
||||
counts[cname] = upserted
|
||||
if upserted == 0:
|
||||
print(f" no rows found for {cname} in source palace")
|
||||
else:
|
||||
print(f" done: {upserted} rows in {cname}")
|
||||
|
||||
print(f"\n Rebuild complete. {sum(counts.values())} total rows.")
|
||||
if archive_path is not None:
|
||||
print(f" Original palace archived at: {archive_path}")
|
||||
print(f"{'=' * 55}\n")
|
||||
return counts
|
||||
finally:
|
||||
backend.close()
|
||||
|
||||
|
||||
def status(palace_path=None, collection_name: Optional[str] = None) -> dict:
|
||||
"""Read-only health check: compare sqlite vs HNSW element counts.
|
||||
|
||||
Catches the #1222 failure mode where chromadb's HNSW segment freezes
|
||||
@@ -454,6 +988,7 @@ def status(palace_path=None) -> dict:
|
||||
``status="unknown"`` when no palace exists at the given path.
|
||||
"""
|
||||
palace_path = palace_path or _get_palace_path()
|
||||
collection_name = collection_name or _drawers_collection_name()
|
||||
print(f"\n{'=' * 55}")
|
||||
print(" MemPalace Repair — Status")
|
||||
print(f"{'=' * 55}\n")
|
||||
@@ -463,8 +998,8 @@ def status(palace_path=None) -> dict:
|
||||
print(" No palace found.\n")
|
||||
return {"status": "unknown", "message": "no palace at path"}
|
||||
|
||||
drawers = hnsw_capacity_status(palace_path, "mempalace_drawers")
|
||||
closets = hnsw_capacity_status(palace_path, "mempalace_closets")
|
||||
drawers = hnsw_capacity_status(palace_path, collection_name)
|
||||
closets = hnsw_capacity_status(palace_path, CLOSETS_COLLECTION_NAME)
|
||||
|
||||
for label, info in (("drawers", drawers), ("closets", closets)):
|
||||
print(f"\n [{label}]")
|
||||
@@ -494,12 +1029,18 @@ def status(palace_path=None) -> dict:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _close_chroma_handles(palace_path: str) -> None:
|
||||
"""Drop ChromaBackend + chromadb singleton caches so OS mmap handles release."""
|
||||
def _close_chroma_handles(palace_path: str, backend: "ChromaBackend | None" = None) -> None:
|
||||
"""Drop ChromaBackend + chromadb singleton caches so OS mmap handles release.
|
||||
|
||||
When ``backend`` is provided, close the live instance so rollback/restore
|
||||
releases the handles it was already using. Otherwise fall back to a
|
||||
transient backend instance for the max-seq-id repair path.
|
||||
"""
|
||||
import gc
|
||||
|
||||
try:
|
||||
ChromaBackend().close_palace(palace_path)
|
||||
closer = backend if backend is not None else ChromaBackend()
|
||||
closer.close_palace(palace_path)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
|
||||
@@ -202,7 +202,7 @@ def detect_rooms_from_files(project_dir: str) -> list:
|
||||
|
||||
SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv", "dist", "build"}
|
||||
|
||||
for root, dirs, filenames in os.walk(project_path):
|
||||
for _root, dirs, filenames in os.walk(project_path):
|
||||
dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
|
||||
for filename in filenames:
|
||||
name_lower = filename.lower().replace("-", "_").replace(" ", "_")
|
||||
|
||||
+80
-25
@@ -245,7 +245,7 @@ def _expand_with_neighbors(drawers_col, matched_doc: str, matched_meta: dict, ra
|
||||
all_meta = drawers_col.get(where={"source_file": src}, include=["metadatas"])
|
||||
total_drawers = len(all_meta.ids) if all_meta.ids else None
|
||||
except Exception:
|
||||
pass
|
||||
logger.debug("total_drawers lookup failed for %s", src, exc_info=True)
|
||||
|
||||
return {
|
||||
"text": combined_text,
|
||||
@@ -297,10 +297,10 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
|
||||
"""
|
||||
try:
|
||||
col = get_collection(palace_path, create=False)
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
print(f"\n No palace found at {palace_path}")
|
||||
print(" Run: mempalace init <dir> then mempalace mine <dir>")
|
||||
raise SearchError(f"No palace found at {palace_path}")
|
||||
raise SearchError(f"No palace found at {palace_path}") from e
|
||||
|
||||
# Alert the user if this palace predates hnsw:space=cosine being set on
|
||||
# creation — their similarity scores will be junk until they run repair.
|
||||
@@ -340,7 +340,7 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
|
||||
# `_hybrid_rank`; do the same here so CLI results match what agents
|
||||
# see via `mempalace_search`.
|
||||
hits = [
|
||||
{"text": doc, "distance": float(dist), "metadata": meta or {}}
|
||||
{"text": doc or "", "distance": float(dist), "metadata": meta or {}}
|
||||
for doc, meta, dist in zip(docs, metas, dists)
|
||||
]
|
||||
hits = _hybrid_rank(hits, query)
|
||||
@@ -382,6 +382,7 @@ def _bm25_only_via_sqlite(
|
||||
n_results: int = 5,
|
||||
max_candidates: int = 500,
|
||||
_include_internal: bool = False,
|
||||
collection_name: str = None,
|
||||
) -> dict:
|
||||
"""BM25-only search reading drawers directly from chroma.sqlite3.
|
||||
|
||||
@@ -405,6 +406,35 @@ def _bm25_only_via_sqlite(
|
||||
"error": "No palace found",
|
||||
"hint": "Run: mempalace init <dir> && mempalace mine <dir>",
|
||||
}
|
||||
if collection_name is None:
|
||||
from .config import get_configured_collection_name
|
||||
|
||||
collection_name = get_configured_collection_name()
|
||||
|
||||
def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
|
||||
clauses = []
|
||||
params = []
|
||||
for key, value in (("wing", wing), ("room", room)):
|
||||
if not value:
|
||||
continue
|
||||
clauses.append(
|
||||
f"""
|
||||
AND EXISTS (
|
||||
SELECT 1
|
||||
FROM embedding_metadata mf
|
||||
WHERE mf.id = {row_id_expr}
|
||||
AND mf.key = ?
|
||||
AND COALESCE(
|
||||
mf.string_value,
|
||||
CAST(mf.int_value AS TEXT),
|
||||
CAST(mf.float_value AS TEXT),
|
||||
CAST(mf.bool_value AS TEXT)
|
||||
) = ?
|
||||
)
|
||||
"""
|
||||
)
|
||||
params.extend([key, value])
|
||||
return "".join(clauses), params
|
||||
|
||||
try:
|
||||
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
|
||||
@@ -416,45 +446,57 @@ def _bm25_only_via_sqlite(
|
||||
# shorter than 3 chars (trigram tokenizer can't match them).
|
||||
tokens = [t for t in _tokenize(query) if len(t) >= 3]
|
||||
candidate_ids: list[int] = []
|
||||
use_recency_fallback = not tokens
|
||||
if tokens:
|
||||
fts_query = " OR ".join(tokens)
|
||||
filter_sql, filter_params = _metadata_filter_sql("embedding_fulltext_search.rowid")
|
||||
try:
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT rowid
|
||||
f"""
|
||||
SELECT embedding_fulltext_search.rowid
|
||||
FROM embedding_fulltext_search
|
||||
JOIN embeddings e ON e.id = embedding_fulltext_search.rowid
|
||||
JOIN segments s ON e.segment_id = s.id
|
||||
JOIN collections c ON s.collection = c.id
|
||||
WHERE embedding_fulltext_search MATCH ?
|
||||
AND c.name = ?
|
||||
{filter_sql}
|
||||
LIMIT ?
|
||||
""",
|
||||
(fts_query, max_candidates),
|
||||
(fts_query, collection_name, *filter_params, max_candidates),
|
||||
).fetchall()
|
||||
candidate_ids = [r[0] for r in rows]
|
||||
except sqlite3.Error:
|
||||
# FTS5 tokenizer mismatch or syntax error — fall through
|
||||
# to the recency-window selector below.
|
||||
logger.debug("FTS5 MATCH failed; using recency fallback", exc_info=True)
|
||||
use_recency_fallback = True
|
||||
|
||||
if not candidate_ids:
|
||||
# No FTS hits (or no usable tokens) — pull the most recent
|
||||
# rows for the drawers segment so we can BM25-rank something
|
||||
# rather than return empty-handed. Wrapped in try/except
|
||||
# because the schema may differ on legacy palaces (older
|
||||
# chromadb without ``created_at``, missing ``segments``
|
||||
# rows after partial restore, etc.); on schema mismatch we
|
||||
# fall back to ordering by primary-key id and finally to an
|
||||
# empty result rather than letting search raise.
|
||||
if not candidate_ids and use_recency_fallback:
|
||||
# No usable FTS tokens, or FTS itself failed — pull the most
|
||||
# recent rows for the drawers segment so we can BM25-rank
|
||||
# something rather than return empty-handed. A clean FTS miss
|
||||
# must stay empty, especially after wing/room filtering, because
|
||||
# recency fallback would return unrelated scoped drawers.
|
||||
# Wrapped in try/except because the schema may differ on legacy
|
||||
# palaces (older chromadb without ``created_at``, missing
|
||||
# ``segments`` rows after partial restore, etc.); on schema
|
||||
# mismatch we fall back to ordering by primary-key id and finally
|
||||
# to an empty result rather than letting search raise.
|
||||
try:
|
||||
filter_sql, filter_params = _metadata_filter_sql("e.id")
|
||||
rows = conn.execute(
|
||||
"""
|
||||
f"""
|
||||
SELECT e.id
|
||||
FROM embeddings e
|
||||
JOIN segments s ON e.segment_id = s.id
|
||||
JOIN collections c ON s.collection = c.id
|
||||
WHERE c.name = 'mempalace_drawers'
|
||||
WHERE c.name = ?
|
||||
{filter_sql}
|
||||
ORDER BY e.created_at DESC
|
||||
LIMIT ?
|
||||
""",
|
||||
(max_candidates,),
|
||||
(collection_name, *filter_params, max_candidates),
|
||||
).fetchall()
|
||||
candidate_ids = [r[0] for r in rows]
|
||||
except sqlite3.Error:
|
||||
@@ -463,17 +505,19 @@ def _bm25_only_via_sqlite(
|
||||
exc_info=True,
|
||||
)
|
||||
try:
|
||||
filter_sql, filter_params = _metadata_filter_sql("e.id")
|
||||
rows = conn.execute(
|
||||
"""
|
||||
f"""
|
||||
SELECT e.id
|
||||
FROM embeddings e
|
||||
JOIN segments s ON e.segment_id = s.id
|
||||
JOIN collections c ON s.collection = c.id
|
||||
WHERE c.name = 'mempalace_drawers'
|
||||
WHERE c.name = ?
|
||||
{filter_sql}
|
||||
ORDER BY e.id DESC
|
||||
LIMIT ?
|
||||
""",
|
||||
(max_candidates,),
|
||||
(collection_name, *filter_params, max_candidates),
|
||||
).fetchall()
|
||||
candidate_ids = [r[0] for r in rows]
|
||||
except sqlite3.Error:
|
||||
@@ -689,6 +733,7 @@ def search_memories(
|
||||
max_distance: float = 0.0,
|
||||
vector_disabled: bool = False,
|
||||
candidate_strategy: str = "vector",
|
||||
collection_name: str = None,
|
||||
) -> dict:
|
||||
"""Programmatic search — returns a dict instead of printing.
|
||||
|
||||
@@ -739,10 +784,11 @@ def search_memories(
|
||||
wing=wing,
|
||||
room=room,
|
||||
n_results=n_results,
|
||||
collection_name=collection_name,
|
||||
)
|
||||
|
||||
try:
|
||||
drawers_col = get_collection(palace_path, create=False)
|
||||
drawers_col = get_collection(palace_path, collection_name=collection_name, create=False)
|
||||
except Exception as e:
|
||||
logger.error("No palace found at %s: %s", palace_path, e)
|
||||
return {
|
||||
@@ -795,7 +841,8 @@ def search_memories(
|
||||
if source and source not in closet_boost_by_source:
|
||||
closet_boost_by_source[source] = (rank, cdist, cdoc[:200])
|
||||
except Exception:
|
||||
pass # no closets yet — hybrid degrades to pure drawer search
|
||||
# No closets yet — hybrid degrades to pure drawer search.
|
||||
logger.debug("Closet collection unavailable; using drawer-only search", exc_info=True)
|
||||
|
||||
# Rank-based boost. The ordinal signal ("which closet matched best") is
|
||||
# more reliable than absolute distance on narrative content, where
|
||||
@@ -809,6 +856,8 @@ def search_memories(
|
||||
_first_or_empty(drawer_results, "metadatas"),
|
||||
_first_or_empty(drawer_results, "distances"),
|
||||
):
|
||||
meta = meta or {}
|
||||
doc = doc or ""
|
||||
# Filter on raw distance before rounding to avoid precision loss.
|
||||
if max_distance > 0.0 and dist > max_distance:
|
||||
continue
|
||||
@@ -825,7 +874,12 @@ def search_memories(
|
||||
matched_via = "drawer+closet"
|
||||
closet_preview = c_preview
|
||||
|
||||
effective_dist = dist - boost
|
||||
# Clamp to the valid cosine-distance range [0, 2]. When a strong
|
||||
# closet boost (up to 0.40) exceeds the raw distance, the subtraction
|
||||
# can go negative — which (a) yields ``similarity > 1.0`` downstream
|
||||
# and (b) makes the sort key land *below* ordinary positive distances,
|
||||
# inverting the ranking so the best hybrid matches sort last.
|
||||
effective_dist = max(0.0, min(2.0, dist - boost))
|
||||
entry = {
|
||||
"text": doc,
|
||||
"wing": meta.get("wing", "unknown"),
|
||||
@@ -870,6 +924,7 @@ def search_memories(
|
||||
include=["documents", "metadatas"],
|
||||
)
|
||||
except Exception:
|
||||
logger.debug("Neighbor fetch failed for %s", full_source, exc_info=True)
|
||||
continue
|
||||
docs = source_drawers.documents
|
||||
metas_ = source_drawers.metadatas
|
||||
|
||||
Reference in New Issue
Block a user