diff --git a/README.md b/README.md index 8157fca..d82bcd2 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,10 @@ > domain — including `mempalace.tech` — is an impostor and may distribute > malware. Details and timeline: [docs/HISTORY.md](docs/HISTORY.md). +> [!IMPORTANT] +> **🚨 Claude Code sessions expire in 30 days without auto-save hooks wired!** **[Read this →](https://github.com/MemPalace/mempalace/discussions/1388)** + +
def _get_collection(create=False):
    """Return the cached ChromaDB collection wrapper, opening it when needed.

    Args:
        create: When true, the collection is (re)opened on every call and
            created if it does not exist yet. When false, the cached wrapper
            is returned untouched and the collection is only opened (never
            created) when nothing is cached.

    Returns:
        The module-level ``_collection_cache``, or ``None`` when two
        consecutive attempts raised.

    Every failure is logged with its traceback. After the first failure the
    client, collection and metadata caches are cleared, so the second attempt
    calls ``_get_client()`` with an empty client cache. Per the notes that
    accompany this retry, that rebuild re-runs ``quarantine_stale_hnsw``
    (#1322) and is what heals a handle the chromadb rust bindings invalidated
    after an out-of-band write; previously tools just received ``None`` with
    no diagnostic.
    """
    global _client_cache, _collection_cache, _metadata_cache, _metadata_cache_time
    for attempt_no in (1, 2):
        try:
            client = _get_client()
            if create or _collection_cache is None:
                # Always hand chromadb the resolved embedding function.
                # ChromaDB 1.x stores only the EF *name* with the collection,
                # so omitting ``embedding_function=`` silently selects its
                # built-in default EF: same name as the one spoofed in
                # ``mempalace.embedding``, but chromadb's default provider
                # list rather than the user's resolved device. That default
                # can SIGSEGV on first ``col.add()`` (#1299: python 3.14 +
                # chromadb 1.5.x on Apple Silicon). Resolving here, only when
                # a collection is actually opened, keeps warm-cache reads
                # free, and reusing the backend helper keeps this call site
                # in step with ``ChromaBackend.get_collection``.
                ef = ChromaBackend._resolve_embedding_function()
                ef_kwargs = {} if ef is None else {"embedding_function": ef}
                try:
                    raw = client.get_collection(_config.collection_name, **ef_kwargs)
                except _ChromaNotFoundError:
                    if not create:
                        # The read path never creates; hand the error to the
                        # outer handler like any other failure.
                        raise
                    # Get-then-create instead of get_or_create_collection:
                    # ChromaDB 1.5.x's Rust binding SIGSEGVs when that call
                    # receives metadata differing from what is stored, so
                    # metadata is only ever sent for a brand-new collection
                    # (mirrors the backend-layer fix from #1262).
                    # hnsw:num_threads=1 turns off the multi-threaded HNSW
                    # insert path, which races in repairConnectionsForUpdate /
                    # addPoint (#974, #965).
                    raw = client.create_collection(
                        _config.collection_name,
                        metadata={
                            "hnsw:space": "cosine",
                            "hnsw:num_threads": 1,
                            **_HNSW_BLOAT_GUARD,
                        },
                        **ef_kwargs,
                    )
                # Re-applied on every open: legacy collections predate the
                # metadata pin and the runtime setting does not persist
                # across processes in chromadb 1.5.x.
                _pin_hnsw_threads(raw)
                _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
                # A freshly opened collection invalidates cached metadata.
                _metadata_cache = None
                _metadata_cache_time = 0
            return _collection_cache
        except Exception:
            logger.exception(
                "_get_collection attempt %d/2 failed (palace=%s, create=%s)",
                attempt_no,
                _config.palace_path,
                create,
            )
            if attempt_no == 1:
                # Drop every cache so the retry starts from a brand-new
                # client rather than the handle that just failed.
                _client_cache = None
                _collection_cache = None
                _metadata_cache = None
                _metadata_cache_time = 0
    return None
+ assert attempts["count"] == 2 + assert col is not None + + def test_get_collection_returns_none_after_two_failures( + self, monkeypatch, config, palace_path, kg + ): + """If both attempts fail, return None (matches the prior contract for + permanent failures — only the transient case is now self-healing).""" + _patch_mcp_server(monkeypatch, config, kg) + _client, _col = _get_collection(palace_path, create=True) + del _client + from mempalace import mcp_server + + mcp_server._client_cache = None + mcp_server._collection_cache = None + + attempts = {"count": 0} + + def always_fails(): + attempts["count"] += 1 + raise RuntimeError("permanent chromadb failure") + + monkeypatch.setattr(mcp_server, "_get_client", always_fails) + + col = mcp_server._get_collection() + + assert attempts["count"] == 2 + assert col is None + class TestKGLazyCache: """Lazy per-path KnowledgeGraph cache (issue #1136).""" diff --git a/tools/backup_claude_jsonls.sh b/tools/backup_claude_jsonls.sh new file mode 100755 index 0000000..f252de0 --- /dev/null +++ b/tools/backup_claude_jsonls.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# backup_claude_jsonls.sh +# +# Claude Code stores every conversation as a JSONL transcript at +# ~/.claude/projects//.jsonl +# Anthropic auto-deletes those files after 30 DAYS: +# https://docs.claude.com/en/docs/claude-code/data-usage +# +# This script copies them, read-only, into ~/Documents/Claude_JSONL_Backup/ +# so the 30-day clock no longer applies. Re-run any time — rsync is incremental. +# It NEVER deletes, modifies, or touches files inside ~/.claude/. 
#!/usr/bin/env bash
# backup_claude_jsonls.sh
#
# Claude Code stores every conversation as a JSONL transcript at
#   ~/.claude/projects/<project>/<session-id>.jsonl
# Anthropic auto-deletes those files after 30 DAYS:
#   https://docs.claude.com/en/docs/claude-code/data-usage
#
# This script copies them, read-only, into ~/Documents/Claude_JSONL_Backup/
# so the 30-day clock no longer applies. Re-run any time — rsync is incremental.
# It NEVER deletes, modifies, or touches files inside ~/.claude/.
#
# Exit status:
#   0  every transcript currently in the source is present in the backup
#   1  the source directory does not exist
#   2  at least one source transcript is missing from the backup

set -eu

SRC="${HOME}/.claude/projects/"
DST="${HOME}/Documents/Claude_JSONL_Backup/"

[ -d "$SRC" ] || { echo "ERROR: $SRC does not exist."; exit 1; }
mkdir -p "$DST"

echo "Backing up $SRC -> $DST"
# -a (archive) already preserves modification times. There is deliberately no
# --delete: transcripts that have vanished from SRC must stay in the backup.
rsync -a "$SRC" "$DST"

src_count=$(find "$SRC" -type f -name '*.jsonl' | wc -l | tr -d ' ')
dst_count=$(find "$DST" -type f -name '*.jsonl' | wc -l | tr -d ' ')

# Print one YYYY-MM-DD modification date per backed-up transcript.
# The find flavour is probed up front instead of chaining the BSD and GNU
# commands with `||`: find's exit status does not reflect a failing
# `-exec ... \;`, so the GNU fallback would never run, and GNU `stat -f`
# means "file system info" and writes unrelated text to stdout.
list_backup_dates() {
    if find "$DST" -maxdepth 0 -printf '' >/dev/null 2>&1; then
        # GNU find
        find "$DST" -type f -name '*.jsonl' -printf '%TY-%Tm-%Td\n'
    else
        # BSD/macOS find has no -printf; use BSD stat
        find "$DST" -type f -name '*.jsonl' -exec stat -f '%Sm' -t '%Y-%m-%d' {} +
    fi
}
dates=$(list_backup_dates 2>/dev/null | sort || true)
oldest_date=$(printf '%s\n' "$dates" | head -n 1)
newest_date=$(printf '%s\n' "$dates" | tail -n 1)

# Verify per file rather than by comparing counts: once old transcripts are
# purged from SRC the backup legitimately holds MORE files than the source,
# and a count-equality check would flag every such (healthy) backup as FAIL.
missing=0
while IFS= read -r -d '' f; do
    rel="${f#"$SRC"}"
    if [ ! -f "${DST}${rel}" ]; then
        echo "MISSING in backup: $rel"
        missing=$((missing + 1))
    fi
done < <(find "$SRC" -type f -name '*.jsonl' -print0)

echo "Source JSONL count : $src_count"
echo "Backup JSONL count : $dst_count"
echo "Oldest backup file : ${oldest_date:-n/a}"
echo "Newest backup file : ${newest_date:-n/a}"

if [ "$missing" -ne 0 ]; then
    echo "FAIL: $missing source transcript(s) missing from backup"; exit 2
fi
echo "OK: backup verified."
#!/usr/bin/env bash
# find_orphan_claude_jsonls.sh — v3 (multi-line shape + verb-aware preview)
# -----------------------------------------------------------------------------
# Finds Claude Code conversation transcripts (.jsonl) that may have survived in
# backup/sync locations. Claude Code stores transcripts at
# ~/.claude/projects/<project>/<session-id>.jsonl and auto-deletes them locally
# after 30 days. If your machine syncs to iCloud, Dropbox, Google Drive,
# OneDrive, Time Machine, or you copied transcripts elsewhere manually, those
# copies still exist. This script finds them and shows a topic preview from
# the first substantive user message — strips leading filler interjections
# ("ok so", "oh", "well", "hey") so previews surface the actual content.
#
# Output: one block per transcript (mtime, size in bytes, path, preview),
# sorted by modification date. Progress dots go to stderr.
#
# Read-only. Safe to re-run.
# -----------------------------------------------------------------------------
set -eu

# Without python3 every file would fail the shape check and the script would
# wrongly report that nothing was found, so fail loudly instead.
command -v python3 >/dev/null 2>&1 || { echo "ERROR: python3 is required." >&2; exit 1; }

LOCATIONS=(
  "$HOME/Library/Mobile Documents" "$HOME/Dropbox" "$HOME/Google Drive"
  "$HOME/OneDrive" "$HOME/Documents" "$HOME/Desktop" "/Volumes"
)

TMP="$(mktemp)"   # TSV rows: mtime, size, path, preview
PY="$(mktemp)"    # the Python helper, written once and reused for every file
trap 'rm -f "$TMP" "$TMP.s" "$PY"' EXIT

# Pick the stat flavour once. Chaining `stat -f ... || stat -c ...` is not
# safe: under GNU coreutils `-f` means "file system info", so the first
# command prints unrelated text to stdout before failing and that text ends
# up in front of the real value.
if stat -c '%s' "$TMP" >/dev/null 2>&1; then
  file_mtime() { stat -c '%y' "$1" 2>/dev/null | cut -d' ' -f1; }   # GNU
  file_size()  { stat -c '%s' "$1" 2>/dev/null; }
else
  file_mtime() { stat -f '%Sm' -t '%Y-%m-%d' "$1" 2>/dev/null; }    # BSD/macOS
  file_size()  { stat -f '%z' "$1" 2>/dev/null; }
fi

# Helper contract: exit 0 and print a one-line preview when the file looks
# like a Claude Code transcript, exit 1 otherwise.
cat >"$PY" <<'PYEOF'
import json, sys, re

# Single-word/short greetings — message gets skipped entirely if it is just one of these
GREETINGS = {'hi','hey','hello','thanks','thank you','ok','okay','yes','no',
             'sure','cool','great','good','done','yep','nope','perfect','copy'}

# Leading filler — interjections that get STRIPPED from the start of a message
# before the preview is taken. Iterative — handles "ok so well, then..." → "then..."
LEADING_FILLER = re.compile(
    r'^(?:ok(?:ay)?|so|oh|well|anyway|btw|hmm+|um+|uh+|hey|hi|hello|right|'
    r'yes|no|sure|cool|great|good|listen|look|wait|actually|alright|gotcha|'
    r'yeah|yep|nope|nah)\b[\s,!.?:;-]*',
    re.IGNORECASE
)

path = sys.argv[1]
shape_ok = False
preview = ""
try:
    # Transcripts are JSON, hence UTF-8; do not depend on the locale encoding.
    with open(path, 'r', encoding='utf-8', errors='replace') as fh:
        for i, line in enumerate(fh):
            if i >= 30: break
            try:
                d = json.loads(line)
            except Exception:
                continue
            if not isinstance(d, dict): continue
            # Shape check — accept if any line in first 30 has session fields
            if not shape_ok and 'sessionId' in d and 'timestamp' in d and 'message' in d:
                shape_ok = True
            # Preview — first user message after stripping leading filler
            if not preview:
                # A null or non-object "message" must not abort the whole scan
                msg = d.get('message')
                if not isinstance(msg, dict):
                    msg = {}
                role = d.get('type', '') or msg.get('role', '')
                if role == 'user':
                    content = msg.get('content', '')
                    if isinstance(content, list):
                        text = ' '.join(
                            c['text'] for c in content
                            if isinstance(c, dict) and c.get('type') == 'text'
                            and isinstance(c.get('text'), str)
                        )
                    elif isinstance(content, str):
                        text = content
                    else:
                        text = ''
                    text = re.sub(r'\s+', ' ', text).strip()
                    # Skip messages that are pure greetings
                    if text.lower() in GREETINGS:
                        continue
                    # Iteratively strip leading filler tokens until stable
                    prev_text = None
                    while prev_text != text:
                        prev_text = text
                        text = LEADING_FILLER.sub('', text).strip()
                    # Skip if what remains is too short
                    if len(text) < 20:
                        continue
                    preview = text[:80] + ('...' if len(text) > 80 else '')
            if shape_ok and preview: break
except Exception:
    pass
if shape_ok:
    print(preview if preview else "(no preview — first 30 lines were greetings or short)")
    sys.exit(0)
sys.exit(1)
PYEOF

printf "Scanning backup locations" >&2
for loc in "${LOCATIONS[@]}"; do
  [ -d "$loc" ] || continue
  printf "." >&2
  while IFS= read -r -d '' f; do
    if preview="$(python3 "$PY" "$f" 2>/dev/null)"; then
      # `|| true`: a file that disappears mid-scan must not abort under set -e
      mtime="$(file_mtime "$f" || true)"
      size="$(file_size "$f" || true)"
      printf '%s\t%s\t%s\t%s\n' "${mtime:-unknown}" "${size:-?}" "$f" "$preview" >>"$TMP"
    fi
  done < <(find "$loc" -type f -name '*.jsonl' -print0 2>/dev/null)
done
printf "\n" >&2

count=$(wc -l <"$TMP" | tr -d ' ')
if [ "$count" -eq 0 ]; then
  echo "No orphan Claude Code transcripts found in scanned backup locations."
  exit 0
fi
sort -k1,1 "$TMP" >"$TMP.s"
oldest="$(head -n 1 "$TMP.s" | cut -f1)"
newest="$(tail -n 1 "$TMP.s" | cut -f1)"
echo "Found $count orphan Claude Code transcript(s). Oldest: $oldest  Newest: $newest"
echo "----------------------------------------------------------------------"
awk -F'\t' '{ printf "%s  %10s  %s\n    \"%s\"\n\n", $1, $2, $3, $4 }' "$TMP.s"
#!/usr/bin/env python3
"""render_jsonl.py — turn one Claude Code JSONL transcript into readable text.

Claude Code stores conversations at
~/.claude/projects/<project>/<session-id>.jsonl and Anthropic auto-deletes
them after 30 days
(https://docs.claude.com/en/docs/claude-code/data-usage). This script renders a
JSONL into a clean .txt so you can keep / read / share it without the tooling.

Usage:
    python3 render_jsonl.py <input.jsonl> [output.txt]

Stdlib only. Python 3.9+. Read-only on the input.
"""

import json
import sys
from pathlib import Path


def extract_text(content):
    """Return the plain text carried by a message ``content`` field.

    A string is returned stripped. For a list, the stripped ``text`` of every
    dict block whose ``type`` is ``"text"`` is collected, empty ones are
    dropped, and the rest are joined with newlines. Any other value, and any
    block whose ``text`` is not a string, contributes nothing.
    """
    if isinstance(content, str):
        return content.strip()
    if isinstance(content, list):
        parts = []
        for blk in content:
            if isinstance(blk, dict) and blk.get("type") == "text":
                t = blk.get("text")
                t = t.strip() if isinstance(t, str) else ""
                if t:
                    parts.append(t)
        return "\n".join(parts)
    return ""


def _parse_turn(raw):
    """Parse one JSONL line into ``(timestamp, role, text)``, else ``None``.

    ``None`` covers blank lines, invalid JSON, JSON that is not an object,
    roles other than user/assistant, and records without any text.
    """
    if not raw.strip():
        return None
    try:
        obj = json.loads(raw)
    except json.JSONDecodeError:
        return None
    # Valid JSON is not necessarily an object; lists/strings have no .get().
    if not isinstance(obj, dict):
        return None
    message = obj.get("message")
    if not isinstance(message, dict):
        message = {}
    role = obj.get("type") or message.get("role")
    if role not in ("user", "assistant"):
        return None
    # Fall back to the record itself when there is no usable "message".
    text = extract_text((message or obj).get("content"))
    if not text:
        return None
    ts = obj.get("timestamp")
    # Normalise to str so min()/max() over the timestamps cannot mix types.
    return (str(ts) if ts else "", role, text)


def main():
    """Render the transcript named in ``sys.argv[1]``.

    Output goes to ``sys.argv[2]`` when given, otherwise to stdout. Exits
    with status 1 when no input is given or the input is not a file.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    src = Path(sys.argv[1])
    if not src.is_file():
        print(f"ERROR: not a file: {src}")
        sys.exit(1)

    # Iterate the file object rather than read_text().splitlines():
    # splitlines() also breaks on U+2028/U+2029/VT/FF, all of which may occur
    # raw inside a JSON string, so such records were cut in two and dropped.
    with src.open(encoding="utf-8", errors="replace") as fh:
        turns = [turn for turn in map(_parse_turn, fh) if turn is not None]
    stamps = [ts for ts, _, _ in turns if ts]

    header = [
        f"# Claude Code transcript: {src}",
        f"# Total turns: {len(turns)}",
        f"# Date range : {min(stamps) if stamps else 'n/a'} -> {max(stamps) if stamps else 'n/a'}",
        "#" + "-" * 70,
        "",
    ]
    rendered = "\n".join(header) + "".join(
        f"\n[{ts}] {role.upper()}\n{text}\n\n{'-'*72}\n" for ts, role, text in turns
    )

    # The output is opened only after parsing succeeded, and closed by the
    # context manager even if the write fails.
    if len(sys.argv) > 2:
        with open(sys.argv[2], "w", encoding="utf-8") as out:
            out.write(rendered)
        print(f"Wrote {len(turns)} turns to {sys.argv[2]}")
    else:
        sys.stdout.write(rendered)


if __name__ == "__main__":
    main()
Safe to run repeatedly — drawer IDs are content-hashed +so re-running on the same session overwrites in place, no duplicates. + +Behavior: + +1. Find the current session's JSONL transcript path (Claude Code passes + it via the conversation context — look for `~/.claude/projects/` paths). +2. Run via bash: + + ``` + mempalace mine "<path-to-session.jsonl>" --mode convos --wing claude_imports + ``` + +3. If the user supplied an argument after `/save`, use it as the wing name + instead of `claude_imports` (e.g. `/save my_research` → + `--wing my_research`). +4. Report back: how many drawers were filed, into which wing/room. + +Requires `mempalace` to be installed (`pip install mempalace`).