Merge remote-tracking branch 'origin/develop' into fix/1308-rebuild-from-sqlite

This commit is contained in:
Igor Lins e Silva
2026-05-07 07:30:56 -03:00
7 changed files with 416 additions and 61 deletions
+4
View File
@@ -6,6 +6,10 @@
> domain — including `mempalace.tech` — is an impostor and may distribute
> malware. Details and timeline: [docs/HISTORY.md](docs/HISTORY.md).
> [!IMPORTANT]
> **🚨 Claude Code session transcripts are auto-deleted after 30 days unless auto-save hooks are wired up!** **[Read this →](https://github.com/MemPalace/mempalace/discussions/1388)**
<div align="center">
<img src="assets/mempalace_logo.png" alt="MemPalace" width="240">
+87 -61
View File
@@ -326,68 +326,94 @@ def _get_client():
def _get_collection(create=False):
    """Return the ChromaDB collection, caching the client between calls.

    On failure, log the exception and retry once after clearing the client
    and collection caches. Tools were silently returning ``None`` when a
    cached client/collection went stale — typically after the chromadb
    rust bindings invalidated a handle following an out-of-band write —
    leaving the LLM with no diagnostic and no recovery path. The retry
    forces ``_get_client()`` to rebuild from scratch (which re-runs
    ``quarantine_stale_hnsw`` per #1322), so the second attempt heals the
    common stale-handle / stale-HNSW case automatically.

    Args:
        create: When true, open the collection and create it if it does not
            exist yet, refreshing the cached wrapper. When false, reuse the
            cached wrapper if there is one and otherwise open the existing
            collection.

    Returns:
        The cached ``ChromaCollection`` wrapper, or ``None`` when both
        attempts raised.
    """
    global _client_cache, _collection_cache, _metadata_cache, _metadata_cache_time
    for attempt in range(2):
        try:
            client = _get_client()
            # ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
            # collection but not the EF *instance/configuration*. So a reader or
            # writer that omits ``embedding_function=`` silently gets chromadb's
            # built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
            # one we spoof in ``mempalace.embedding`` (both report ``"default"``,
            # the identity check passes), but the *provider list* is chromadb's
            # default rather than the user's resolved device. On bleeding-edge
            # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
            # that default provider selection can SIGSEGV the host process on
            # first ``col.add()``. The miner / Stop hook ingest path avoids this
            # because it routes through ``ChromaBackend.get_collection``, which
            # resolves the EF via ``ChromaBackend._resolve_embedding_function``;
            # the MCP server bypassed that abstraction. Resolve the EF inside the
            # branches that actually open a collection so warm-cache reads stay
            # zero-cost. Reuse the backend helper so the two call sites can't
            # drift on logging or fallback semantics.
            if create:
                ef = ChromaBackend._resolve_embedding_function()
                ef_kwargs = {"embedding_function": ef} if ef is not None else {}
                # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
                # HNSW insert path, which has a race in repairConnectionsForUpdate /
                # addPoint (see issues #974, #965). Set via metadata on fresh
                # collections and re-applied via _pin_hnsw_threads() for legacy
                # palaces whose collections were created before this fix (the
                # runtime config does not persist cross-process in chromadb 1.5.x,
                # so the retrofit runs every time _get_collection opens a cache).
                #
                # ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
                # is called with metadata that differs from what's stored. The split
                # below skips the metadata-comparison codepath for existing
                # collections, mirroring the backend-layer fix from #1262.
                try:
                    raw = client.get_collection(_config.collection_name, **ef_kwargs)
                except _ChromaNotFoundError:
                    raw = client.create_collection(
                        _config.collection_name,
                        metadata={
                            "hnsw:space": "cosine",
                            "hnsw:num_threads": 1,
                            **_HNSW_BLOAT_GUARD,
                        },
                        **ef_kwargs,
                    )
                _pin_hnsw_threads(raw)
                _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
                _metadata_cache = None
                _metadata_cache_time = 0
            elif _collection_cache is None:
                ef = ChromaBackend._resolve_embedding_function()
                ef_kwargs = {"embedding_function": ef} if ef is not None else {}
                raw = client.get_collection(_config.collection_name, **ef_kwargs)
                _pin_hnsw_threads(raw)
                _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
                _metadata_cache = None
                _metadata_cache_time = 0
            return _collection_cache
        except Exception:
            logger.exception(
                "_get_collection attempt %d/2 failed (palace=%s, create=%s)",
                attempt + 1,
                _config.palace_path,
                create,
            )
            if attempt == 0:
                # Reset all caches so the next attempt forces _get_client()
                # to rebuild the chromadb client from scratch — that path
                # re-runs quarantine_stale_hnsw (#1322) and reopens the
                # collection cleanly, healing the common stale-handle case.
                _client_cache = None
                _collection_cache = None
                _metadata_cache = None
                _metadata_cache_time = 0
    return None
def _no_palace():
+65
View File
@@ -1259,6 +1259,71 @@ class TestCacheInvalidation:
assert "embedding_function" in kwargs
assert kwargs["embedding_function"] is not None
def test_get_collection_retries_once_on_exception(self, monkeypatch, config, palace_path, kg):
    """A single failure inside ``_get_collection`` is retried, not swallowed.

    ``_get_client`` is replaced with a stub that raises on its first call
    and delegates to the real implementation afterwards. The call under
    test must therefore invoke the stub twice and hand back a collection
    rather than ``None``.
    """
    _patch_mcp_server(monkeypatch, config, kg)
    _client, _col = _get_collection(palace_path, create=True)
    del _client

    from mempalace import mcp_server

    # Start from a cold cache so the call below takes the open path.
    mcp_server._collection_cache = None
    mcp_server._client_cache = None

    original_get_client = mcp_server._get_client
    calls = []

    def flaky_get_client():
        calls.append(None)
        if len(calls) == 1:
            raise RuntimeError("simulated transient chromadb failure")
        return original_get_client()

    monkeypatch.setattr(mcp_server, "_get_client", flaky_get_client)

    result = mcp_server._get_collection()

    # The first attempt raised, the second went through.
    assert len(calls) == 2
    assert result is not None
def test_get_collection_returns_none_after_two_failures(
    self, monkeypatch, config, palace_path, kg
):
    """Two consecutive failures still yield ``None``.

    ``_get_client`` is replaced with a stub that always raises; the call
    under test must try exactly twice and then return ``None``.
    """
    _patch_mcp_server(monkeypatch, config, kg)
    _client, _col = _get_collection(palace_path, create=True)
    del _client

    from mempalace import mcp_server

    mcp_server._collection_cache = None
    mcp_server._client_cache = None

    calls = []

    def always_fails():
        calls.append(None)
        raise RuntimeError("permanent chromadb failure")

    monkeypatch.setattr(mcp_server, "_get_client", always_fails)

    result = mcp_server._get_collection()

    assert len(calls) == 2
    assert result is None
class TestKGLazyCache:
"""Lazy per-path KnowledgeGraph cache (issue #1136)."""
+39
View File
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
# backup_claude_jsonls.sh
#
# Claude Code stores every conversation as a JSONL transcript at
#   ~/.claude/projects/<encoded-project>/<session-uuid>.jsonl
# Anthropic auto-deletes those files after 30 DAYS:
#   https://docs.claude.com/en/docs/claude-code/data-usage
#
# This script copies them, read-only, into ~/Documents/Claude_JSONL_Backup/
# so the 30-day clock no longer applies. Re-run any time — rsync is incremental.
# It NEVER deletes, modifies, or touches files inside ~/.claude/.
#
# Exit status: 0 = backup verified, 1 = source directory missing,
#              2 = backup holds fewer transcripts than the source.

set -eu

SRC="${HOME}/.claude/projects/"
DST="${HOME}/Documents/Claude_JSONL_Backup/"

[ -d "$SRC" ] || { echo "ERROR: $SRC does not exist." >&2; exit 1; }
mkdir -p "$DST"

echo "Backing up $SRC -> $DST"
# -a (archive) already preserves modification times. No --delete is passed,
# so transcripts that have aged out of $SRC stay in the backup.
rsync -a "$SRC" "$DST"

# count_jsonl DIR — print the number of *.jsonl files under DIR.
count_jsonl() {
  find "$1" -type f -name '*.jsonl' | wc -l | tr -d ' '
}

# jsonl_dates DIR — print one YYYY-MM-DD modification date per *.jsonl file.
# Probe for GNU find's -printf first: BSD find rejects it cleanly, whereas
# trying BSD `stat -f` first is unsafe on GNU systems (GNU stat reads -f as
# --file-system and prints unrelated data while find still exits 0).
jsonl_dates() {
  if find "$1" -maxdepth 0 -printf '' >/dev/null 2>&1; then
    find "$1" -type f -name '*.jsonl' -printf '%TY-%Tm-%Td\n' 2>/dev/null
  else
    find "$1" -type f -name '*.jsonl' -exec stat -f '%Sm' -t '%Y-%m-%d' {} + 2>/dev/null
  fi
}

src_count=$(count_jsonl "$SRC")
dst_count=$(count_jsonl "$DST")

dates="$(jsonl_dates "$DST" | sort || true)"
oldest_date="$(printf '%s\n' "$dates" | head -n 1)"
newest_date="$(printf '%s\n' "$dates" | tail -n 1)"

echo "Source JSONL count : $src_count"
echo "Backup JSONL count : $dst_count"
echo "Oldest backup file : ${oldest_date:-n/a}"
echo "Newest backup file : ${newest_date:-n/a}"

# The backup legitimately grows past the source once old transcripts are
# auto-deleted upstream, so only a backup that is SMALLER is a failure.
if [ "$dst_count" -lt "$src_count" ]; then
  echo "FAIL: count mismatch ($src_count vs $dst_count)" >&2; exit 2
fi
echo "OK: backup verified."
+115
View File
@@ -0,0 +1,115 @@
#!/usr/bin/env bash
# find_orphan_claude_jsonls.sh — v3 (multi-line shape + verb-aware preview)
# -----------------------------------------------------------------------------
# Finds Claude Code conversation transcripts (.jsonl) that may have survived in
# backup/sync locations. Claude Code stores transcripts at
# ~/.claude/projects/<encoded>/<session>.jsonl and auto-deletes them locally
# after 30 days. If your machine syncs to iCloud, Dropbox, Google Drive,
# OneDrive, Time Machine, or you copied transcripts elsewhere manually, those
# copies still exist. This script finds them and shows a topic preview from
# the first substantive user message — strips leading filler interjections
# ("ok so", "oh", "well", "hey") so previews surface the actual content.
#
# Read-only. Safe to re-run.
# -----------------------------------------------------------------------------

set -eu

LOCATIONS=(
  "$HOME/Library/Mobile Documents" "$HOME/Dropbox" "$HOME/Google Drive"
  "$HOME/OneDrive" "$HOME/Documents" "$HOME/Desktop" "/Volumes"
)

TMP="$(mktemp)"; trap 'rm -f "$TMP" "$TMP.s"' EXIT

# file_mtime FILE — print FILE's modification date as YYYY-MM-DD.
# GNU `stat -c` is tried first: BSD stat rejects -c without printing anything,
# so the fallback is clean. The reverse order is unsafe because GNU stat reads
# -f as --file-system and prints unrelated data before failing.
file_mtime() {
  local stamp
  if stamp="$(stat -c '%y' "$1" 2>/dev/null)"; then
    printf '%s\n' "${stamp%% *}"
  else
    stat -f '%Sm' -t '%Y-%m-%d' "$1" 2>/dev/null
  fi
}

# file_size FILE — print FILE's size in bytes (GNU form first, then BSD).
file_size() {
  stat -c '%s' "$1" 2>/dev/null || stat -f '%z' "$1" 2>/dev/null
}

printf "Scanning backup locations" >&2
for loc in "${LOCATIONS[@]}"; do
  [ -d "$loc" ] || continue
  printf "." >&2
  while IFS= read -r -d '' f; do
    # Combined: shape detection (multi-line) + verb-aware topic preview.
    # The helper exits 0 and prints a preview when the file looks like a
    # Claude Code transcript, and exits 1 otherwise.
    if preview="$(python3 - "$f" 2>/dev/null <<'PYEOF'
import json, sys, re

# Single-word/short greetings — message gets skipped entirely if it is just one of these
GREETINGS = {'hi','hey','hello','thanks','thank you','ok','okay','yes','no',
             'sure','cool','great','good','done','yep','nope','perfect','copy'}

# Leading filler — interjections that get STRIPPED from the start of a message
# before the preview is taken. Iterative — handles "ok so well, then..." → "then..."
LEADING_FILLER = re.compile(
    r'^(?:ok(?:ay)?|so|oh|well|anyway|btw|hmm+|um+|uh+|hey|hi|hello|right|'
    r'yes|no|sure|cool|great|good|listen|look|wait|actually|alright|gotcha|'
    r'yeah|yep|nope|nah)\b[\s,!.?:;-]*',
    re.IGNORECASE
)

path = sys.argv[1]
shape_ok = False
preview = ""
try:
    # Transcripts are UTF-8; do not depend on the locale default encoding.
    with open(path, 'r', encoding='utf-8', errors='replace') as fh:
        for i, line in enumerate(fh):
            if i >= 30: break
            try:
                d = json.loads(line)
            except Exception:
                continue
            if not isinstance(d, dict): continue
            # Shape check — accept if any line in first 30 has session fields
            if not shape_ok and 'sessionId' in d and 'timestamp' in d and 'message' in d:
                shape_ok = True
            # Preview — first user message after stripping leading filler
            if not preview:
                # A non-dict message is treated as absent so one odd record
                # cannot abort scanning of the remaining lines.
                message = d.get('message')
                if not isinstance(message, dict):
                    message = {}
                role = d.get('type', '') or message.get('role', '')
                if role == 'user':
                    content = message.get('content', '')
                    if isinstance(content, list):
                        text = ' '.join(
                            (c.get('text') or '') for c in content
                            if isinstance(c, dict) and c.get('type') == 'text'
                        )
                    elif isinstance(content, str):
                        text = content
                    else:
                        text = ''
                    text = re.sub(r'\s+', ' ', text).strip()
                    # Skip messages that are pure greetings
                    if text.lower() in GREETINGS:
                        continue
                    # Iteratively strip leading filler tokens until stable
                    prev_text = None
                    while prev_text != text:
                        prev_text = text
                        text = LEADING_FILLER.sub('', text).strip()
                    # Skip if what remains is too short
                    if len(text) < 20:
                        continue
                    preview = text[:80] + ('...' if len(text) > 80 else '')
            if shape_ok and preview: break
except Exception:
    pass

if shape_ok:
    print(preview if preview else "(no preview — first 30 lines were greetings or short)")
    sys.exit(0)
sys.exit(1)
PYEOF
    )"; then
      # `|| true` keeps `set -e` from aborting the scan if a file vanishes
      # between discovery and stat.
      mtime="$(file_mtime "$f" || true)"
      size="$(file_size "$f" || true)"
      printf '%s\t%s\t%s\t%s\n' "${mtime:-unknown}" "${size:-?}" "$f" "$preview" >>"$TMP"
    fi
  done < <(find "$loc" -type f -name '*.jsonl' -print0 2>/dev/null)
done
printf "\n" >&2

count=$(wc -l <"$TMP" | tr -d ' ')
if [ "$count" -eq 0 ]; then
  echo "No orphan Claude Code transcripts found in scanned backup locations."
  exit 0
fi

sort -k1,1 "$TMP" >"$TMP.s"
oldest="$(head -n 1 "$TMP.s" | cut -f1)"
newest="$(tail -n 1 "$TMP.s" | cut -f1)"
echo "Found $count orphan Claude Code transcript(s). Oldest: $oldest Newest: $newest"
echo "----------------------------------------------------------------------"
awk -F'\t' '{ printf "%s %10s %s\n \"%s\"\n\n", $1, $2, $3, $4 }' "$TMP.s"
+80
View File
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""render_jsonl.py — turn one Claude Code JSONL transcript into readable text.
Claude Code stores conversations at ~/.claude/projects/<proj>/<uuid>.jsonl and
Anthropic auto-deletes them after 30 days
(https://docs.claude.com/en/docs/claude-code/data-usage). This script renders a
JSONL into a clean .txt so you can keep / read / share it without the tooling.
Usage:
python3 render_jsonl.py <input.jsonl> [output.txt]
Stdlib only. Python 3.9+. Read-only on the input.
"""
import json
import sys
from pathlib import Path
def extract_text(content):
    """Return the plain text carried by a message ``content`` value.

    A string is returned stripped. A list is treated as content blocks:
    the stripped ``text`` of every dict block whose ``type`` is ``"text"``
    is kept when non-empty, and the kept pieces are joined with newlines.
    Any other value yields an empty string.
    """
    if isinstance(content, str):
        return content.strip()
    if not isinstance(content, list):
        return ""
    stripped = (
        (block.get("text") or "").strip()
        for block in content
        if isinstance(block, dict) and block.get("type") == "text"
    )
    return "\n".join(piece for piece in stripped if piece)
def main():
    """Render the transcript named on the command line as plain text.

    Reads ``sys.argv[1]`` as a JSONL transcript and writes the rendering to
    ``sys.argv[2]`` when given, otherwise to stdout. Only records whose role
    is ``user`` or ``assistant`` and that carry non-empty text are kept.
    Lines that are blank, are not valid JSON, or are not JSON objects are
    skipped.

    Exits with status 1 when no input path is given, when the input is not
    a regular file, or when the output path resolves to the input file.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)

    src = Path(sys.argv[1])
    if not src.is_file():
        print(f"ERROR: not a file: {src}", file=sys.stderr)
        sys.exit(1)

    dest = Path(sys.argv[2]) if len(sys.argv) > 2 else None
    # The input is documented as read-only: refuse to render onto it.
    if dest is not None and dest.resolve() == src.resolve():
        print(f"ERROR: output would overwrite the input: {src}", file=sys.stderr)
        sys.exit(1)

    turns, stamps = [], []
    for raw in src.read_text(encoding="utf-8", errors="replace").splitlines():
        if not raw.strip():
            continue
        try:
            obj = json.loads(raw)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (list, string, number) has no
        # fields to read; skip it instead of failing on ``.get``.
        if not isinstance(obj, dict):
            continue
        message = obj.get("message")
        if not isinstance(message, dict):
            message = None
        role = obj.get("type") or (message or {}).get("role")
        if role not in ("user", "assistant"):
            continue
        msg = message or obj
        text = extract_text(msg.get("content"))
        if not text:
            continue
        # Coerce to str so min()/max() below never compare mixed types.
        ts = str(obj.get("timestamp") or "")
        if ts:
            stamps.append(ts)
        turns.append((ts, role, text))

    header = [
        f"# Claude Code transcript: {src}",
        f"# Total turns: {len(turns)}",
        f"# Date range : {min(stamps) if stamps else 'n/a'} -> {max(stamps) if stamps else 'n/a'}",
        "#" + "-" * 70,
        "",
    ]
    chunks = ["\n".join(header)]
    chunks.extend(
        f"\n[{ts}] {role.upper()}\n{text}\n\n{'-' * 72}\n" for ts, role, text in turns
    )
    rendered = "".join(chunks)

    # Write only after parsing succeeded, so a failed run never leaves a
    # truncated output file behind and no handle is left open.
    if dest is None:
        sys.stdout.write(rendered)
    else:
        dest.write_text(rendered, encoding="utf-8")
        print(f"Wrote {len(turns)} turns to {sys.argv[2]}")


if __name__ == "__main__":
    main()
+26
View File
@@ -0,0 +1,26 @@
---
description: Save the current Claude Code session into MemPalace. Idempotent — won't dupe.
---
# /save
Save the current Claude Code session into MemPalace. Run this when you
want a checkpoint. Safe to run repeatedly — drawer IDs are content-hashed
so re-running on the same session overwrites in place, no duplicates.
Behavior:
1. Find the current session's JSONL transcript path (Claude Code passes
it via the conversation context — look for `~/.claude/projects/` paths).
2. Run via bash (if the user supplied a wing name, substitute it as described in step 3):
```
mempalace mine "<TRANSCRIPT_PATH>" --mode convos --wing claude_imports
```
3. If the user supplied an argument after `/save`, use it as the wing name
instead of `claude_imports` (e.g. `/save my_research` →
`--wing my_research`).
4. Report back: how many drawers were filed, into which wing/room.
Requires `mempalace` to be installed (`pip install mempalace`).