mempalace/tools/find_orphan_claude_jsonls.sh

#!/usr/bin/env bash
# find_orphan_claude_jsonls.sh — v3 (multi-line shape + verb-aware preview)
# -----------------------------------------------------------------------------
# Finds Claude Code conversation transcripts (.jsonl) that may have survived in
# backup/sync locations. Claude Code stores transcripts at
# ~/.claude/projects/<encoded>/<session>.jsonl and auto-deletes them locally
# after 30 days. If your machine syncs to iCloud, Dropbox, Google Drive,
# OneDrive, Time Machine, or you copied transcripts elsewhere manually, those
# copies still exist. This script finds them and shows a topic preview from
# the first substantive user message — strips leading filler interjections
# ("ok so", "oh", "well", "hey") so previews surface the actual content.
#
# Read-only. Safe to re-run.
# -----------------------------------------------------------------------------
set -eu

LOCATIONS=(
  "$HOME/Library/Mobile Documents" "$HOME/Dropbox" "$HOME/Google Drive"
  "$HOME/OneDrive" "$HOME/Documents" "$HOME/Desktop" "/Volumes"
)

TMP="$(mktemp)"; trap 'rm -f "$TMP" "$TMP.s"' EXIT

printf "Scanning backup locations" >&2
for loc in "${LOCATIONS[@]}"; do
  [ -d "$loc" ] || continue
  printf "." >&2
  while IFS= read -r -d '' f; do
    # Combined: shape detection (multi-line) + verb-aware topic preview
    if preview="$(python3 - "$f" 2>/dev/null <<'PYEOF'
import json, sys, re

# Single-word/short greetings — message gets skipped entirely if it is just one of these
GREETINGS = {'hi','hey','hello','thanks','thank you','ok','okay','yes','no',
             'sure','cool','great','good','done','yep','nope','perfect','copy'}

# Leading filler — interjections that get STRIPPED from the start of a message
# before the preview is taken. Iterative — handles "ok so well, then..." → "then..."
LEADING_FILLER = re.compile(
    r'^(?:ok(?:ay)?|so|oh|well|anyway|btw|hmm+|um+|uh+|hey|hi|hello|right|'
    r'yes|no|sure|cool|great|good|listen|look|wait|actually|alright|gotcha|'
    r'yeah|yep|nope|nah)\b[\s,!.?:;-]*',
    re.IGNORECASE
)

path = sys.argv[1]
shape_ok = False
preview = ""
try:
    with open(path, 'r', errors='replace') as fh:
        for i, line in enumerate(fh):
            if i >= 30: break
            try:
                d = json.loads(line)
            except Exception:
                continue
            if not isinstance(d, dict): continue
            # Shape check — accept if any line in first 30 has session fields
            if not shape_ok and 'sessionId' in d and 'timestamp' in d and 'message' in d:
                shape_ok = True
            # Preview — first user message after stripping leading filler
            if not preview:
                role = d.get('type', '') or d.get('message', {}).get('role', '')
                if role == 'user':
                    content = d.get('message', {}).get('content', '')
                    if isinstance(content, list):
                        text = ' '.join(
                            c.get('text', '') for c in content
                            if isinstance(c, dict) and c.get('type') == 'text'
                        )
                    elif isinstance(content, str):
                        text = content
                    else:
                        text = ''
                    text = re.sub(r'\s+', ' ', text).strip()
                    # Skip messages that are pure greetings
                    if text.lower() in GREETINGS:
                        continue
                    # Iteratively strip leading filler tokens until stable
                    prev_text = None
                    while prev_text != text:
                        prev_text = text
                        text = LEADING_FILLER.sub('', text).strip()
                    # Skip if what remains is too short
                    if len(text) < 20:
                        continue
                    preview = text[:80] + ('...' if len(text) > 80 else '')
            if shape_ok and preview: break
except Exception:
    pass
if shape_ok:
    print(preview if preview else "(no preview — first 30 lines were greetings or short)")
    sys.exit(0)
sys.exit(1)
PYEOF
)"; then
      mtime="$(stat -f '%Sm' -t '%Y-%m-%d' "$f" 2>/dev/null || stat -c '%y' "$f" 2>/dev/null | cut -d' ' -f1)"
      size="$(stat -f '%z' "$f" 2>/dev/null || stat -c '%s' "$f" 2>/dev/null)"
      printf '%s\t%s\t%s\t%s\n' "$mtime" "$size" "$f" "$preview" >>"$TMP"
    fi
  done < <(find "$loc" -type f -name '*.jsonl' -print0 2>/dev/null)
done
printf "\n" >&2

count=$(wc -l <"$TMP" | tr -d ' ')
if [ "$count" -eq 0 ]; then
  echo "No orphan Claude Code transcripts found in scanned backup locations."
  exit 0
fi
sort -k1,1 "$TMP" >"$TMP.s"
oldest="$(head -n 1 "$TMP.s" | cut -f1)"
newest="$(tail -n 1 "$TMP.s" | cut -f1)"
echo "Found $count orphan Claude Code transcript(s). Oldest: $oldest  Newest: $newest"
echo "----------------------------------------------------------------------"
awk -F'\t' '{ printf "%s  %10s  %s\n              \"%s\"\n\n", $1, $2, $3, $4 }' "$TMP.s"