tools/find_orphan_claude_jsonls.sh

#!/usr/bin/env bash
# find_orphan_claude_jsonls.sh — v3 (multi-line shape + verb-aware preview)
# -----------------------------------------------------------------------------
# Finds Claude Code conversation transcripts (.jsonl) that may have survived in
# backup/sync locations. Claude Code stores transcripts at
# ~/.claude/projects/<encoded>/<session>.jsonl and auto-deletes them locally
# after 30 days. If your machine syncs to iCloud, Dropbox, Google Drive, or
# OneDrive, backs up to a mounted volume (e.g. Time Machine), or you copied
# transcripts elsewhere manually, those copies may still exist. This script
# finds them and shows a topic preview taken from the first substantive user
# message, with leading filler interjections ("ok so", "oh", "well", "hey")
# stripped so the preview surfaces the actual content.
#
# Read-only. Safe to re-run.
# -----------------------------------------------------------------------------
# Strict mode: stop on any unhandled command failure (-e) or unset variable (-u).
set -eu

# Directories that are searched recursively for surviving transcripts.
# Entries that do not exist on this machine are skipped by the scan loop.
LOCATIONS=(
  "$HOME/Library/Mobile Documents"
  "$HOME/Dropbox"
  "$HOME/Google Drive"
  "$HOME/OneDrive"
  "$HOME/Documents"
  "$HOME/Desktop"
  "/Volumes"
)

# Scratch file that collects one tab-separated record per transcript found.
# The report step writes its sorted copy next to it as "$TMP.s"; both files
# are removed whenever the script exits.
TMP="$(mktemp)"
remove_scratch_files() {
  rm -f "$TMP" "$TMP.s"
}
trap remove_scratch_files EXIT

# -----------------------------------------------------------------------------
# Scan phase. Walks every existing directory in LOCATIONS, probes each *.jsonl
# file for the Claude Code transcript shape, and appends one record per hit to
# "$TMP" in the form:
#   <mtime YYYY-MM-DD> TAB <size in bytes> TAB <path> TAB <preview>
# Progress (one dot per scanned location) goes to stderr.
# -----------------------------------------------------------------------------

# Print the modification date (YYYY-MM-DD) of file $1.
# Returns non-zero and prints nothing when neither stat flavour can read it.
#
# The two stat dialects are captured separately on purpose: GNU stat treats
# "-f" as --file-system, so the BSD-style call prints filesystem details on
# stdout before failing. Chaining both calls with "||" inside one command
# substitution would glue that output onto the real answer.
file_mtime() {
  local out
  if out="$(stat -c '%y' "$1" 2>/dev/null)"; then
    # GNU prints "YYYY-MM-DD HH:MM:SS.NNN +ZZZZ"; keep only the date.
    printf '%s\n' "${out%% *}"
  elif out="$(stat -f '%Sm' -t '%Y-%m-%d' "$1" 2>/dev/null)"; then
    printf '%s\n' "$out"
  else
    return 1
  fi
}

# Print the size in bytes of file $1 (GNU stat first, then BSD stat).
# Returns non-zero and prints nothing when neither flavour can read it.
file_size() {
  local out
  if out="$(stat -c '%s' "$1" 2>/dev/null)" ||
    out="$(stat -f '%z' "$1" 2>/dev/null)"; then
    printf '%s\n' "$out"
  else
    return 1
  fi
}

# Inspect the first 30 lines of file $1.
# Exit status 0: the file has the transcript shape; a one-line topic preview
#                (or a "(no preview ...)" placeholder) is printed on stdout.
# Exit status 1: not a transcript, or the file could not be read.
# Any other status comes from the shell/interpreter (e.g. 127 when python3 is
# not installed). PYTHONIOENCODING forces UTF-8 output so that a non-UTF-8
# locale cannot make print() fail on a non-ASCII preview, which would be
# indistinguishable from "not a transcript".
probe_transcript() {
  PYTHONIOENCODING=utf-8 python3 - "$1" 2>/dev/null <<'PYEOF'
import json, sys, re

# Single-word/short greetings — message gets skipped entirely if it is just one of these
GREETINGS = {'hi','hey','hello','thanks','thank you','ok','okay','yes','no',
             'sure','cool','great','good','done','yep','nope','perfect','copy'}

# Leading filler — interjections that get STRIPPED from the start of a message
# before the preview is taken. Iterative — handles "ok so well, then..." → "then..."
LEADING_FILLER = re.compile(
    r'^(?:ok(?:ay)?|so|oh|well|anyway|btw|hmm+|um+|uh+|hey|hi|hello|right|'
    r'yes|no|sure|cool|great|good|listen|look|wait|actually|alright|gotcha|'
    r'yeah|yep|nope|nah)\b[\s,!.?:;-]*',
    re.IGNORECASE
)

path = sys.argv[1]
shape_ok = False
preview = ""
try:
    # Decode as UTF-8 explicitly instead of relying on the locale default;
    # undecodable bytes are replaced rather than raising.
    with open(path, 'r', encoding='utf-8', errors='replace') as fh:
        for i, line in enumerate(fh):
            if i >= 30: break
            try:
                d = json.loads(line)
            except Exception:
                continue
            if not isinstance(d, dict): continue
            # Shape check — accept if any line in first 30 has session fields
            if not shape_ok and 'sessionId' in d and 'timestamp' in d and 'message' in d:
                shape_ok = True
            # Preview — first user message after stripping leading filler
            if not preview:
                # A null / non-object "message" must not abort the scan of the
                # remaining lines, so treat it as an empty message.
                message = d.get('message')
                if not isinstance(message, dict):
                    message = {}
                role = d.get('type', '') or message.get('role', '')
                if role == 'user':
                    content = message.get('content', '')
                    if isinstance(content, list):
                        # Only string-valued text parts are joined; anything
                        # else (tool results, null text) is ignored.
                        text = ' '.join(
                            c['text'] for c in content
                            if isinstance(c, dict) and c.get('type') == 'text'
                            and isinstance(c.get('text'), str)
                        )
                    elif isinstance(content, str):
                        text = content
                    else:
                        text = ''
                    text = re.sub(r'\s+', ' ', text).strip()
                    # Skip messages that are pure greetings
                    if text.lower() in GREETINGS:
                        continue
                    # Iteratively strip leading filler tokens until stable
                    prev_text = None
                    while prev_text != text:
                        prev_text = text
                        text = LEADING_FILLER.sub('', text).strip()
                    # Skip if what remains is too short
                    if len(text) < 20:
                        continue
                    preview = text[:80] + ('...' if len(text) > 80 else '')
            if shape_ok and preview: break
except Exception:
    # Best effort: an unreadable file is reported with whatever was gathered
    # before the error (usually nothing, i.e. "not a transcript").
    pass
if shape_ok:
    print(preview if preview else "(no preview — first 30 lines were greetings or short)")
    sys.exit(0)
sys.exit(1)
PYEOF
}

printf "Scanning backup locations" >&2
for loc in "${LOCATIONS[@]}"; do
  [ -d "$loc" ] || continue
  printf "." >&2
  # NUL-delimited read so paths with spaces or newlines arrive intact.
  while IFS= read -r -d '' f; do
    probe_status=0
    preview="$(probe_transcript "$f")" || probe_status=$?
    case "$probe_status" in
      0) ;;
      126 | 127)
        # The interpreter itself could not be started. Carrying on would end
        # in a misleading "no transcripts found", so stop with an error.
        printf '\nerror: python3 is required but could not be run\n' >&2
        exit "$probe_status"
        ;;
      *) continue ;; # not a transcript (or unreadable)
    esac
    # A file that can no longer be stat'ed has vanished since find listed it
    # (sync folders change underfoot); skip it instead of aborting the scan.
    mtime="$(file_mtime "$f")" || continue
    size="$(file_size "$f")" || continue
    printf '%s\t%s\t%s\t%s\n' "$mtime" "$size" "$f" "$preview" >>"$TMP"
  done < <(find "$loc" -type f -name '*.jsonl' -print0 2>/dev/null)
done
printf "\n" >&2

# Report phase: summarise the records gathered in "$TMP" and list them oldest
# first.

# Number of records = number of lines; wc may pad its output with spaces.
count="$(wc -l <"$TMP")"
count="${count// /}"

case "$count" in
  0)
    echo "No orphan Claude Code transcripts found in scanned backup locations."
    exit 0
    ;;
esac

# Order by the first field (modification date) into the sibling scratch file.
sorted="$TMP.s"
sort -k1,1 "$TMP" >"$sorted"

# First and last records of the sorted list carry the date range.
oldest="$(sed -n '1p' "$sorted" | cut -f1)"
newest="$(sed -n '$p' "$sorted" | cut -f1)"

printf '%s\n' "Found $count orphan Claude Code transcript(s). Oldest: $oldest  Newest: $newest"
printf '%s\n' "----------------------------------------------------------------------"

# One entry per transcript: date, right-aligned size, path, then the quoted
# preview on an indented second line followed by a blank line.
awk -F'\t' '{ printf "%s  %10s  %s\n              \"%s\"\n\n", $1, $2, $3, $4 }' "$sorted"
docs: add 30-day expiry callout + ship 4 auto-save tools 2026-05-06 12:35:01 -07:00			`#!/usr/bin/env bash`
			`# find_orphan_claude_jsonls.sh — v3 (multi-line shape + verb-aware preview)`
			`# -----------------------------------------------------------------------------`
			`# Finds Claude Code conversation transcripts (.jsonl) that may have survived in`
			`# backup/sync locations. Claude Code stores transcripts at`
			`# ~/.claude/projects/<encoded>/<session>.jsonl and auto-deletes them locally`
			`# after 30 days. If your machine syncs to iCloud, Dropbox, Google Drive,`
			`# OneDrive, Time Machine, or you copied transcripts elsewhere manually, those`
			`# copies still exist. This script finds them and shows a topic preview from`
			`# the first substantive user message — strips leading filler interjections`
			`# ("ok so", "oh", "well", "hey") so previews surface the actual content.`
			`#`
			`# Read-only. Safe to re-run.`
			`# -----------------------------------------------------------------------------`
			`set -eu`

			`LOCATIONS=(`
			`"$HOME/Library/Mobile Documents" "$HOME/Dropbox" "$HOME/Google Drive"`
			`"$HOME/OneDrive" "$HOME/Documents" "$HOME/Desktop" "/Volumes"`
			`)`

			`TMP="$(mktemp)"; trap 'rm -f "$TMP" "$TMP.s"' EXIT`

			`printf "Scanning backup locations" >&2`
			`for loc in "${LOCATIONS[@]}"; do`
			`[ -d "$loc" ] \|\| continue`
			`printf "." >&2`
			`while IFS= read -r -d '' f; do`
			`# Combined: shape detection (multi-line) + verb-aware topic preview`
			`if preview="$(python3 - "$f" 2>/dev/null <<'PYEOF'`
			`import json, sys, re`

			`# Single-word/short greetings — message gets skipped entirely if it is just one of these`
			`GREETINGS = {'hi','hey','hello','thanks','thank you','ok','okay','yes','no',`
			`'sure','cool','great','good','done','yep','nope','perfect','copy'}`

			`# Leading filler — interjections that get STRIPPED from the start of a message`
			`# before the preview is taken. Iterative — handles "ok so well, then..." → "then..."`
			`LEADING_FILLER = re.compile(`
			`r'^(?:ok(?:ay)?\|so\|oh\|well\|anyway\|btw\|hmm+\|um+\|uh+\|hey\|hi\|hello\|right\|'`
			`r'yes\|no\|sure\|cool\|great\|good\|listen\|look\|wait\|actually\|alright\|gotcha\|'`
			`r'yeah\|yep\|nope\|nah)\b[\s,!.?:;-]*',`
			`re.IGNORECASE`
			`)`

			`path = sys.argv[1]`
			`shape_ok = False`
			`preview = ""`
			`try:`
			`with open(path, 'r', errors='replace') as fh:`
			`for i, line in enumerate(fh):`
			`if i >= 30: break`
			`try:`
			`d = json.loads(line)`
			`except Exception:`
			`continue`
			`if not isinstance(d, dict): continue`
			`# Shape check — accept if any line in first 30 has session fields`
			`if not shape_ok and 'sessionId' in d and 'timestamp' in d and 'message' in d:`
			`shape_ok = True`
			`# Preview — first user message after stripping leading filler`
			`if not preview:`
			`role = d.get('type', '') or d.get('message', {}).get('role', '')`
			`if role == 'user':`
			`content = d.get('message', {}).get('content', '')`
			`if isinstance(content, list):`
			`text = ' '.join(`
			`c.get('text', '') for c in content`
			`if isinstance(c, dict) and c.get('type') == 'text'`
			`)`
			`elif isinstance(content, str):`
			`text = content`
			`else:`
			`text = ''`
			`text = re.sub(r'\s+', ' ', text).strip()`
			`# Skip messages that are pure greetings`
			`if text.lower() in GREETINGS:`
			`continue`
			`# Iteratively strip leading filler tokens until stable`
			`prev_text = None`
			`while prev_text != text:`
			`prev_text = text`
			`text = LEADING_FILLER.sub('', text).strip()`
			`# Skip if what remains is too short`
			`if len(text) < 20:`
			`continue`
			`preview = text[:80] + ('...' if len(text) > 80 else '')`
			`if shape_ok and preview: break`
			`except Exception:`
			`pass`
			`if shape_ok:`
			`print(preview if preview else "(no preview — first 30 lines were greetings or short)")`
			`sys.exit(0)`
			`sys.exit(1)`
			`PYEOF`
			`)"; then`
			`mtime="$(stat -f '%Sm' -t '%Y-%m-%d' "$f" 2>/dev/null \|\| stat -c '%y' "$f" 2>/dev/null \| cut -d' ' -f1)"`
			`size="$(stat -f '%z' "$f" 2>/dev/null \|\| stat -c '%s' "$f" 2>/dev/null)"`
			`printf '%s\t%s\t%s\t%s\n' "$mtime" "$size" "$f" "$preview" >>"$TMP"`
			`fi`
			`done < <(find "$loc" -type f -name '*.jsonl' -print0 2>/dev/null)`
			`done`
			`printf "\n" >&2`

			`count=$(wc -l <"$TMP" \| tr -d ' ')`
			`if [ "$count" -eq 0 ]; then`
			`echo "No orphan Claude Code transcripts found in scanned backup locations."`
			`exit 0`
			`fi`
			`sort -k1,1 "$TMP" >"$TMP.s"`
			`oldest="$(head -n 1 "$TMP.s" \| cut -f1)"`
			`newest="$(tail -n 1 "$TMP.s" \| cut -f1)"`
			`echo "Found $count orphan Claude Code transcript(s). Oldest: $oldest Newest: $newest"`
			`echo "----------------------------------------------------------------------"`
			`awk -F'\t' '{ printf "%s %10s %s\n \"%s\"\n\n", $1, $2, $3, $4 }' "$TMP.s"`