feat(convo): parse Claude Code conversation dirs into project entities
Claude Code stores sessions under `~/.claude/projects/<slug>/<id>.jsonl` where `<slug>` is the original CWD with `/` replaced by `-`. That encoding is lossy — can't distinguish `foo-bar` (one segment) from `foo/bar` (two) — so slug-decoding alone produces wrong names for any hyphenated project. Fortunately, every message record carries a `cwd` field with the true path. This scanner reads one record per session to recover the accurate project name deterministically, falling back to slug-decoding only if the JSONL is malformed or empty. Output shape matches project_scanner.ProjectInfo so the discover orchestrator can union results across sources. Session count doubles as a density signal for ranking. 22 unit tests cover: root detection, cwd extraction with malformed input tolerance, fallback slug decoding, name resolution using the newest session (so renames win), and dedup when two encoded dirs resolve to the same project.
This commit is contained in:
@@ -0,0 +1,152 @@
|
||||
"""
convo_scanner.py — Parse Claude Code conversation directories into ProjectInfo.

Claude Code stores sessions under ``~/.claude/projects/<slug>/<id>.jsonl``,
where the ``<slug>`` is the original CWD with ``/`` replaced by ``-``. That
encoding is lossy: we can't tell whether ``foo-bar`` in a slug is the
literal project name ``foo-bar`` or two path segments ``foo/bar``.

Fortunately, every message record in the JSONL carries a ``cwd`` field with
the true path. This scanner reads one record per session to recover the
accurate project name, falling back to slug-decoding only if the JSONL
is malformed or empty.

Output is the same ``ProjectInfo`` shape used by ``project_scanner``, so the
``discover_entities`` orchestrator can mix-and-match sources.

Public:
    is_claude_projects_root(path) -> bool
    scan_claude_projects(path) -> list[ProjectInfo]
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from mempalace.project_scanner import ProjectInfo
|
||||
|
||||
|
||||
# Cap on how far into each session file we look for a record carrying `cwd`.
# The cap counts every physical line, including blank or unparseable ones.
MAX_HEADER_LINES = 20  # lines to read per session looking for `cwd`
|
||||
|
||||
|
||||
def is_claude_projects_root(path: Path) -> bool:
    """Return True if ``path`` looks like a ``.claude/projects/`` directory.

    Heuristic: at least one child directory whose name starts with ``-`` and
    which directly contains at least one regular file with a ``.jsonl``
    suffix.

    This is a best-effort probe: any ``OSError`` raised while inspecting
    ``path`` makes the result False, and an ``OSError`` raised while
    inspecting one child only disqualifies that child.
    """
    try:
        if not path.is_dir():
            return False
        children = list(path.iterdir())
    except OSError:
        return False

    for child in children:
        # Name test first: it needs no filesystem access.
        if not child.name.startswith("-"):
            continue
        try:
            if child.is_dir() and any(
                p.suffix == ".jsonl" and p.is_file() for p in child.iterdir()
            ):
                return True
        except OSError:
            # Unreadable child: keep looking at its siblings.
            continue
    return False
|
||||
|
||||
|
||||
def _extract_cwd_from_session(session_file: Path) -> Optional[str]:
|
||||
"""Return the ``cwd`` from the first message record that carries one.
|
||||
|
||||
Returns None if the file can't be read, has no JSON, or no record has cwd.
|
||||
"""
|
||||
try:
|
||||
with open(session_file, encoding="utf-8", errors="replace") as f:
|
||||
for i, line in enumerate(f):
|
||||
if i >= MAX_HEADER_LINES:
|
||||
break
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
obj = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
cwd = obj.get("cwd")
|
||||
if isinstance(cwd, str) and cwd:
|
||||
return cwd
|
||||
except OSError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _decode_slug_fallback(slug: str) -> str:
|
||||
"""Best-effort project name from slug when cwd is unavailable.
|
||||
|
||||
The slug is lossy (`/` and `-` both become `-`). Last non-empty segment
|
||||
is the closest guess at the project name, preserving kebab-case is
|
||||
impossible without cwd.
|
||||
"""
|
||||
stripped = slug.lstrip("-")
|
||||
parts = [p for p in stripped.split("-") if p]
|
||||
return parts[-1] if parts else slug
|
||||
|
||||
|
||||
def _resolve_project_name(project_dir: Path) -> str:
    """Recover the original project name for one encoded project directory.

    The ``*.jsonl`` files directly inside ``project_dir`` are tried
    newest-first by mtime, and the first ``cwd`` found decides the name: its
    final path component, or the whole ``cwd`` when that component is empty
    (e.g. a filesystem root).

    Falls back to slug-decoding the directory name if the directory can't be
    listed or no session has a readable ``cwd``.
    """

    def _mtime(session: Path) -> float:
        # A session file can vanish between listing and stat; rank it last
        # instead of letting the OSError abort name resolution.
        try:
            return session.stat().st_mtime
        except OSError:
            return float("-inf")

    try:
        sessions = [
            p for p in project_dir.iterdir() if p.suffix == ".jsonl" and p.is_file()
        ]
    except OSError:
        sessions = []

    # Newest first: most likely to be well-formed and to reflect the current name.
    sessions.sort(key=_mtime, reverse=True)

    for session in sessions:
        cwd = _extract_cwd_from_session(session)
        if cwd:
            return Path(cwd).name or cwd
    return _decode_slug_fallback(project_dir.name)
|
||||
|
||||
|
||||
def scan_claude_projects(path: str | Path) -> list[ProjectInfo]:
    """Scan a ``.claude/projects/`` directory for Claude Code conversations.

    Every child directory whose name starts with ``-`` and that holds at
    least one ``.jsonl`` session file is resolved to a project name. Results
    are deduplicated by that name: when several directories resolve to the
    same name, the one with strictly more sessions is kept (on a tie, the
    first in sorted directory order).

    ``has_git`` is False (the directory isn't a repo itself) but
    ``total_commits`` and ``user_commits`` are repurposed here as session
    count so the UX surfaces a density signal for ranking. ``repo_root`` is
    the encoded conversation directory, not the original working directory.

    Returns an empty list when ``path`` doesn't look like a Claude projects
    root. Directories that can't be read are skipped. The result is ordered
    by descending session count, then by name.
    """
    root = Path(path).expanduser().resolve()
    if not is_claude_projects_root(root):
        return []

    try:
        subdirs = sorted(root.iterdir())
    except OSError:
        # The root became unreadable after the probe above.
        return []

    projects: dict[str, ProjectInfo] = {}
    for sub in subdirs:
        if not sub.name.startswith("-"):
            continue
        try:
            if not sub.is_dir():
                continue
            session_count = sum(
                1 for p in sub.iterdir() if p.suffix == ".jsonl" and p.is_file()
            )
            if not session_count:
                continue
            name = _resolve_project_name(sub)
        except OSError:
            # Unreadable or vanished mid-scan: skip this entry, keep scanning.
            continue

        existing = projects.get(name)
        if existing is None or session_count > existing.user_commits:
            projects[name] = ProjectInfo(
                name=name,
                repo_root=sub,
                manifest=None,
                has_git=False,
                total_commits=session_count,
                user_commits=session_count,
                is_mine=True,  # Claude Code sessions are authored by the user
            )

    return sorted(
        projects.values(),
        key=lambda p: (-p.user_commits, p.name),
    )
|
||||
Reference in New Issue
Block a user