feat(normalize): Gemini CLI session JSONL adapter

Adds a sixth format adapter to mempalace.normalize alongside the existing Claude Code, Codex, Claude.ai, ChatGPT, and Slack parsers. After this lands, mempalace mine --mode convos ingests Gemini CLI session history without manual export. Why now: Claude Code and Codex CLI are already supported by convo_miner; adding Gemini closes the major-CLI-tool coverage gap. After this lands, the README's "verbatim conversation history" promise is honestly delivered for all three top-tier API-keyed coding CLIs (Claude Code, Codex CLI, Gemini CLI), not just two of them. This is the third leg of the trio Aya pushed for so the public claim matches the actual ingest pipeline. Gemini CLI stores sessions at ~/.gemini/tmp/<project_hash>/chats/ as JSONL. The on-disk schema (per google-gemini/gemini-cli#15292): {"type":"session_metadata","sessionId":"...","projectHash":"...",...} {"type":"user","id":"msg1","content":[{"text":"Hello"}]} {"type":"gemini","id":"msg2","content":[{"text":"Hi"}]} {"type":"message_update","id":"msg2","tokens":{"input":10,"output":5}} The new _try_gemini_jsonl parser: - requires a session_metadata record so it does not false-positive against Claude Code or Codex JSONL passing through the dispatch chain in _try_normalize_json - extracts user/gemini message text from each entry's content array of {"text": "..."} blocks, joining multiple blocks per message in order - skips message_update entries (token-count deltas with no message text) and any other unknown record types - returns None when fewer than two conversational messages are present, mirroring the codex parser's >=2-message guard Test coverage: 9 new unit tests in tests/test_normalize.py mirroring the codex test pattern - happy path, multi-turn, missing session metadata, message_update skip, single-message rejection, multi-block content concatenation, empty content skip, malformed-line resilience, and explicit no-match against codex JSONL fixtures. 
Schema-level only; real Gemini CLI session fixtures are a follow-up once a real user file is available. Closes part of #59 (the Gemini CLI portion of the umbrella request).
2026-04-27 00:44:40 -07:00
parent 899a5ec4c6
commit f4440f1ce0
2 changed files with 208 additions and 0 deletions
@@ -157,6 +157,10 @@ def _try_normalize_json(content: str) -> Optional[str]:
    if normalized:
        return normalized

+    normalized = _try_gemini_jsonl(content)
+    if normalized:
+        return normalized
+
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
@@ -280,6 +284,66 @@ def _try_codex_jsonl(content: str) -> Optional[str]:
    return None


+def _try_gemini_jsonl(content: str) -> Optional[str]:
+    """Normalize a Gemini CLI session JSONL dump into a transcript.
+
+    Each non-blank line of ``content`` is parsed as JSON on its own; lines
+    that fail to parse, or that decode to something other than a dict,
+    are ignored. Records are then handled by their ``"type"`` value:
+
+    * ``"session_metadata"`` marks the input as a Gemini session. It is
+      accepted wherever it appears in the stream.
+    * ``"user"`` and ``"gemini"`` carry a ``"content"`` list of
+      ``{"text": "..."}`` blocks. The non-blank string texts of one
+      record are joined with newlines into a single turn; ``"gemini"``
+      turns are emitted under the ``"assistant"`` role.
+    * Every other type (``message_update`` and the like) is dropped.
+
+    Returns the result of ``_messages_to_transcript`` when a
+    ``session_metadata`` record was seen and at least two turns were
+    collected; otherwise returns ``None`` so other adapters in the
+    dispatch chain can try the input.
+    """
+    turns = []
+    saw_metadata = False
+
+    for raw_line in content.strip().split("\n"):
+        candidate = raw_line.strip()
+        if not candidate:
+            continue
+
+        try:
+            record = json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(record, dict):
+            continue
+
+        kind = record.get("type", "")
+        if kind == "session_metadata":
+            saw_metadata = True
+            continue
+
+        # Only conversational records produce turns; message_update,
+        # system events and anything unrecognized fall through here.
+        if kind == "user":
+            role = "user"
+        elif kind == "gemini":
+            role = "assistant"
+        else:
+            continue
+
+        blocks = record.get("content", [])
+        if not isinstance(blocks, list):
+            continue
+
+        candidates = [blk.get("text", "") for blk in blocks if isinstance(blk, dict)]
+        texts = [t for t in candidates if isinstance(t, str) and t.strip()]
+        if texts:
+            turns.append((role, "\n".join(texts)))
+
+    if saw_metadata and len(turns) >= 2:
+        return _messages_to_transcript(turns)
+    return None
+
+
 def _try_claude_ai_json(data) -> Optional[str]:
    """Claude.ai JSON export: flat messages list or privacy export with chat_messages."""
    if isinstance(data, dict):
@@ -11,6 +11,7 @@ from mempalace.normalize import (
    _try_claude_ai_json,
    _try_claude_code_jsonl,
    _try_codex_jsonl,
+    _try_gemini_jsonl,
    _try_normalize_json,
    _try_slack_json,
    normalize,
@@ -450,6 +451,149 @@ def test_codex_jsonl_payload_not_dict():
    assert result is not None


+# ── _try_gemini_jsonl ──────────────────────────────────────────────────
+#
+# Gemini CLI sessions live at ``~/.gemini/tmp/<project_hash>/chats/`` as
+# JSONL. The schema (per google-gemini/gemini-cli#15292):
+#
+#   {"type":"session_metadata","sessionId":"...","projectHash":"...",...}
+#   {"type":"user","id":"msg1","content":[{"text":"Hello"}]}
+#   {"type":"gemini","id":"msg2","content":[{"text":"Hi"}]}
+#   {"type":"message_update","id":"msg2","tokens":{"input":10,"output":5}}
+#
+# Detection requires a ``session_metadata`` record so this parser does
+# not false-positive against Claude Code or Codex JSONL. ``message_update``
+# entries (token-count deltas only) are skipped — they carry no message
+# text. ``content`` is an array of ``{"text": "..."}`` blocks; we join
+# all text blocks for a given message.
+
+
+def test_gemini_jsonl_valid():
+    """Happy path: metadata plus one user and one gemini record parses."""
+    records = [
+        {"type": "session_metadata", "sessionId": "abc", "projectHash": "h"},
+        {"type": "user", "id": "m1", "content": [{"text": "Hello"}]},
+        {"type": "gemini", "id": "m2", "content": [{"text": "Hi there"}]},
+    ]
+    payload = "\n".join(json.dumps(record) for record in records)
+    transcript = _try_gemini_jsonl(payload)
+    assert transcript is not None
+    assert "> Hello" in transcript
+    assert "Hi there" in transcript
+
+
+def test_gemini_jsonl_multi_turn():
+    """Two question/answer rounds all show up in the transcript."""
+    records = [
+        {"type": "session_metadata", "sessionId": "s"},
+        {"type": "user", "content": [{"text": "Q1"}]},
+        {"type": "gemini", "content": [{"text": "A1"}]},
+        {"type": "user", "content": [{"text": "Q2"}]},
+        {"type": "gemini", "content": [{"text": "A2"}]},
+    ]
+    payload = "\n".join(json.dumps(record) for record in records)
+    transcript = _try_gemini_jsonl(payload)
+    assert transcript is not None
+    for expected in ("> Q1", "A1", "> Q2", "A2"):
+        assert expected in transcript
+
+
+def test_gemini_jsonl_no_session_metadata():
+    """A stream lacking a session_metadata record is rejected (None), which
+    keeps the adapter from claiming Claude Code / Codex JSONL that reaches
+    it through the dispatch chain."""
+    records = [
+        {"type": "user", "content": [{"text": "Hi"}]},
+        {"type": "gemini", "content": [{"text": "Hello"}]},
+    ]
+    payload = "\n".join(json.dumps(record) for record in records)
+    assert _try_gemini_jsonl(payload) is None
+
+
+def test_gemini_jsonl_skips_message_update():
+    """Token-count-only message_update records must leave no trace in the
+    transcript — neither an empty drawer nor a duplicated assistant turn."""
+    records = [
+        {"type": "session_metadata"},
+        {"type": "user", "content": [{"text": "Q"}]},
+        {"type": "gemini", "content": [{"text": "A"}]},
+        {"type": "message_update", "id": "m2", "tokens": {"input": 10, "output": 5}},
+    ]
+    payload = "\n".join(json.dumps(record) for record in records)
+    transcript = _try_gemini_jsonl(payload)
+    assert transcript is not None
+    for leaked_key in ("tokens", "input"):
+        assert leaked_key not in transcript
+
+
+def test_gemini_jsonl_too_few_messages():
+    """Fewer than two conversational messages yields None, matching the
+    codex/claude_code adapters."""
+    metadata = json.dumps({"type": "session_metadata"})
+    lone_message = json.dumps({"type": "user", "content": [{"text": "only one msg"}]})
+    payload = "\n".join([metadata, lone_message])
+    assert _try_gemini_jsonl(payload) is None
+
+
+def test_gemini_jsonl_multi_block_content():
+    """A single message can have multiple text blocks in its content array
+    (e.g. a thinking block + a final answer). Both should be concatenated
+    into one transcript turn, in order."""
+    lines = [
+        json.dumps({"type": "session_metadata"}),
+        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
+        json.dumps(
+            {
+                "type": "gemini",
+                "content": [{"text": "First part."}, {"text": "Second part."}],
+            }
+        ),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "First part." in result
+    assert "Second part." in result
+    # Pin the "in order" half of the contract: presence alone would still
+    # pass if the adapter emitted the blocks reversed.
+    assert result.index("First part.") < result.index("Second part.")
+
+
+def test_gemini_jsonl_empty_content_skipped():
+    """A record whose content array holds no text is dropped rather than
+    emitted as an empty turn; the surrounding real turns still parse."""
+    records = [
+        {"type": "session_metadata"},
+        {"type": "user", "content": []},
+        {"type": "user", "content": [{"text": "real Q"}]},
+        {"type": "gemini", "content": [{"text": "real A"}]},
+    ]
+    payload = "\n".join(json.dumps(record) for record in records)
+    transcript = _try_gemini_jsonl(payload)
+    assert transcript is not None
+    assert "> real Q" in transcript
+    assert "real A" in transcript
+
+
+def test_gemini_jsonl_invalid_json_lines_skipped():
+    """One unparseable line mid-stream must not abort the parse; the valid
+    records around it still yield a transcript."""
+    metadata = json.dumps({"type": "session_metadata"})
+    question = json.dumps({"type": "user", "content": [{"text": "Q"}]})
+    answer = json.dumps({"type": "gemini", "content": [{"text": "A"}]})
+    payload = "\n".join([metadata, "not-valid-json{", question, answer])
+    transcript = _try_gemini_jsonl(payload)
+    assert transcript is not None
+    assert "> Q" in transcript
+
+
+def test_gemini_jsonl_does_not_match_codex():
+    """Codex-shaped JSONL must come back as None from the gemini adapter,
+    since _try_normalize_json depends on each adapter declining formats
+    it does not recognize."""
+    codex_records = [
+        {"type": "session_meta", "payload": {}},
+        {"type": "event_msg", "payload": {"type": "user_message", "message": "Q"}},
+        {"type": "event_msg", "payload": {"type": "agent_message", "message": "A"}},
+    ]
+    payload = "\n".join(json.dumps(record) for record in codex_records)
+    assert _try_gemini_jsonl(payload) is None
+
+
 # ── _try_claude_ai_json ───────────────────────────────────────────────