feat(normalize): Gemini CLI session JSONL adapter

Adds a sixth format adapter to mempalace.normalize alongside the
existing Claude Code, Codex, Claude.ai, ChatGPT, and Slack parsers.
After this lands, mempalace mine --mode convos ingests Gemini CLI
session history without manual export.

Why now: Claude Code and Codex CLI are already supported by convo_miner;
adding Gemini closes the major-CLI-tool coverage gap. After this lands,
the README's "verbatim conversation history" promise is honestly
delivered for all three top-tier API-keyed coding CLIs (Claude Code,
Codex CLI, Gemini CLI), not just two of them. This is the third leg
of the trio Aya pushed for so the public claim matches the actual
ingest pipeline.

Gemini CLI stores sessions at ~/.gemini/tmp/<project_hash>/chats/ as
JSONL. The on-disk schema (per google-gemini/gemini-cli#15292):

    {"type":"session_metadata","sessionId":"...","projectHash":"...",...}
    {"type":"user","id":"msg1","content":[{"text":"Hello"}]}
    {"type":"gemini","id":"msg2","content":[{"text":"Hi"}]}
    {"type":"message_update","id":"msg2","tokens":{"input":10,"output":5}}

The new _try_gemini_jsonl parser:

  - requires a session_metadata record so it does not false-positive
    against Claude Code or Codex JSONL passing through the dispatch
    chain in _try_normalize_json
  - extracts user/gemini message text from each entry's content array
    of {"text": "..."} blocks, joining multiple blocks per message
    in order
  - skips message_update entries (token-count deltas with no message
    text) and any other unknown record types
  - returns None when fewer than two conversational messages are
    present, mirroring the codex parser's >=2-message guard

Test coverage: 9 new unit tests in tests/test_normalize.py mirroring
the codex test pattern - happy path, multi-turn, missing session
metadata, message_update skip, single-message rejection, multi-block
content concatenation, empty content skip, malformed-line resilience,
and explicit no-match against codex JSONL fixtures. Schema-level only;
real Gemini CLI session fixtures are a follow-up once a real user file
is available.

Closes part of #59 (the Gemini CLI portion of the umbrella request).
This commit is contained in:
MSL
2026-04-27 00:44:40 -07:00
parent 899a5ec4c6
commit f4440f1ce0
2 changed files with 208 additions and 0 deletions
+64
View File
@@ -157,6 +157,10 @@ def _try_normalize_json(content: str) -> Optional[str]:
if normalized:
return normalized
normalized = _try_gemini_jsonl(content)
if normalized:
return normalized
try:
data = json.loads(content)
except json.JSONDecodeError:
@@ -280,6 +284,66 @@ def _try_codex_jsonl(content: str) -> Optional[str]:
return None
def _gemini_text_parts(content) -> list:
    """Return the non-blank text fragments found in a Gemini ``content`` value.

    Accepted shapes:
      - a list of ``{"text": "..."}`` blocks (the shape shown in the
        schema notes for this adapter),
      - a list that also contains bare strings,
      - a single ``{"text": "..."}`` block,
      - a bare string.

    NOTE(review): the non-list shapes are accepted because the Gemini SDK
    content type (``PartListUnion``) permits them — confirm against a real
    session file. Any other value yields an empty list.
    """
    if isinstance(content, (str, dict)):
        # Normalize the single-part shapes so one loop handles everything.
        content = [content]
    elif not isinstance(content, list):
        return []

    parts = []
    for block in content:
        text = block.get("text", "") if isinstance(block, dict) else block
        # Non-string values (numbers, nested lists, None) and
        # whitespace-only text contribute nothing to the turn.
        if isinstance(text, str) and text.strip():
            parts.append(text)
    return parts


def _try_gemini_jsonl(content: str) -> Optional[str]:
    """Gemini CLI sessions (~/.gemini/tmp/<project_hash>/chats/session-*.jsonl).

    Schema (per google-gemini/gemini-cli#15292): a ``session_metadata``
    record, then a stream of ``{"type": "user", "content":
    [{"text": "..."}]}`` and ``{"type": "gemini", "content": [...]}``
    records, with optional ``message_update`` records carrying token
    counts only.

    Detection requires a ``session_metadata`` record (anywhere in the
    stream) so this parser does not false-positive against Claude Code or
    Codex JSONL passed through the dispatch chain. ``message_update``
    entries and any other record types are skipped. Multiple text blocks
    within a single message are concatenated in order, separated by
    newlines; messages that yield no text are dropped.

    Args:
        content: Raw file contents, one JSON document per line.

    Returns:
        The transcript built by ``_messages_to_transcript`` when the input
        has a ``session_metadata`` record and at least two conversational
        messages; otherwise ``None``.
    """
    # Split on "\n" only: JSON strings may legally contain other Unicode
    # line separators, which str.splitlines() would break apart.
    lines = [line.strip() for line in content.strip().split("\n") if line.strip()]

    messages = []
    has_session_metadata = False
    for line in lines:
        try:
            entry = json.loads(line)
        except json.JSONDecodeError:
            # Best-effort: one corrupt line must not discard the session.
            continue
        if not isinstance(entry, dict):
            continue

        entry_type = entry.get("type", "")
        if entry_type == "session_metadata":
            has_session_metadata = True
            continue
        if entry_type not in ("user", "gemini"):
            # Skips message_update, system events, anything else.
            continue

        parts = _gemini_text_parts(entry.get("content", []))
        if not parts:
            # An empty turn would corrupt the transcript.
            continue

        role = "user" if entry_type == "user" else "assistant"
        messages.append((role, "\n".join(parts)))

    if len(messages) >= 2 and has_session_metadata:
        return _messages_to_transcript(messages)
    return None
def _try_claude_ai_json(data) -> Optional[str]:
"""Claude.ai JSON export: flat messages list or privacy export with chat_messages."""
if isinstance(data, dict):
+144
View File
@@ -11,6 +11,7 @@ from mempalace.normalize import (
_try_claude_ai_json,
_try_claude_code_jsonl,
_try_codex_jsonl,
_try_gemini_jsonl,
_try_normalize_json,
_try_slack_json,
normalize,
@@ -450,6 +451,149 @@ def test_codex_jsonl_payload_not_dict():
assert result is not None
# ── _try_gemini_jsonl ──────────────────────────────────────────────────
#
# Gemini CLI sessions live at ``~/.gemini/tmp/<project_hash>/chats/`` as
# JSONL. The schema (per google-gemini/gemini-cli#15292):
#
# {"type":"session_metadata","sessionId":"...","projectHash":"...",...}
# {"type":"user","id":"msg1","content":[{"text":"Hello"}]}
# {"type":"gemini","id":"msg2","content":[{"text":"Hi"}]}
# {"type":"message_update","id":"msg2","tokens":{"input":10,"output":5}}
#
# Detection requires a ``session_metadata`` record so this parser does
# not false-positive against Claude Code or Codex JSONL. ``message_update``
# entries (token-count deltas only) are skipped — they carry no message
# text. ``content`` is an array of ``{"text": "..."}`` blocks; we join
# all text blocks for a given message.
def test_gemini_jsonl_valid():
    """Happy path: metadata plus one user/gemini exchange produces a transcript."""
    records = [
        {"type": "session_metadata", "sessionId": "abc", "projectHash": "h"},
        {"type": "user", "id": "m1", "content": [{"text": "Hello"}]},
        {"type": "gemini", "id": "m2", "content": [{"text": "Hi there"}]},
    ]
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    assert "> Hello" in transcript
    assert "Hi there" in transcript
def test_gemini_jsonl_multi_turn():
    """Two full question/answer rounds all show up in the transcript."""
    records = [
        {"type": "session_metadata", "sessionId": "s"},
        {"type": "user", "content": [{"text": "Q1"}]},
        {"type": "gemini", "content": [{"text": "A1"}]},
        {"type": "user", "content": [{"text": "Q2"}]},
        {"type": "gemini", "content": [{"text": "A2"}]},
    ]
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    for expected in ("> Q1", "A1", "> Q2", "A2"):
        assert expected in transcript
def test_gemini_jsonl_no_session_metadata():
    """A stream lacking session_metadata is rejected (None), which guards
    against false positives on Claude Code / Codex JSONL passed through the
    dispatch chain."""
    records = [
        {"type": "user", "content": [{"text": "Hi"}]},
        {"type": "gemini", "content": [{"text": "Hello"}]},
    ]
    payload = "\n".join(json.dumps(record) for record in records)

    assert _try_gemini_jsonl(payload) is None
def test_gemini_jsonl_skips_message_update():
    """message_update records hold only token counts, so nothing from them
    may leak into the transcript as an empty drawer or a duplicated
    assistant turn."""
    token_update = {"type": "message_update", "id": "m2", "tokens": {"input": 10, "output": 5}}
    records = [
        {"type": "session_metadata"},
        {"type": "user", "content": [{"text": "Q"}]},
        {"type": "gemini", "content": [{"text": "A"}]},
        token_update,
    ]
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    assert "tokens" not in transcript
    assert "input" not in transcript
def test_gemini_jsonl_too_few_messages():
    """Fewer than two conversational messages yields None, mirroring the
    codex/claude_code behavior."""
    records = [
        {"type": "session_metadata"},
        {"type": "user", "content": [{"text": "only one msg"}]},
    ]
    payload = "\n".join(json.dumps(record) for record in records)

    assert _try_gemini_jsonl(payload) is None
def test_gemini_jsonl_multi_block_content():
    """A single message can have multiple text blocks in its content array
    (e.g. a thinking block + a final answer). Both should be concatenated
    into one transcript turn, in order."""
    lines = [
        json.dumps({"type": "session_metadata"}),
        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
        json.dumps(
            {
                "type": "gemini",
                "content": [{"text": "First part."}, {"text": "Second part."}],
            }
        ),
    ]
    result = _try_gemini_jsonl("\n".join(lines))
    assert result is not None
    assert "First part." in result
    assert "Second part." in result
    # Presence alone would not catch the blocks being emitted in the wrong
    # order, so pin the ordering the docstring promises.
    assert result.index("First part.") < result.index("Second part.")
def test_gemini_jsonl_empty_content_skipped():
    """A message with an empty content array must be dropped, not emitted
    as an empty turn that would corrupt the transcript."""
    records = [
        {"type": "session_metadata"},
        {"type": "user", "content": []},
        {"type": "user", "content": [{"text": "real Q"}]},
        {"type": "gemini", "content": [{"text": "real A"}]},
    ]
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    assert "> real Q" in transcript
    assert "real A" in transcript
def test_gemini_jsonl_invalid_json_lines_skipped():
    """One malformed line mid-stream must not abort parsing; the remaining
    records should still produce a transcript."""
    metadata = json.dumps({"type": "session_metadata"})
    turns = [
        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
        json.dumps({"type": "gemini", "content": [{"text": "A"}]}),
    ]
    # The garbage line sits between the metadata and the real turns.
    payload = "\n".join([metadata, "not-valid-json{", *turns])

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    assert "> Q" in transcript
def test_gemini_jsonl_does_not_match_codex():
    """Codex-shaped JSONL must be rejected by the gemini adapter: the
    dispatch chain in _try_normalize_json relies on each adapter returning
    None for formats it does not recognize."""
    codex_records = [
        {"type": "session_meta", "payload": {}},
        {"type": "event_msg", "payload": {"type": "user_message", "message": "Q"}},
        {"type": "event_msg", "payload": {"type": "agent_message", "message": "A"}},
    ]
    payload = "\n".join(json.dumps(record) for record in codex_records)

    assert _try_gemini_jsonl(payload) is None
# ── _try_claude_ai_json ───────────────────────────────────────────────