def _try_gemini_jsonl(content: str) -> Optional[str]:
    """Normalize a Gemini CLI session log (JSONL) into a transcript.

    Gemini CLI writes sessions to
    ``~/.gemini/tmp/<project_hash>/chats/session-*.jsonl``. Per
    google-gemini/gemini-cli#15292 the file is a ``session_metadata``
    record followed by ``{"type": "user", "content": [{"text": "..."}]}``
    and ``{"type": "gemini", "content": [...]}`` records, optionally
    interleaved with ``message_update`` records that only carry token
    counts.

    A ``session_metadata`` record must be present somewhere in the input;
    without it the input is rejected, so Claude Code or Codex JSONL
    reaching this parser through the dispatch chain is not mistaken for a
    Gemini session. Lines that are not valid JSON, records that are not
    objects, and records of any other type are ignored. All non-blank
    string ``text`` values inside one record's ``content`` list are joined
    with newlines, in order, to form that turn.

    Returns the result of ``_messages_to_transcript`` for the collected
    ``(role, text)`` pairs, or ``None`` when the metadata record is
    missing or fewer than two turns were collected.
    """
    saw_metadata = False
    turns = []

    for raw_line in content.strip().split("\n"):
        record_text = raw_line.strip()
        if not record_text:
            continue

        try:
            record = json.loads(record_text)
        except json.JSONDecodeError:
            # Tolerate a corrupt line; the rest of the session is still usable.
            continue
        if not isinstance(record, dict):
            continue

        kind = record.get("type", "")
        if kind == "session_metadata":
            saw_metadata = True
            continue

        # Only conversational records become turns; message_update and any
        # other record type carry no message text.
        if kind == "user":
            role = "user"
        elif kind == "gemini":
            role = "assistant"
        else:
            continue

        blocks = record.get("content", [])
        if not isinstance(blocks, list):
            continue

        candidates = (piece.get("text", "") for piece in blocks if isinstance(piece, dict))
        fragments = [text for text in candidates if isinstance(text, str) and text.strip()]
        if not fragments:
            # Nothing readable in this record; do not emit an empty turn.
            continue

        turns.append((role, "\n".join(fragments)))

    if not saw_metadata or len(turns) < 2:
        return None
    return _messages_to_transcript(turns)
def _gemini_session(*records):
    """Build JSONL text for a test session.

    Dict records are JSON-encoded; string records are passed through
    verbatim so a test can inject a deliberately malformed line.
    """
    return "\n".join(
        record if isinstance(record, str) else json.dumps(record) for record in records
    )


def test_gemini_jsonl_valid():
    """A minimal metadata + user + gemini session yields a transcript."""
    result = _try_gemini_jsonl(
        _gemini_session(
            {"type": "session_metadata", "sessionId": "abc", "projectHash": "h"},
            {"type": "user", "id": "m1", "content": [{"text": "Hello"}]},
            {"type": "gemini", "id": "m2", "content": [{"text": "Hi there"}]},
        )
    )
    assert result is not None
    assert "> Hello" in result
    assert "Hi there" in result


def test_gemini_jsonl_multi_turn():
    """Several exchanges are all present and appear in session order."""
    result = _try_gemini_jsonl(
        _gemini_session(
            {"type": "session_metadata", "sessionId": "s"},
            {"type": "user", "content": [{"text": "Q1"}]},
            {"type": "gemini", "content": [{"text": "A1"}]},
            {"type": "user", "content": [{"text": "Q2"}]},
            {"type": "gemini", "content": [{"text": "A2"}]},
        )
    )
    assert result is not None
    assert "> Q1" in result
    assert "A1" in result
    assert "> Q2" in result
    assert "A2" in result
    # Turn order must survive normalization, not just turn presence.
    positions = [result.index(marker) for marker in ("> Q1", "A1", "> Q2", "A2")]
    assert positions == sorted(positions)


def test_gemini_jsonl_no_session_metadata():
    """Without session_metadata, parser returns None — guards against false
    positives on Claude Code / Codex JSONL passed through the dispatch chain."""
    result = _try_gemini_jsonl(
        _gemini_session(
            {"type": "user", "content": [{"text": "Hi"}]},
            {"type": "gemini", "content": [{"text": "Hello"}]},
        )
    )
    assert result is None


def test_gemini_jsonl_skips_message_update():
    """message_update records carry only token counts — must be ignored,
    not turned into empty drawers or duplicated assistant turns."""
    result = _try_gemini_jsonl(
        _gemini_session(
            {"type": "session_metadata"},
            {"type": "user", "content": [{"text": "Q"}]},
            {"type": "gemini", "content": [{"text": "A"}]},
            {"type": "message_update", "id": "m2", "tokens": {"input": 10, "output": 5}},
        )
    )
    assert result is not None
    # The real turns are kept while the token-count record leaves no trace.
    assert "> Q" in result
    assert "tokens" not in result
    assert "input" not in result


def test_gemini_jsonl_too_few_messages():
    """Mirror codex/claude_code behavior: < 2 conversational messages = None."""
    result = _try_gemini_jsonl(
        _gemini_session(
            {"type": "session_metadata"},
            {"type": "user", "content": [{"text": "only one msg"}]},
        )
    )
    assert result is None


def test_gemini_jsonl_multi_block_content():
    """A single message can have multiple text blocks in its content array
    (e.g. a thinking block + a final answer). Both should be concatenated
    into one transcript turn, in order."""
    result = _try_gemini_jsonl(
        _gemini_session(
            {"type": "session_metadata"},
            {"type": "user", "content": [{"text": "Q"}]},
            {
                "type": "gemini",
                "content": [{"text": "First part."}, {"text": "Second part."}],
            },
        )
    )
    assert result is not None
    assert "First part." in result
    assert "Second part." in result
    # The docstring promises ordering, so pin it rather than only presence.
    assert result.index("First part.") < result.index("Second part.")


def test_gemini_jsonl_empty_content_skipped():
    """A message whose content array yields no text should be skipped, not
    emit an empty turn that would corrupt the transcript."""
    result = _try_gemini_jsonl(
        _gemini_session(
            {"type": "session_metadata"},
            {"type": "user", "content": []},
            {"type": "user", "content": [{"text": "real Q"}]},
            {"type": "gemini", "content": [{"text": "real A"}]},
        )
    )
    assert result is not None
    assert "> real Q" in result
    assert "real A" in result


def test_gemini_jsonl_invalid_json_lines_skipped():
    """A malformed line in the middle of the stream must not abort parsing —
    the rest of the session should still produce a transcript."""
    result = _try_gemini_jsonl(
        _gemini_session(
            {"type": "session_metadata"},
            "not-valid-json{",
            {"type": "user", "content": [{"text": "Q"}]},
            {"type": "gemini", "content": [{"text": "A"}]},
        )
    )
    assert result is not None
    assert "> Q" in result


def test_gemini_jsonl_does_not_match_codex():
    """Codex JSONL passed in must NOT be parsed by the gemini adapter — the
    dispatch chain in _try_normalize_json relies on each adapter returning
    None when it doesn't recognize a format."""
    result = _try_gemini_jsonl(
        _gemini_session(
            {"type": "session_meta", "payload": {}},
            {"type": "event_msg", "payload": {"type": "user_message", "message": "Q"}},
            {"type": "event_msg", "payload": {"type": "agent_message", "message": "A"}},
        )
    )
    assert result is None


# ── _try_claude_ai_json ───────────────────────────────────────────────