feat: add OpenAI Codex CLI JSONL normalizer

Add _try_codex_jsonl parser for Codex CLI session files stored at
~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl.

Uses only event_msg entries (user_message / agent_message) which
represent the canonical conversation turns. response_item entries
are intentionally skipped — they include synthetic context injections
(environment_context) and can duplicate real messages when both
representations are present in the same rollout.

Format based on Codex source tests (codex-rs/rollout/src/recorder_tests.rs).
Requires a session_meta entry somewhere in the file to reduce false positives on other JSONL formats.

Refs: #59
This commit is contained in:
adv3nt3
2026-04-07 14:41:19 +02:00
parent 1782628b8a
commit d4e1945f77
+53
View File
@@ -7,6 +7,7 @@ Supported:
- Claude.ai JSON export
- ChatGPT conversations.json
- Claude Code JSONL
- OpenAI Codex CLI JSONL
- Slack JSON export
- Plain text (pass through for paragraph chunking)
@@ -55,6 +56,10 @@ def _try_normalize_json(content: str) -> Optional[str]:
if normalized:
return normalized
normalized = _try_codex_jsonl(content)
if normalized:
return normalized
try:
data = json.loads(content)
except json.JSONDecodeError:
@@ -94,6 +99,54 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]:
return None
def _try_codex_jsonl(content: str) -> Optional[str]:
    """Normalize an OpenAI Codex CLI rollout file into a transcript.

    Codex CLI sessions live at ~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl
    and hold one JSON object per line. Only ``event_msg`` records whose
    payload type is ``user_message`` or ``agent_message`` are collected;
    every other record type (including ``response_item``) is ignored.

    Lines that are blank, fail to parse as JSON, or are not JSON objects
    are skipped rather than treated as errors.

    Returns:
        The value produced by ``_messages_to_transcript`` when a
        ``session_meta`` record was seen anywhere in the input and at least
        two non-empty turns were collected; ``None`` otherwise.
    """
    turns = []
    saw_session_meta = False

    for raw_line in content.strip().split("\n"):
        candidate = raw_line.strip()
        if not candidate:
            continue

        try:
            record = json.loads(candidate)
        except json.JSONDecodeError:
            # Not JSON at all; tolerate it and keep scanning.
            continue
        if not isinstance(record, dict):
            continue

        record_type = record.get("type", "")
        if record_type == "session_meta":
            # Marker used to tell Codex rollouts apart from other JSONL.
            saw_session_meta = True
            continue
        if record_type != "event_msg":
            continue

        body = record.get("payload", {})
        if not isinstance(body, dict):
            continue

        # Map the Codex payload type onto a transcript role. Plain equality
        # is used (not a dict lookup) so odd JSON values cannot raise.
        kind = body.get("type", "")
        if kind == "user_message":
            role = "user"
        elif kind == "agent_message":
            role = "assistant"
        else:
            continue

        raw_text = body.get("message")
        if not isinstance(raw_text, str):
            continue
        cleaned = raw_text.strip()
        if cleaned:
            turns.append((role, cleaned))

    if saw_session_meta and len(turns) >= 2:
        return _messages_to_transcript(turns)
    return None
def _try_claude_ai_json(data) -> Optional[str]:
"""Claude.ai JSON export: [{"role": "user", "content": "..."}]"""
if isinstance(data, dict):