feat: add OpenAI Codex CLI JSONL normalizer
Add _try_codex_jsonl parser for Codex CLI session files stored at ~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl. Uses only event_msg entries (user_message / agent_message) which represent the canonical conversation turns. response_item entries are intentionally skipped — they include synthetic context injections (environment_context) and can duplicate real messages when both representations are present in the same rollout. Format based on Codex source tests (codex-rs/rollout/src/recorder_tests.rs). Requires session_meta header to reduce false positives on other JSONL. Refs: #59
This commit is contained in:
@@ -7,6 +7,7 @@ Supported:
|
||||
- Claude.ai JSON export
|
||||
- ChatGPT conversations.json
|
||||
- Claude Code JSONL
|
||||
- OpenAI Codex CLI JSONL
|
||||
- Slack JSON export
|
||||
- Plain text (pass through for paragraph chunking)
|
||||
|
||||
@@ -55,6 +56,10 @@ def _try_normalize_json(content: str) -> Optional[str]:
|
||||
if normalized:
|
||||
return normalized
|
||||
|
||||
normalized = _try_codex_jsonl(content)
|
||||
if normalized:
|
||||
return normalized
|
||||
|
||||
try:
|
||||
data = json.loads(content)
|
||||
except json.JSONDecodeError:
|
||||
@@ -94,6 +99,54 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def _try_codex_jsonl(content: str) -> Optional[str]:
    """Normalize an OpenAI Codex CLI session rollout into a transcript.

    Codex CLI sessions are JSONL files stored at
    ``~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl``.

    Only ``event_msg`` entries whose payload type is ``user_message`` or
    ``agent_message`` are used; they represent the canonical conversation
    turns. ``response_item`` entries are skipped because they include
    synthetic context injections and duplicate the real messages.

    Lines that are not valid JSON, or that do not decode to an object, are
    ignored rather than treated as errors, so detection is best-effort.

    Args:
        content: Raw text of the candidate file.

    Returns:
        The value of ``_messages_to_transcript(messages)`` when the content
        contains a ``session_meta`` entry and at least two non-empty
        user/agent messages; otherwise ``None``.
    """
    # str.strip() does not remove U+FEFF and json.loads() raises on a leading
    # BOM, so a BOM-prefixed file would silently lose its first record. If
    # that record is the session_meta entry, the whole file would be rejected.
    body = content.lstrip("\ufeff").strip()

    messages = []
    has_session_meta = False

    # Split on "\n" only: str.splitlines() also breaks on separators such as
    # U+2028, which may legally appear unescaped inside a JSON string.
    for raw_line in body.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        try:
            entry = json.loads(line)
        except json.JSONDecodeError:
            # Tolerate stray non-JSON lines instead of rejecting the file.
            continue
        if not isinstance(entry, dict):
            continue

        entry_type = entry.get("type", "")
        if entry_type == "session_meta":
            # Required marker; reduces false positives on other JSONL files.
            has_session_meta = True
            continue
        if entry_type != "event_msg":
            # Everything else (including response_item) is deliberately ignored.
            continue

        payload = entry.get("payload", {})
        if not isinstance(payload, dict):
            continue

        payload_type = payload.get("type", "")
        msg = payload.get("message")
        if not isinstance(msg, str):
            continue
        text = msg.strip()
        if not text:
            continue

        if payload_type == "user_message":
            messages.append(("user", text))
        elif payload_type == "agent_message":
            messages.append(("assistant", text))

    if len(messages) >= 2 and has_session_meta:
        return _messages_to_transcript(messages)
    return None
|
||||
|
||||
|
||||
def _try_claude_ai_json(data) -> Optional[str]:
|
||||
"""Claude.ai JSON export: [{"role": "user", "content": "..."}]"""
|
||||
if isinstance(data, dict):
|
||||
|
||||
Reference in New Issue
Block a user