feat: add OpenAI Codex CLI JSONL normalizer
Add the _try_codex_jsonl parser for Codex CLI session files stored at ~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl. It uses only event_msg entries (user_message / agent_message), which represent the canonical conversation turns. response_item entries are intentionally skipped: they include synthetic context injections (environment_context) and can duplicate real messages when both representations are present in the same rollout. The format is based on the Codex source tests (codex-rs/rollout/src/recorder_tests.rs). To reduce false positives on other JSONL files, a rollout is only recognized when it contains a session_meta entry and yields at least two messages. Refs: #59
This commit is contained in:
@@ -7,6 +7,7 @@ Supported:
|
|||||||
- Claude.ai JSON export
|
- Claude.ai JSON export
|
||||||
- ChatGPT conversations.json
|
- ChatGPT conversations.json
|
||||||
- Claude Code JSONL
|
- Claude Code JSONL
|
||||||
|
- OpenAI Codex CLI JSONL
|
||||||
- Slack JSON export
|
- Slack JSON export
|
||||||
- Plain text (pass through for paragraph chunking)
|
- Plain text (pass through for paragraph chunking)
|
||||||
|
|
||||||
@@ -55,6 +56,10 @@ def _try_normalize_json(content: str) -> Optional[str]:
|
|||||||
if normalized:
|
if normalized:
|
||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
|
normalized = _try_codex_jsonl(content)
|
||||||
|
if normalized:
|
||||||
|
return normalized
|
||||||
|
|
||||||
try:
|
try:
|
||||||
data = json.loads(content)
|
data = json.loads(content)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
@@ -94,6 +99,54 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _try_codex_jsonl(content: str) -> Optional[str]:
    """Normalize an OpenAI Codex CLI rollout file (JSONL) into a transcript.

    Session files live at ~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl.

    Only ``event_msg`` records contribute conversation turns: a payload of
    type ``user_message`` becomes a "user" turn and ``agent_message`` becomes
    an "assistant" turn. Every other record type (including ``response_item``,
    which carries synthetic context injections and duplicates of the real
    messages) is ignored. Lines that are blank, are not valid JSON, or do not
    decode to a JSON object are skipped silently.

    Returns the result of ``_messages_to_transcript`` when a ``session_meta``
    record was seen and at least two turns were collected; otherwise ``None``
    so the caller can try the next format.
    """
    turns = []
    saw_session_meta = False

    for raw_line in content.strip().split("\n"):
        candidate = raw_line.strip()
        if not candidate:
            continue

        try:
            record = json.loads(candidate)
        except json.JSONDecodeError:
            # Best-effort detection: a malformed line is not fatal.
            continue
        if not isinstance(record, dict):
            continue

        record_kind = record.get("type", "")
        if record_kind == "session_meta":
            # Marker record used to tell Codex rollouts apart from other JSONL.
            saw_session_meta = True
            continue
        if record_kind != "event_msg":
            continue

        body = record.get("payload", {})
        if not isinstance(body, dict):
            continue

        raw_message = body.get("message")
        if not isinstance(raw_message, str):
            continue
        text = raw_message.strip()
        if not text:
            continue

        body_kind = body.get("type", "")
        if body_kind == "user_message":
            role = "user"
        elif body_kind == "agent_message":
            role = "assistant"
        else:
            # Some other event kind; it is not a conversation turn.
            continue
        turns.append((role, text))

    if saw_session_meta and len(turns) >= 2:
        return _messages_to_transcript(turns)
    return None
|
||||||
|
|
||||||
|
|
||||||
def _try_claude_ai_json(data) -> Optional[str]:
|
def _try_claude_ai_json(data) -> Optional[str]:
|
||||||
"""Claude.ai JSON export: [{"role": "user", "content": "..."}]"""
|
"""Claude.ai JSON export: [{"role": "user", "content": "..."}]"""
|
||||||
if isinstance(data, dict):
|
if isinstance(data, dict):
|
||||||
|
|||||||
Reference in New Issue
Block a user