* fix: parse Claude.ai privacy export with messages key and sender field (#677)

  The privacy-export branch in _try_claude_ai_json only checked for the "chat_messages" key, missing exports that use "messages" instead. It also only read the "role" field, while real privacy exports use "sender". Both gaps caused the file to fall through to plain-text parsing, producing a single giant drawer.

  Changes:
  - Accept "messages" alongside "chat_messages" in the conversation-object guard and inner extraction.
  - Accept "sender" alongside "role" as the author field.
  - Fall back to a top-level "text" key when content blocks are empty.
  - Produce one transcript per conversation instead of concatenating all conversations into a single blob.
  - Extract shared logic into the _collect_claude_messages helper.
  - Add 6 regression tests covering each variant.

* style: apply ruff format to normalize.py

* fix: guard against null text field in Claude.ai export parsing

  item.get("text", "").strip() crashes when "text" is explicitly null in the JSON (legal and observed in some exports). Use (item.get("text") or "").strip() and add a regression test.

---------

Co-authored-by: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
e200ce2c8a
commit
a2432a3245
+28
-22
@@ -30,7 +30,7 @@ def normalize(filepath: str) -> str:
|
||||
except OSError as e:
|
||||
raise IOError(f"Could not read {filepath}: {e}")
|
||||
if file_size > 500 * 1024 * 1024: # 500 MB safety limit
|
||||
raise IOError(f"File too large ({file_size // (1024*1024)} MB): {filepath}")
|
||||
raise IOError(f"File too large ({file_size // (1024 * 1024)} MB): {filepath}")
|
||||
try:
|
||||
with open(filepath, "r", encoding="utf-8", errors="replace") as f:
|
||||
content = f.read()
|
||||
@@ -190,40 +190,46 @@ def _try_claude_ai_json(data) -> Optional[str]:
|
||||
if not isinstance(data, list):
|
||||
return None
|
||||
|
||||
# Privacy export: array of conversation objects with chat_messages inside each
|
||||
if data and isinstance(data[0], dict) and "chat_messages" in data[0]:
|
||||
all_messages = []
|
||||
# Privacy export: array of conversation objects, each containing its own
|
||||
# message list under "chat_messages" or "messages" (both variants seen in the wild).
|
||||
if data and isinstance(data[0], dict) and ("chat_messages" in data[0] or "messages" in data[0]):
|
||||
transcripts = []
|
||||
for convo in data:
|
||||
if not isinstance(convo, dict):
|
||||
continue
|
||||
chat_msgs = convo.get("chat_messages", [])
|
||||
for item in chat_msgs:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
role = item.get("role", "")
|
||||
text = _extract_content(item.get("content", ""))
|
||||
if role in ("user", "human") and text:
|
||||
all_messages.append(("user", text))
|
||||
elif role in ("assistant", "ai") and text:
|
||||
all_messages.append(("assistant", text))
|
||||
if len(all_messages) >= 2:
|
||||
return _messages_to_transcript(all_messages)
|
||||
chat_msgs = convo.get("chat_messages") or convo.get("messages", [])
|
||||
messages = _collect_claude_messages(chat_msgs)
|
||||
if len(messages) >= 2:
|
||||
transcripts.append(_messages_to_transcript(messages))
|
||||
if transcripts:
|
||||
return "\n\n".join(transcripts)
|
||||
return None
|
||||
|
||||
# Flat messages list
|
||||
messages = _collect_claude_messages(data)
|
||||
if len(messages) >= 2:
|
||||
return _messages_to_transcript(messages)
|
||||
return None
|
||||
|
||||
|
||||
def _collect_claude_messages(items) -> list:
|
||||
"""Extract (role, text) pairs from a Claude.ai message list.
|
||||
|
||||
Accepts both ``role`` (API format) and ``sender`` (privacy export) as the
|
||||
author field, and falls back to a top-level ``text`` key when the
|
||||
``content`` blocks are empty or absent.
|
||||
"""
|
||||
messages = []
|
||||
for item in data:
|
||||
for item in items:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
role = item.get("role", "")
|
||||
text = _extract_content(item.get("content", ""))
|
||||
role = item.get("role") or item.get("sender", "")
|
||||
text = _extract_content(item.get("content", "")) or (item.get("text") or "").strip()
|
||||
if role in ("user", "human") and text:
|
||||
messages.append(("user", text))
|
||||
elif role in ("assistant", "ai") and text:
|
||||
messages.append(("assistant", text))
|
||||
if len(messages) >= 2:
|
||||
return _messages_to_transcript(messages)
|
||||
return None
|
||||
return messages
|
||||
|
||||
|
||||
def _try_chatgpt_json(data) -> Optional[str]:
|
||||
|
||||
@@ -524,6 +524,119 @@ def test_claude_ai_privacy_export_non_dict_items():
|
||||
assert result is not None
|
||||
|
||||
|
||||
def test_claude_ai_privacy_export_messages_key():
    """A conversation object keyed by 'messages' (not 'chat_messages') is parsed."""
    conversation = {
        "uuid": "abc-123",
        "name": "Test convo",
        "messages": [
            {"role": "human", "content": "Q1"},
            {"role": "ai", "content": "A1"},
        ],
    }

    transcript = _try_claude_ai_json([conversation])

    assert transcript is not None
    assert "> Q1" in transcript
|
||||
|
||||
|
||||
def test_claude_ai_privacy_export_sender_field():
    """Messages that name their author via 'sender' rather than 'role' are parsed."""
    turns = [
        {"sender": "human", "content": "Q1"},
        {"sender": "assistant", "content": "A1"},
    ]
    export = [{"chat_messages": turns}]

    transcript = _try_claude_ai_json(export)

    assert transcript is not None
    assert "> Q1" in transcript
|
||||
|
||||
|
||||
def test_claude_ai_privacy_export_text_fallback():
    """With an empty 'content' list, the top-level 'text' field supplies the message."""
    turns = [
        {"sender": "human", "text": "Q1", "content": []},
        {"sender": "assistant", "text": "A1", "content": []},
    ]
    export = [{"chat_messages": turns}]

    transcript = _try_claude_ai_json(export)

    assert transcript is not None
    assert "> Q1" in transcript
|
||||
|
||||
|
||||
def test_claude_ai_privacy_export_null_text():
    """An explicit null 'text' field must not crash parsing; 'content' is still used."""
    turns = [
        {"sender": "human", "text": None, "content": "Q1"},
        {"sender": "assistant", "text": None, "content": "A1"},
    ]
    export = [{"chat_messages": turns}]

    transcript = _try_claude_ai_json(export)

    assert transcript is not None
    assert "> Q1" in transcript
|
||||
|
||||
|
||||
def test_claude_ai_privacy_export_per_conversation():
    """Two conversations in one export both show up in the returned transcript."""
    first_convo = {
        "uuid": "convo-1",
        "chat_messages": [
            {"role": "human", "content": "Q1"},
            {"role": "ai", "content": "A1"},
        ],
    }
    second_convo = {
        "uuid": "convo-2",
        "chat_messages": [
            {"role": "human", "content": "Q2"},
            {"role": "ai", "content": "A2"},
        ],
    }

    transcript = _try_claude_ai_json([first_convo, second_convo])

    assert transcript is not None
    assert "> Q1" in transcript
    assert "> Q2" in transcript

    # Split on blank lines and confirm each question lands in at least one
    # block. NOTE(review): this only checks presence; it does not prove the
    # two conversations ended up in different blocks.
    blocks = transcript.split("\n\n")
    assert any("> Q1" in block for block in blocks)
    assert any("> Q2" in block for block in blocks)
|
||||
|
||||
|
||||
def test_claude_ai_privacy_export_skips_empty_conversations():
    """A conversation with fewer than two messages is left out of the output."""
    too_short = {
        "chat_messages": [
            {"role": "human", "content": "lonely message"},
        ],
    }
    complete = {
        "chat_messages": [
            {"role": "human", "content": "Q1"},
            {"role": "ai", "content": "A1"},
        ],
    }

    transcript = _try_claude_ai_json([too_short, complete])

    assert transcript is not None
    assert "lonely message" not in transcript
    assert "> Q1" in transcript
|
||||
|
||||
|
||||
# ── _try_chatgpt_json ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user