diff --git a/mempalace/normalize.py b/mempalace/normalize.py index f180fb4..e599df9 100644 --- a/mempalace/normalize.py +++ b/mempalace/normalize.py @@ -30,7 +30,7 @@ def normalize(filepath: str) -> str: except OSError as e: raise IOError(f"Could not read {filepath}: {e}") if file_size > 500 * 1024 * 1024: # 500 MB safety limit - raise IOError(f"File too large ({file_size // (1024*1024)} MB): {filepath}") + raise IOError(f"File too large ({file_size // (1024 * 1024)} MB): {filepath}") try: with open(filepath, "r", encoding="utf-8", errors="replace") as f: content = f.read() @@ -190,40 +190,46 @@ def _try_claude_ai_json(data) -> Optional[str]: if not isinstance(data, list): return None - # Privacy export: array of conversation objects with chat_messages inside each - if data and isinstance(data[0], dict) and "chat_messages" in data[0]: - all_messages = [] + # Privacy export: array of conversation objects, each containing its own + # message list under "chat_messages" or "messages" (both variants seen in the wild). + if data and isinstance(data[0], dict) and ("chat_messages" in data[0] or "messages" in data[0]): + transcripts = [] for convo in data: if not isinstance(convo, dict): continue - chat_msgs = convo.get("chat_messages", []) - for item in chat_msgs: - if not isinstance(item, dict): - continue - role = item.get("role", "") - text = _extract_content(item.get("content", "")) - if role in ("user", "human") and text: - all_messages.append(("user", text)) - elif role in ("assistant", "ai") and text: - all_messages.append(("assistant", text)) - if len(all_messages) >= 2: - return _messages_to_transcript(all_messages) + chat_msgs = convo.get("chat_messages") or convo.get("messages", []) + messages = _collect_claude_messages(chat_msgs) + if len(messages) >= 2: + transcripts.append(_messages_to_transcript(messages)) + if transcripts: + return "\n\n".join(transcripts) return None # Flat messages list + messages = _collect_claude_messages(data) + if len(messages) >= 2: + return _messages_to_transcript(messages) + return None + + +def _collect_claude_messages(items) -> list: + """Extract (role, text) pairs from a Claude.ai message list. + + Accepts both ``role`` (API format) and ``sender`` (privacy export) as the + author field, and falls back to a top-level ``text`` key when the + ``content`` blocks are empty or absent. + """ messages = [] - for item in data: + for item in items: if not isinstance(item, dict): continue - role = item.get("role", "") - text = _extract_content(item.get("content", "")) + role = item.get("role") or item.get("sender", "") + text = _extract_content(item.get("content", "")) or (item.get("text") or "").strip() if role in ("user", "human") and text: messages.append(("user", text)) elif role in ("assistant", "ai") and text: messages.append(("assistant", text)) - if len(messages) >= 2: - return _messages_to_transcript(messages) - return None + return messages def _try_chatgpt_json(data) -> Optional[str]: diff --git a/tests/test_normalize.py b/tests/test_normalize.py index fa9ea6e..7f0652a 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -524,6 +524,119 @@ def test_claude_ai_privacy_export_non_dict_items(): assert result is not None +def test_claude_ai_privacy_export_messages_key(): + """Privacy export using 'messages' key instead of 'chat_messages'.""" + data = [ + { + "uuid": "abc-123", + "name": "Test convo", + "messages": [ + {"role": "human", "content": "Q1"}, + {"role": "ai", "content": "A1"}, + ], + } + ] + result = _try_claude_ai_json(data) + assert result is not None + assert "> Q1" in result + + +def test_claude_ai_privacy_export_sender_field(): + """Privacy export using 'sender' instead of 'role'.""" + data = [ + { + "chat_messages": [ + {"sender": "human", "content": "Q1"}, + {"sender": "assistant", "content": "A1"}, + ] + } + ] + result = _try_claude_ai_json(data) + assert result is not None + assert "> Q1" in result + + +def test_claude_ai_privacy_export_text_fallback(): + """Privacy export where content is empty but text field has the message.""" + data = [ + { + "chat_messages": [ + {"sender": "human", "text": "Q1", "content": []}, + {"sender": "assistant", "text": "A1", "content": []}, + ] + } + ] + result = _try_claude_ai_json(data) + assert result is not None + assert "> Q1" in result + + +def test_claude_ai_privacy_export_null_text(): + """Privacy export where text field is explicitly null must not crash.""" + data = [ + { + "chat_messages": [ + {"sender": "human", "text": None, "content": "Q1"}, + {"sender": "assistant", "text": None, "content": "A1"}, + ] + } + ] + result = _try_claude_ai_json(data) + assert result is not None + assert "> Q1" in result + + +def test_claude_ai_privacy_export_per_conversation(): + """Multiple conversations produce separate transcripts.""" + data = [ + { + "uuid": "convo-1", + "chat_messages": [ + {"role": "human", "content": "Q1"}, + {"role": "ai", "content": "A1"}, + ], + }, + { + "uuid": "convo-2", + "chat_messages": [ + {"role": "human", "content": "Q2"}, + {"role": "ai", "content": "A2"}, + ], + }, + ] + result = _try_claude_ai_json(data) + assert result is not None + assert "> Q1" in result + assert "> Q2" in result + # each conversation is a separate transcript block + parts = result.split("\n\n") + q1_parts = [p for p in parts if "> Q1" in p] + q2_parts = [p for p in parts if "> Q2" in p] + assert len(q1_parts) >= 1 + assert len(q2_parts) >= 1 + + +def test_claude_ai_privacy_export_skips_empty_conversations(): + """Conversations with <2 messages are skipped.""" + data = [ + { + "chat_messages": [ + {"role": "human", "content": "lonely message"}, + ], + }, + { + "chat_messages": [ + {"role": "human", "content": "Q1"}, + {"role": "ai", "content": "A1"}, + ], + }, + ] + result = _try_claude_ai_json(data) + assert result is not None + assert "lonely message" not in result + assert "> Q1" in result + + # ── _try_chatgpt_json ─────────────────────────────────────────────────