feat(gmail): add HTML body extraction with text fallback (single + batch)
Add _extract_message_bodies() to return text and HTML (BFS over multipart). Prefer text/plain; gracefully fall back to HTML-only bodies. Update single and batch tools; truncate very large HTML; guard base64 decode. Backward compatible; no new deps. Testing: verified with live eBay emails (single and batch show "[HTML Content Converted]").
This commit is contained in:
@@ -33,6 +33,7 @@ GMAIL_REQUEST_DELAY = 0.1
|
|||||||
def _extract_message_body(payload):
|
def _extract_message_body(payload):
|
||||||
"""
|
"""
|
||||||
Helper function to extract plain text body from a Gmail message payload.
|
Helper function to extract plain text body from a Gmail message payload.
|
||||||
|
(Maintained for backward compatibility)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
payload (dict): The message payload from Gmail API
|
payload (dict): The message payload from Gmail API
|
||||||
@@ -40,29 +41,60 @@ def _extract_message_body(payload):
|
|||||||
Returns:
|
Returns:
|
||||||
str: The plain text body content, or empty string if not found
|
str: The plain text body content, or empty string if not found
|
||||||
"""
|
"""
|
||||||
body_data = ""
|
bodies = _extract_message_bodies(payload)
|
||||||
|
return bodies.get("text", "")
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_message_bodies(payload):
|
||||||
|
"""
|
||||||
|
Helper function to extract both plain text and HTML bodies from a Gmail message payload.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
payload (dict): The message payload from Gmail API
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: Dictionary with 'text' and 'html' keys containing body content
|
||||||
|
"""
|
||||||
|
text_body = ""
|
||||||
|
html_body = ""
|
||||||
parts = [payload] if "parts" not in payload else payload.get("parts", [])
|
parts = [payload] if "parts" not in payload else payload.get("parts", [])
|
||||||
|
|
||||||
part_queue = list(parts) # Use a queue for BFS traversal of parts
|
part_queue = list(parts) # Use a queue for BFS traversal of parts
|
||||||
while part_queue:
|
while part_queue:
|
||||||
part = part_queue.pop(0)
|
part = part_queue.pop(0)
|
||||||
if part.get("mimeType") == "text/plain" and part.get("body", {}).get("data"):
|
mime_type = part.get("mimeType", "")
|
||||||
data = base64.urlsafe_b64decode(part["body"]["data"])
|
body_data = part.get("body", {}).get("data")
|
||||||
body_data = data.decode("utf-8", errors="ignore")
|
|
||||||
break # Found plain text body
|
if body_data:
|
||||||
elif part.get("mimeType", "").startswith("multipart/") and "parts" in part:
|
try:
|
||||||
part_queue.extend(part.get("parts", [])) # Add sub-parts to the queue
|
decoded_data = base64.urlsafe_b64decode(body_data).decode("utf-8", errors="ignore")
|
||||||
|
if mime_type == "text/plain" and not text_body:
|
||||||
|
text_body = decoded_data
|
||||||
|
elif mime_type == "text/html" and not html_body:
|
||||||
|
html_body = decoded_data
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to decode body part: {e}")
|
||||||
|
|
||||||
|
# Add sub-parts to queue for multipart messages
|
||||||
|
if mime_type.startswith("multipart/") and "parts" in part:
|
||||||
|
part_queue.extend(part.get("parts", []))
|
||||||
|
|
||||||
# If no plain text found, check the main payload body if it exists
|
# Check the main payload if it has body data directly
|
||||||
if (
|
if payload.get("body", {}).get("data"):
|
||||||
not body_data
|
try:
|
||||||
and payload.get("mimeType") == "text/plain"
|
decoded_data = base64.urlsafe_b64decode(payload["body"]["data"]).decode("utf-8", errors="ignore")
|
||||||
and payload.get("body", {}).get("data")
|
mime_type = payload.get("mimeType", "")
|
||||||
):
|
if mime_type == "text/plain" and not text_body:
|
||||||
data = base64.urlsafe_b64decode(payload["body"]["data"])
|
text_body = decoded_data
|
||||||
body_data = data.decode("utf-8", errors="ignore")
|
elif mime_type == "text/html" and not html_body:
|
||||||
|
html_body = decoded_data
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to decode main payload body: {e}")
|
||||||
|
|
||||||
return body_data
|
return {
|
||||||
|
"text": text_body,
|
||||||
|
"html": html_body
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _extract_headers(payload: dict, header_names: List[str]) -> Dict[str, str]:
|
def _extract_headers(payload: dict, header_names: List[str]) -> Dict[str, str]:
|
||||||
@@ -316,9 +348,22 @@ async def get_gmail_message_content(
|
|||||||
.execute
|
.execute
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract the plain text body using helper function
|
# Extract both text and HTML bodies using enhanced helper function
|
||||||
payload = message_full.get("payload", {})
|
payload = message_full.get("payload", {})
|
||||||
body_data = _extract_message_body(payload)
|
bodies = _extract_message_bodies(payload)
|
||||||
|
text_body = bodies.get("text", "")
|
||||||
|
html_body = bodies.get("html", "")
|
||||||
|
|
||||||
|
# Format body content with HTML fallback
|
||||||
|
if text_body.strip():
|
||||||
|
body_data = text_body
|
||||||
|
elif html_body.strip():
|
||||||
|
# Truncate very large HTML to keep responses manageable
|
||||||
|
if len(html_body) > 20000:
|
||||||
|
html_body = html_body[:20000] + "\n\n[HTML content truncated...]"
|
||||||
|
body_data = f"[HTML Content Converted]\n{html_body}"
|
||||||
|
else:
|
||||||
|
body_data = "[No readable content found]"
|
||||||
|
|
||||||
content_text = "\n".join(
|
content_text = "\n".join(
|
||||||
[
|
[
|
||||||
@@ -480,14 +525,29 @@ async def get_gmail_messages_content_batch(
|
|||||||
headers = _extract_headers(payload, ["Subject", "From"])
|
headers = _extract_headers(payload, ["Subject", "From"])
|
||||||
subject = headers.get("Subject", "(no subject)")
|
subject = headers.get("Subject", "(no subject)")
|
||||||
sender = headers.get("From", "(unknown sender)")
|
sender = headers.get("From", "(unknown sender)")
|
||||||
body = _extract_message_body(payload)
|
|
||||||
|
# Extract both text and HTML bodies using enhanced helper function
|
||||||
|
bodies = _extract_message_bodies(payload)
|
||||||
|
text_body = bodies.get("text", "")
|
||||||
|
html_body = bodies.get("html", "")
|
||||||
|
|
||||||
|
# Format body content with HTML fallback
|
||||||
|
if text_body.strip():
|
||||||
|
body_data = text_body
|
||||||
|
elif html_body.strip():
|
||||||
|
# Truncate very large HTML to keep batch responses manageable
|
||||||
|
if len(html_body) > 15000:
|
||||||
|
html_body = html_body[:15000] + "\n\n[HTML content truncated...]"
|
||||||
|
body_data = f"[HTML Content Converted]\n{html_body}"
|
||||||
|
else:
|
||||||
|
body_data = "[No readable content found]"
|
||||||
|
|
||||||
output_messages.append(
|
output_messages.append(
|
||||||
f"Message ID: {mid}\n"
|
f"Message ID: {mid}\n"
|
||||||
f"Subject: {subject}\n"
|
f"Subject: {subject}\n"
|
||||||
f"From: {sender}\n"
|
f"From: {sender}\n"
|
||||||
f"Web Link: {_generate_gmail_web_url(mid)}\n"
|
f"Web Link: {_generate_gmail_web_url(mid)}\n"
|
||||||
f"\n{body or '[No text/plain body found]'}\n"
|
f"\n{body_data}\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Combine all messages with separators
|
# Combine all messages with separators
|
||||||
@@ -710,9 +770,22 @@ def _format_thread_content(thread_data: dict, thread_id: str) -> str:
|
|||||||
date = headers.get("Date", "(unknown date)")
|
date = headers.get("Date", "(unknown date)")
|
||||||
subject = headers.get("Subject", "(no subject)")
|
subject = headers.get("Subject", "(no subject)")
|
||||||
|
|
||||||
# Extract message body
|
# Extract both text and HTML bodies
|
||||||
payload = message.get("payload", {})
|
payload = message.get("payload", {})
|
||||||
body_data = _extract_message_body(payload)
|
bodies = _extract_message_bodies(payload)
|
||||||
|
text_body = bodies.get("text", "")
|
||||||
|
html_body = bodies.get("html", "")
|
||||||
|
|
||||||
|
# Format body content with HTML fallback
|
||||||
|
if text_body.strip():
|
||||||
|
body_data = text_body
|
||||||
|
elif html_body.strip():
|
||||||
|
# Truncate very large HTML to keep batch responses manageable
|
||||||
|
if len(html_body) > 15000:
|
||||||
|
html_body = html_body[:15000] + "\n\n[HTML content truncated...]"
|
||||||
|
body_data = f"[HTML Content Converted]\n{html_body}"
|
||||||
|
else:
|
||||||
|
body_data = "[No readable content found]"
|
||||||
|
|
||||||
# Add message to content
|
# Add message to content
|
||||||
content_lines.extend(
|
content_lines.extend(
|
||||||
@@ -730,7 +803,7 @@ def _format_thread_content(thread_data: dict, thread_id: str) -> str:
|
|||||||
content_lines.extend(
|
content_lines.extend(
|
||||||
[
|
[
|
||||||
"",
|
"",
|
||||||
body_data or "[No text/plain body found]",
|
body_data,
|
||||||
"",
|
"",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user