From 92840ac8e8514af84848a4c2ece2acb1fde573e6 Mon Sep 17 00:00:00 2001 From: Taylor Wilsdon Date: Mon, 2 Mar 2026 09:29:53 -0500 Subject: [PATCH] better fallback logic for html in emails --- gmail/gmail_tools.py | 36 ++++++++++++++++++++++++++++++------ uv.lock | 2 +- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/gmail/gmail_tools.py b/gmail/gmail_tools.py index ff875d5..cef30d9 100644 --- a/gmail/gmail_tools.py +++ b/gmail/gmail_tools.py @@ -37,6 +37,19 @@ GMAIL_BATCH_SIZE = 25 GMAIL_REQUEST_DELAY = 0.1 HTML_BODY_TRUNCATE_LIMIT = 20000 GMAIL_METADATA_HEADERS = ["Subject", "From", "To", "Cc", "Message-ID", "Date"] +LOW_VALUE_TEXT_PLACEHOLDERS = ( + "your client does not support html", + "view this email in your browser", + "open this email in your browser", +) +LOW_VALUE_TEXT_FOOTER_MARKERS = ( + "mailing list", + "mailman/listinfo", + "unsubscribe", + "list-unsubscribe", + "manage preferences", +) +LOW_VALUE_TEXT_HTML_DIFF_MIN = 80 class _HTMLTextExtractor(HTMLParser): @@ -154,16 +167,27 @@ def _format_body_content(text_body: str, html_body: str) -> str: """ text_stripped = text_body.strip() html_stripped = html_body.strip() + html_text = _html_to_text(html_stripped).strip() if html_stripped else "" - # Detect useless fallback: HTML comments in text, or HTML is 50x+ longer - use_html = html_stripped and ( - not text_stripped - or "