better fallback logic for html in emails
This commit is contained in:
@@ -37,6 +37,19 @@ GMAIL_BATCH_SIZE = 25
|
|||||||
GMAIL_REQUEST_DELAY = 0.1
|
GMAIL_REQUEST_DELAY = 0.1
|
||||||
HTML_BODY_TRUNCATE_LIMIT = 20000
|
HTML_BODY_TRUNCATE_LIMIT = 20000
|
||||||
GMAIL_METADATA_HEADERS = ["Subject", "From", "To", "Cc", "Message-ID", "Date"]
|
GMAIL_METADATA_HEADERS = ["Subject", "From", "To", "Cc", "Message-ID", "Date"]
|
||||||
|
LOW_VALUE_TEXT_PLACEHOLDERS = (
|
||||||
|
"your client does not support html",
|
||||||
|
"view this email in your browser",
|
||||||
|
"open this email in your browser",
|
||||||
|
)
|
||||||
|
LOW_VALUE_TEXT_FOOTER_MARKERS = (
|
||||||
|
"mailing list",
|
||||||
|
"mailman/listinfo",
|
||||||
|
"unsubscribe",
|
||||||
|
"list-unsubscribe",
|
||||||
|
"manage preferences",
|
||||||
|
)
|
||||||
|
LOW_VALUE_TEXT_HTML_DIFF_MIN = 80
|
||||||
|
|
||||||
|
|
||||||
class _HTMLTextExtractor(HTMLParser):
|
class _HTMLTextExtractor(HTMLParser):
|
||||||
@@ -154,16 +167,27 @@ def _format_body_content(text_body: str, html_body: str) -> str:
|
|||||||
"""
|
"""
|
||||||
text_stripped = text_body.strip()
|
text_stripped = text_body.strip()
|
||||||
html_stripped = html_body.strip()
|
html_stripped = html_body.strip()
|
||||||
|
html_text = _html_to_text(html_stripped).strip() if html_stripped else ""
|
||||||
|
|
||||||
# Detect useless fallback: HTML comments in text, or HTML is 50x+ longer
|
plain_lower = " ".join(text_stripped.split()).lower()
|
||||||
use_html = html_stripped and (
|
html_lower = " ".join(html_text.split()).lower()
|
||||||
not text_stripped
|
plain_is_low_value = plain_lower and (
|
||||||
or "<!--" in text_stripped
|
any(marker in plain_lower for marker in LOW_VALUE_TEXT_PLACEHOLDERS)
|
||||||
or len(html_stripped) > len(text_stripped) * 50
|
or (
|
||||||
|
any(marker in plain_lower for marker in LOW_VALUE_TEXT_FOOTER_MARKERS)
|
||||||
|
and len(html_lower) >= len(plain_lower) + LOW_VALUE_TEXT_HTML_DIFF_MIN
|
||||||
|
)
|
||||||
|
or (
|
||||||
|
len(html_lower) >= len(plain_lower) + LOW_VALUE_TEXT_HTML_DIFF_MIN
|
||||||
|
and html_lower.endswith(plain_lower)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Prefer plain text, but fall back to HTML when plain text is empty or clearly low-value.
|
||||||
|
use_html = html_text and (not text_stripped or "<!--" in text_stripped or plain_is_low_value)
|
||||||
|
|
||||||
if use_html:
|
if use_html:
|
||||||
content = _html_to_text(html_stripped)
|
content = html_text
|
||||||
if len(content) > HTML_BODY_TRUNCATE_LIMIT:
|
if len(content) > HTML_BODY_TRUNCATE_LIMIT:
|
||||||
content = content[:HTML_BODY_TRUNCATE_LIMIT] + "\n\n[Content truncated...]"
|
content = content[:HTML_BODY_TRUNCATE_LIMIT] + "\n\n[Content truncated...]"
|
||||||
return content
|
return content
|
||||||
|
|||||||
2
uv.lock
generated
2
uv.lock
generated
@@ -2044,7 +2044,7 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "workspace-mcp"
|
name = "workspace-mcp"
|
||||||
version = "1.14.0"
|
version = "1.14.1"
|
||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "cryptography" },
|
{ name = "cryptography" },
|
||||||
|
|||||||
Reference in New Issue
Block a user