Merge pull request #538 from taylorwilsdon/issues/536

enh: better fallback logic for html in emails
This commit is contained in:
Taylor Wilsdon
2026-03-03 11:28:08 -05:00
committed by GitHub
4 changed files with 37 additions and 11 deletions

View File

@@ -96,9 +96,7 @@ class SecureFastMCP(FastMCP):
# Rebuild middleware stack # Rebuild middleware stack
app.middleware_stack = app.build_middleware_stack() app.middleware_stack = app.build_middleware_stack()
logger.info( logger.info("Added middleware stack: WellKnownCacheControl, Session Management")
"Added middleware stack: WellKnownCacheControl, Session Management"
)
return app return app

View File

@@ -37,6 +37,19 @@ GMAIL_BATCH_SIZE = 25
GMAIL_REQUEST_DELAY = 0.1 GMAIL_REQUEST_DELAY = 0.1
HTML_BODY_TRUNCATE_LIMIT = 20000 HTML_BODY_TRUNCATE_LIMIT = 20000
GMAIL_METADATA_HEADERS = ["Subject", "From", "To", "Cc", "Message-ID", "Date"] GMAIL_METADATA_HEADERS = ["Subject", "From", "To", "Cc", "Message-ID", "Date"]
LOW_VALUE_TEXT_PLACEHOLDERS = (
"your client does not support html",
"view this email in your browser",
"open this email in your browser",
)
LOW_VALUE_TEXT_FOOTER_MARKERS = (
"mailing list",
"mailman/listinfo",
"unsubscribe",
"list-unsubscribe",
"manage preferences",
)
LOW_VALUE_TEXT_HTML_DIFF_MIN = 80
class _HTMLTextExtractor(HTMLParser): class _HTMLTextExtractor(HTMLParser):
@@ -154,16 +167,29 @@ def _format_body_content(text_body: str, html_body: str) -> str:
""" """
text_stripped = text_body.strip() text_stripped = text_body.strip()
html_stripped = html_body.strip() html_stripped = html_body.strip()
html_text = _html_to_text(html_stripped).strip() if html_stripped else ""
# Detect useless fallback: HTML comments in text, or HTML is 50x+ longer plain_lower = " ".join(text_stripped.split()).lower()
use_html = html_stripped and ( html_lower = " ".join(html_text.split()).lower()
not text_stripped plain_is_low_value = plain_lower and (
or "<!--" in text_stripped any(marker in plain_lower for marker in LOW_VALUE_TEXT_PLACEHOLDERS)
or len(html_stripped) > len(text_stripped) * 50 or (
any(marker in plain_lower for marker in LOW_VALUE_TEXT_FOOTER_MARKERS)
and len(html_lower) >= len(plain_lower) + LOW_VALUE_TEXT_HTML_DIFF_MIN
)
or (
len(html_lower) >= len(plain_lower) + LOW_VALUE_TEXT_HTML_DIFF_MIN
and html_lower.endswith(plain_lower)
)
)
# Prefer plain text, but fall back to HTML when plain text is empty or clearly low-value.
use_html = html_text and (
not text_stripped or "<!--" in text_stripped or plain_is_low_value
) )
if use_html: if use_html:
content = _html_to_text(html_stripped) content = html_text
if len(content) > HTML_BODY_TRUNCATE_LIMIT: if len(content) > HTML_BODY_TRUNCATE_LIMIT:
content = content[:HTML_BODY_TRUNCATE_LIMIT] + "\n\n[Content truncated...]" content = content[:HTML_BODY_TRUNCATE_LIMIT] + "\n\n[Content truncated...]"
return content return content

View File

@@ -51,7 +51,9 @@ def test_well_known_cache_control_middleware_rewrites_headers():
assert "etag" not in extra.headers assert "etag" not in extra.headers
def test_configured_server_applies_no_cache_to_served_oauth_discovery_routes(monkeypatch): def test_configured_server_applies_no_cache_to_served_oauth_discovery_routes(
monkeypatch,
):
monkeypatch.setenv("MCP_ENABLE_OAUTH21", "true") monkeypatch.setenv("MCP_ENABLE_OAUTH21", "true")
monkeypatch.setenv("GOOGLE_OAUTH_CLIENT_ID", "dummy-client") monkeypatch.setenv("GOOGLE_OAUTH_CLIENT_ID", "dummy-client")
monkeypatch.setenv("GOOGLE_OAUTH_CLIENT_SECRET", "dummy-secret") monkeypatch.setenv("GOOGLE_OAUTH_CLIENT_SECRET", "dummy-secret")

2
uv.lock generated
View File

@@ -2044,7 +2044,7 @@ wheels = [
[[package]] [[package]]
name = "workspace-mcp" name = "workspace-mcp"
version = "1.14.0" version = "1.14.1"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "cryptography" }, { name = "cryptography" },