add utils for office xml text handling

This commit is contained in:
Taylor Wilsdon
2025-05-24 13:55:00 -04:00
parent f4e6275c19
commit 84b9078659
2 changed files with 52 additions and 49 deletions

50
core/utils.py Normal file
View File

@@ -0,0 +1,50 @@
import io
import logging
import xml.etree.ElementTree as ET
import zipfile
from typing import List, Optional
logger = logging.getLogger(__name__)


# -------------------------------------------------------------------------
# Helper: pull raw text from OOXML containers (docx / xlsx / pptx)
# -------------------------------------------------------------------------
def extract_office_xml_text(file_bytes: bytes, mime_type: str) -> Optional[str]:
    """
    Extract plain text from a Word, Excel or PowerPoint (OOXML) file.

    Very light-weight XML scraper: the bytes are opened as a zip archive and
    the text of every ``<t>`` / ``<v>`` element found in the relevant parts
    is collected. No external deps, just std-lib zipfile + ElementTree.

    Args:
        file_bytes: Raw content of the file.
        mime_type: MIME type of the file; it selects which archive members
            are inspected (the Word document body, the PowerPoint slides or
            the Excel worksheets).

    Returns:
        The collected text with one element per line, or None when the MIME
        type is not one of the three handled types, when nothing readable is
        found, or when the bytes cannot be processed at all (the failure is
        logged, never raised).
    """
    try:
        with zipfile.ZipFile(io.BytesIO(file_bytes)) as zf:
            # Map MIME → iterable of XML files to inspect
            if mime_type == (
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            ):
                targets = ["word/document.xml"]
            elif mime_type == (
                "application/vnd.openxmlformats-officedocument.presentationml.presentation"
            ):
                targets = [n for n in zf.namelist() if n.startswith("ppt/slides/slide")]
            elif mime_type == (
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            ):
                targets = [n for n in zf.namelist() if n.startswith("xl/worksheets/sheet")]
            else:
                return None

            pieces: List[str] = []
            for member in targets:
                try:
                    xml_root = ET.fromstring(zf.read(member))
                except Exception as exc:
                    # Best effort: one unreadable slide/sheet must not discard
                    # the text of the others, but leave a trace in the log.
                    logger.warning("Skipping unreadable part %s: %s", member, exc)
                    continue

                # In both Word/PowerPoint the text is in <w:t> or <a:t>;
                # in Excel, cell values are in <v>.
                # NOTE(review): for string cells <v> presumably holds an index
                # into xl/sharedStrings.xml, which is not read here — confirm
                # whether that is acceptable for callers.
                for elem in xml_root.iter():
                    tag = elem.tag.split("}")[-1]  # strip namespace
                    if tag in {"t", "v"} and elem.text:
                        pieces.append(elem.text)
                pieces.append("\n")  # separator per part / sheet / slide

            text = "\n".join(pieces).strip()
            return text or None
    except Exception as e:
        # Any failure → signal "not handled" to the caller instead of raising.
        logger.error("Failed to extract file content: %s", e)
        return None

View File

@@ -8,7 +8,6 @@ import asyncio
import re import re
import os import os
from typing import List, Optional, Dict, Any from typing import List, Optional, Dict, Any
import zipfile, xml.etree.ElementTree as ET
from mcp import types from mcp import types
from fastapi import Header from fastapi import Header
@@ -18,6 +17,7 @@ from googleapiclient.http import MediaIoBaseDownload # For file content
import io # For file content import io # For file content
from auth.google_auth import get_authenticated_google_service from auth.google_auth import get_authenticated_google_service
from core.utils import extract_office_xml_text
from core.server import server from core.server import server
from core.server import ( from core.server import (
DRIVE_READONLY_SCOPE, DRIVE_READONLY_SCOPE,
@@ -26,53 +26,6 @@ from core.server import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# -------------------------------------------------------------------------
# Helper: pull raw text from OOXML containers (docx / xlsx / pptx)
# -------------------------------------------------------------------------
def _extract_office_xml_text(file_bytes: bytes, mime_type: str) -> Optional[str]:
"""
Very light-weight XML scraper for Word, Excel, PowerPoint files.
Returns plain-text if something readable is found, else None.
No external deps just std-lib zipfile + ElementTree.
"""
try:
with zipfile.ZipFile(io.BytesIO(file_bytes)) as zf:
# Map MIME → iterable of XML files to inspect
if mime_type == (
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
targets = ["word/document.xml"]
elif mime_type == (
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
):
targets = [n for n in zf.namelist() if n.startswith("ppt/slides/slide")]
elif mime_type == (
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
):
targets = [n for n in zf.namelist() if n.startswith("xl/worksheets/sheet")]
else:
return None
pieces: List[str] = []
for member in targets:
try:
xml_root = ET.fromstring(zf.read(member))
# In both Word/PowerPoint the text is in <w:t> or <a:t>;
# in Excel, cell values are in <v>.
for elem in xml_root.iter():
tag = elem.tag.split("}")[-1] # strip namespace
if tag in {"t", "v"} and elem.text:
pieces.append(elem.text)
pieces.append("\n") # separator per part / sheet / slide
except Exception:
continue # ignore individual slide/sheet errors
text = "\n".join(pieces).strip()
return text or None
except Exception as e:
logger.error(f"Failed to extract file content: {e}")
# Any failure → quietly signal "not handled"
return None
@server.tool() @server.tool()
async def search_drive_files( async def search_drive_files(
user_google_email: str, user_google_email: str,
@@ -223,7 +176,7 @@ async def get_drive_file_content(
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Attempt Office XML extraction # Attempt Office XML extraction
# ------------------------------------------------------------------ # ------------------------------------------------------------------
office_text = _extract_office_xml_text(file_content_bytes, mime_type) office_text = extract_office_xml_text(file_content_bytes, mime_type)
if office_text: if office_text:
body_text = office_text body_text = office_text
else: else: