add utils for office xml text handling

2025-05-24 13:55:00 -04:00
parent f4e6275c19
commit 84b9078659
2 changed files with 52 additions and 49 deletions
--- a/core/utils.py
+++ b/core/utils.py
@@ -0,0 +1,50 @@
 import zipfile, xml.etree.ElementTree as ET
 from typing import List, Optional
 # -------------------------------------------------------------------------
 # Helper: pull raw text from OOXML containers (docx / xlsx / pptx)
 # -------------------------------------------------------------------------
 def extract_office_xml_text(file_bytes: bytes, mime_type: str) -> Optional[str]:
    """
    Extract plain text from an OOXML container (docx / pptx / xlsx).

    The payload is opened in memory as a ZIP archive and each relevant XML
    part is parsed with ElementTree. The text of every <t> and <v> element
    (namespace prefix ignored) is collected in document order.
    No external deps – just the standard library.

    Args:
        file_bytes: Raw bytes of the Office file.
        mime_type: MIME type of the file. Only the OOXML word-processing,
            presentation and spreadsheet types are handled.

    Returns:
        The collected text (one element per line, parts separated by blank
        lines), or None when the MIME type is not handled, nothing readable
        was found, or the payload could not be opened as a ZIP archive.
    """
    # Imported at function scope so the helper is self-contained: the module
    # header only imports zipfile / ElementTree / typing, while this body
    # needs `io` and a logger. Without them every call raised NameError.
    import io
    import logging

    logger = logging.getLogger(__name__)

    try:
        with zipfile.ZipFile(io.BytesIO(file_bytes)) as zf:
            # Map MIME → iterable of XML files to inspect
            if mime_type == (
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            ):
                targets = ["word/document.xml"]
            elif mime_type == (
                "application/vnd.openxmlformats-officedocument.presentationml.presentation"
            ):
                targets = [n for n in zf.namelist() if n.startswith("ppt/slides/slide")]
            elif mime_type == (
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            ):
                # NOTE(review): only the worksheet parts are read. Cells that
                # reference the shared-strings table presumably yield an index
                # in <v> rather than the text itself – confirm if xlsx text
                # fidelity matters to callers.
                targets = [n for n in zf.namelist() if n.startswith("xl/worksheets/sheet")]
            else:
                return None

            pieces: List[str] = []
            for member in targets:
                try:
                    xml_root = ET.fromstring(zf.read(member))
                    # In both Word/PowerPoint the text is in <w:t> or <a:t>;
                    # in Excel, cell values are in <v>.
                    for elem in xml_root.iter():
                        tag = elem.tag.split("}")[-1]  # strip namespace
                        if tag in {"t", "v"} and elem.text:
                            pieces.append(elem.text)
                    pieces.append("\n")  # separator per part / sheet / slide
                except Exception as part_error:
                    # Best effort: one unreadable slide/sheet must not abort
                    # the whole extraction, but leave a trace for debugging.
                    logger.debug("Skipping unreadable part %s: %s", member, part_error)
                    continue

            text = "\n".join(pieces).strip()
            return text or None
    except Exception as e:
        logger.error(f"Failed to extract file content: {e}")
        # Any failure → quietly signal "not handled"
        return None
--- a/gdrive/drive_tools.py
+++ b/gdrive/drive_tools.py
@@ -8,7 +8,6 @@ import asyncio
 import re
 import os
 from typing import List, Optional, Dict, Any
 import zipfile, xml.etree.ElementTree as ET
 from mcp import types
 from fastapi import Header
@@ -18,6 +17,7 @@ from googleapiclient.http import MediaIoBaseDownload # For file content
 import io # For file content
 from auth.google_auth import get_authenticated_google_service
 from core.utils import extract_office_xml_text
 from core.server import server
 from core.server import (
    DRIVE_READONLY_SCOPE,
@@ -26,53 +26,6 @@ from core.server import (
 logger = logging.getLogger(__name__)
 # -------------------------------------------------------------------------
 # Helper: pull raw text from OOXML containers (docx / xlsx / pptx)
 # -------------------------------------------------------------------------
 def _extract_office_xml_text(file_bytes: bytes, mime_type: str) -> Optional[str]:
    """
    Scrape plain text out of a Word, PowerPoint or Excel (OOXML) payload.

    Relies on the standard library only (zipfile + ElementTree). Gives back
    the gathered text, or None when the MIME type is not one of the three
    handled ones, when no text was found, or when extraction failed.
    """
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    pptx_mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    text_tags = ("t", "v")  # <w:t> / <a:t> hold text, <v> holds cell values

    try:
        with zipfile.ZipFile(io.BytesIO(file_bytes)) as archive:
            # Work out which XML parts of the archive need to be scanned.
            if mime_type == docx_mime:
                members = ["word/document.xml"]
            else:
                if mime_type == pptx_mime:
                    prefix = "ppt/slides/slide"
                elif mime_type == xlsx_mime:
                    prefix = "xl/worksheets/sheet"
                else:
                    return None
                members = [name for name in archive.namelist() if name.startswith(prefix)]

            chunks: List[str] = []
            for name in members:
                try:
                    root = ET.fromstring(archive.read(name))
                    for node in root.iter():
                        # Drop the "{namespace}" prefix to get the bare tag.
                        local_name = node.tag.rpartition("}")[2]
                        if node.text and local_name in text_tags:
                            chunks.append(node.text)
                    # One separator after each part / sheet / slide.
                    chunks.append("\n")
                except Exception:
                    # A broken slide/sheet is skipped; the rest still counts.
                    continue

            joined = "\n".join(chunks).strip()
            if not joined:
                return None
            return joined
    except Exception as e:
        logger.error(f"Failed to extract file content: {e}")
        # Any failure → quietly signal "not handled"
        return None
@server.tool()
 async def search_drive_files(
    user_google_email: str,
@@ -223,7 +176,7 @@ async def get_drive_file_content(
        # ------------------------------------------------------------------
        # Attempt Office XML extraction
        # ------------------------------------------------------------------
-        office_text = _extract_office_xml_text(file_content_bytes, mime_type)
+        office_text = extract_office_xml_text(file_content_bytes, mime_type)
        if office_text:
            body_text = office_text
        else: