add utils for office xml text handling
This commit is contained in:
50
core/utils.py
Normal file
50
core/utils.py
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
import zipfile, xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# Helper: pull raw text from OOXML containers (docx / xlsx / pptx)
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
def extract_office_xml_text(file_bytes: bytes, mime_type: str) -> Optional[str]:
    """
    Extract plain text from a Word, Excel or PowerPoint (OOXML) file.

    The file is treated as a ZIP archive; the XML parts relevant to the given
    MIME type are parsed and the text of every ``<t>`` and ``<v>`` element
    (namespace ignored) is collected.

    Args:
        file_bytes: Raw bytes of the .docx / .xlsx / .pptx file.
        mime_type: MIME type of the file. Only the three OOXML types for
            wordprocessingml documents, presentationml presentations and
            spreadsheetml sheets are handled.

    Returns:
        The collected text, one text run per line with extra blank lines
        between parts (sheets / slides), or None when the MIME type is not
        supported, the bytes cannot be read as a ZIP archive, or no text
        was found.

    No external deps – just std-lib zipfile + ElementTree.
    """
    # Function-scope imports: this module's import block provides neither
    # `io` nor a logger, and the function must not fail with NameError.
    import io
    import logging

    log = logging.getLogger(__name__)

    docx_mime = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    pptx_mime = (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    )
    xlsx_mime = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )

    # Decide before touching the archive: unsupported types are simply
    # "not handled" and should not produce an error log for non-ZIP payloads.
    if mime_type not in (docx_mime, pptx_mime, xlsx_mime):
        return None

    try:
        with zipfile.ZipFile(io.BytesIO(file_bytes)) as zf:
            # Map MIME → iterable of XML files to inspect
            if mime_type == docx_mime:
                targets = ["word/document.xml"]
            elif mime_type == pptx_mime:
                targets = [
                    n for n in zf.namelist() if n.startswith("ppt/slides/slide")
                ]
            else:
                targets = [
                    n for n in zf.namelist() if n.startswith("xl/worksheets/sheet")
                ]

            pieces: List[str] = []
            for member in targets:
                try:
                    xml_root = ET.fromstring(zf.read(member))
                except Exception as exc:
                    # Best effort: one unreadable slide/sheet must not
                    # discard the text of the others.
                    log.debug("Skipping unreadable part %s: %s", member, exc)
                    continue

                # In both Word/PowerPoint the text is in <w:t> or <a:t>;
                # in Excel, cell values are in <v>.
                # NOTE(review): for shared-string cells <v> presumably holds
                # an index into xl/sharedStrings.xml, not the text – confirm.
                for elem in xml_root.iter():
                    tag = elem.tag.split("}")[-1]  # strip namespace
                    if tag in {"t", "v"} and elem.text:
                        pieces.append(elem.text)
                pieces.append("\n")  # separator per part / sheet / slide

            text = "\n".join(pieces).strip()
            return text or None
    except Exception as e:
        log.error("Failed to extract file content: %s", e)
        # Any failure → quietly signal "not handled"
        return None
|
||||||
@@ -8,7 +8,6 @@ import asyncio
|
|||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
from typing import List, Optional, Dict, Any
|
from typing import List, Optional, Dict, Any
|
||||||
import zipfile, xml.etree.ElementTree as ET
|
|
||||||
|
|
||||||
from mcp import types
|
from mcp import types
|
||||||
from fastapi import Header
|
from fastapi import Header
|
||||||
@@ -18,6 +17,7 @@ from googleapiclient.http import MediaIoBaseDownload # For file content
|
|||||||
import io # For file content
|
import io # For file content
|
||||||
|
|
||||||
from auth.google_auth import get_authenticated_google_service
|
from auth.google_auth import get_authenticated_google_service
|
||||||
|
from core.utils import extract_office_xml_text
|
||||||
from core.server import server
|
from core.server import server
|
||||||
from core.server import (
|
from core.server import (
|
||||||
DRIVE_READONLY_SCOPE,
|
DRIVE_READONLY_SCOPE,
|
||||||
@@ -26,53 +26,6 @@ from core.server import (
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
# Helper: pull raw text from OOXML containers (docx / xlsx / pptx)
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
def _extract_office_xml_text(file_bytes: bytes, mime_type: str) -> Optional[str]:
    """
    Scrape readable text out of a Word, Excel or PowerPoint (OOXML) file.

    Opens the bytes as a ZIP archive, picks the XML parts that belong to the
    given MIME type and gathers the text of every <t> / <v> element.
    Returns the gathered text, or None when the MIME type is not one of the
    three handled OOXML types, nothing readable was found, or any error
    occurred while reading the archive.
    Std-lib only: zipfile + ElementTree.
    """
    word_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    slides_mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    sheet_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    text_tags = {"t", "v"}

    try:
        with zipfile.ZipFile(io.BytesIO(file_bytes)) as archive:
            # Choose which archive members to parse for this document kind.
            if mime_type == word_mime:
                members = ["word/document.xml"]
            elif mime_type == slides_mime:
                members = [
                    name
                    for name in archive.namelist()
                    if name.startswith("ppt/slides/slide")
                ]
            elif mime_type == sheet_mime:
                members = [
                    name
                    for name in archive.namelist()
                    if name.startswith("xl/worksheets/sheet")
                ]
            else:
                return None

            chunks: List[str] = []
            for name in members:
                try:
                    root = ET.fromstring(archive.read(name))
                    # Word/PowerPoint keep text in <w:t> / <a:t>,
                    # Excel keeps cell values in <v>.
                    for node in root.iter():
                        local_name = node.tag.rsplit("}", 1)[-1]  # drop namespace
                        if local_name in text_tags and node.text:
                            chunks.append(node.text)
                    chunks.append("\n")  # marks the end of a part / sheet / slide
                except Exception:
                    # A broken slide/sheet is skipped; the rest is still used.
                    continue

            joined = "\n".join(chunks).strip()
            return joined if joined else None
    except Exception as e:
        logger.error(f"Failed to extract file content: {e}")
        # Any failure means "not handled" for the caller.
        return None
|
|
||||||
|
|
||||||
@server.tool()
|
@server.tool()
|
||||||
async def search_drive_files(
|
async def search_drive_files(
|
||||||
user_google_email: str,
|
user_google_email: str,
|
||||||
@@ -223,7 +176,7 @@ async def get_drive_file_content(
|
|||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Attempt Office XML extraction
|
# Attempt Office XML extraction
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
office_text = _extract_office_xml_text(file_content_bytes, mime_type)
|
office_text = extract_office_xml_text(file_content_bytes, mime_type)
|
||||||
if office_text:
|
if office_text:
|
||||||
body_text = office_text
|
body_text = office_text
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user