feat: initial commit from workspace-mcp
Some checks failed
Check Maintainer Edits Enabled / check-maintainer-edits (pull_request) Has been cancelled
Check Maintainer Edits Enabled / check-maintainer-edits-internal (pull_request) Has been cancelled
Docker Build and Push to GHCR / build-and-push (pull_request) Has been cancelled
Ruff / ruff (pull_request) Has been cancelled
Some checks failed
Check Maintainer Edits Enabled / check-maintainer-edits (pull_request) Has been cancelled
Check Maintainer Edits Enabled / check-maintainer-edits-internal (pull_request) Has been cancelled
Docker Build and Push to GHCR / build-and-push (pull_request) Has been cancelled
Ruff / ruff (pull_request) Has been cancelled
This commit is contained in:
357
gdocs/docs_structure.py
Normal file
357
gdocs/docs_structure.py
Normal file
@@ -0,0 +1,357 @@
|
||||
"""
|
||||
Google Docs Document Structure Parsing and Analysis
|
||||
|
||||
This module provides utilities for parsing and analyzing the structure
|
||||
of Google Docs documents, including finding tables, cells, and other elements.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_document_structure(doc_data: dict[str, Any]) -> dict[str, Any]:
    """
    Build a navigable summary of a Google Docs document.

    Args:
        doc_data: Raw document data from Google Docs API

    Returns:
        Dictionary with the keys ``title``, ``body`` (parsed body elements in
        document order), ``tables`` (the subset of ``body`` entries whose type
        is ``"table"``; the same dict objects, not copies), ``headers`` and
        ``footers`` (parsed segments keyed by their id) and ``total_length``
        (the ``end_index`` of the last parsed body element, or 0 when no
        element was parsed).
    """
    raw_elements = doc_data.get("body", {}).get("content", [])

    # _parse_element yields None for element kinds it does not recognise;
    # those are left out of the parsed body.
    parsed_body = [info for info in map(_parse_element, raw_elements) if info]
    parsed_tables = [info for info in parsed_body if info["type"] == "table"]

    # Document length is taken from the last element that was actually parsed.
    document_end = parsed_body[-1].get("end_index", 0) if parsed_body else 0

    return {
        "title": doc_data.get("title", ""),
        "body": parsed_body,
        "tables": parsed_tables,
        "headers": {
            segment_id: _parse_segment(segment)
            for segment_id, segment in doc_data.get("headers", {}).items()
        },
        "footers": {
            segment_id: _parse_segment(segment)
            for segment_id, segment in doc_data.get("footers", {}).items()
        },
        "total_length": document_end,
    }
|
||||
|
||||
|
||||
def _parse_element(element: dict[str, Any]) -> Optional[dict[str, Any]]:
    """
    Parse a single document element.

    Recognised element kinds are paragraphs, tables, section breaks and
    tables of contents; the kind is detected by which of the keys
    ``paragraph``, ``table``, ``sectionBreak`` or ``tableOfContents`` is
    present (checked in that order).

    Args:
        element: Element data from document

    Returns:
        A dictionary that always carries ``start_index``, ``end_index``
        (both defaulting to 0 when absent from the element) and ``type``,
        plus kind-specific keys:

        * paragraph: ``text`` and ``style``
        * table: ``rows``, ``columns``, ``cells`` and ``table_style``
        * section_break: ``section_style``
        * table_of_contents: no extra keys

        ``None`` is returned for any other kind of element.
    """
    element_info: dict[str, Any] = {
        "start_index": element.get("startIndex", 0),
        "end_index": element.get("endIndex", 0),
    }

    if "paragraph" in element:
        paragraph = element["paragraph"]
        element_info["type"] = "paragraph"
        element_info["text"] = _extract_paragraph_text(paragraph)
        element_info["style"] = paragraph.get("paragraphStyle", {})

    elif "table" in element:
        table = element["table"]
        table_rows = table.get("tableRows", [])
        element_info["type"] = "table"
        element_info["rows"] = len(table_rows)
        # The column count is read from the first row only. A table whose
        # "tableRows" list is present but empty has no first row, so report
        # zero columns instead of indexing into an empty list.
        element_info["columns"] = (
            len(table_rows[0].get("tableCells", [])) if table_rows else 0
        )
        element_info["cells"] = _parse_table_cells(table)
        element_info["table_style"] = table.get("tableStyle", {})

    elif "sectionBreak" in element:
        element_info["type"] = "section_break"
        element_info["section_style"] = element["sectionBreak"].get("sectionStyle", {})

    elif "tableOfContents" in element:
        element_info["type"] = "table_of_contents"

    else:
        return None

    return element_info
|
||||
|
||||
|
||||
def _parse_table_cells(table: dict[str, Any]) -> list[list[dict[str, Any]]]:
    """
    Describe every cell of a table, row by row.

    Args:
        table: Table element data

    Returns:
        2D list (rows, then columns) of cell dictionaries. Each dictionary
        holds the cell's ``row`` and ``column`` position, its ``start_index``
        and ``end_index`` (0 when absent), an ``insertion_index`` telling
        where text can be inserted, the cell's text as ``content`` and the
        raw child elements as ``content_elements``.
    """
    grid = []
    for row_number, table_row in enumerate(table.get("tableRows", [])):
        parsed_row = []
        for column_number, table_cell in enumerate(table_row.get("tableCells", [])):
            children = table_cell.get("content", [])

            # Fallback insertion point: one position past the cell start.
            insert_at = table_cell.get("startIndex", 0) + 1

            # Prefer the start of the first element of the first paragraph.
            # Only the first paragraph is ever consulted, even when it has
            # no usable start index.
            for child in children:
                if "paragraph" not in child:
                    continue
                runs = child["paragraph"].get("elements", [])
                if runs and "startIndex" in runs[0]:
                    insert_at = runs[0]["startIndex"]
                break

            parsed_row.append(
                {
                    "row": row_number,
                    "column": column_number,
                    "start_index": table_cell.get("startIndex", 0),
                    "end_index": table_cell.get("endIndex", 0),
                    "insertion_index": insert_at,
                    "content": _extract_cell_text(table_cell),
                    "content_elements": children,
                }
            )
        grid.append(parsed_row)
    return grid
|
||||
|
||||
|
||||
def _extract_paragraph_text(paragraph: dict[str, Any]) -> str:
    """Concatenate the content of every text run in a paragraph.

    Paragraph elements without a ``textRun`` key are skipped, and a text run
    without ``content`` contributes an empty string.
    """
    return "".join(
        piece["textRun"].get("content", "")
        for piece in paragraph.get("elements", [])
        if "textRun" in piece
    )
|
||||
|
||||
|
||||
def _extract_cell_text(cell: dict[str, Any]) -> str:
    """Join the text of every paragraph directly inside a table cell.

    Cell content entries that are not paragraphs are ignored; no separator is
    inserted between paragraphs.
    """
    return "".join(
        _extract_paragraph_text(child["paragraph"])
        for child in cell.get("content", [])
        if "paragraph" in child
    )
|
||||
|
||||
|
||||
def _parse_segment(segment_data: dict[str, Any]) -> dict[str, Any]:
    """Summarise a document segment (header/footer).

    Returns the segment's ``content`` list together with the ``startIndex``
    of its first entry and the ``endIndex`` of its last entry. Both indices
    are 0 when the segment has no content or the entry lacks the key.
    """
    content = segment_data.get("content", [])

    if content:
        first_index = content[0].get("startIndex", 0)
        last_index = content[-1].get("endIndex", 0)
    else:
        first_index = last_index = 0

    return {
        "content": content,
        "start_index": first_index,
        "end_index": last_index,
    }
|
||||
|
||||
|
||||
def find_tables(doc_data: dict[str, Any]) -> list[dict[str, Any]]:
    """
    List every table of the document with its position and dimensions.

    Args:
        doc_data: Raw document data from Google Docs API

    Returns:
        One dictionary per table, in the order the tables appear in the
        parsed structure. Each carries the table's ordinal ``index`` plus
        its ``start_index``, ``end_index``, ``rows``, ``columns`` and
        ``cells`` as produced by ``parse_document_structure``.
    """
    parsed = parse_document_structure(doc_data)
    copied_keys = ("start_index", "end_index", "rows", "columns", "cells")

    return [
        {"index": position, **{key: entry[key] for key in copied_keys}}
        for position, entry in enumerate(parsed["tables"])
    ]
|
||||
|
||||
|
||||
def get_table_cell_indices(
    doc_data: dict[str, Any], table_index: int = 0
) -> Optional[list[list[tuple[int, int]]]]:
    """
    Get content indices for all cells in a specific table.

    For each cell the range of the first element of its first paragraph is
    used when that element is a text run; otherwise the cell boundaries,
    shrunk by one position on each side, are used as a fallback.

    Args:
        doc_data: Raw document data from Google Docs API
        table_index: Index of the table (0-based). In-range negative values
            select from the end, as with ordinary list indexing.

    Returns:
        2D list of (start_index, end_index) tuples for each cell, or None if
        table not found (a warning is logged in that case).
    """
    tables = find_tables(doc_data)

    # Check both bounds. Testing only ``table_index >= len(tables)`` lets an
    # out-of-range negative index (e.g. -1 on a document without tables)
    # through to the list lookup, where it raises IndexError instead of
    # producing the documented None.
    if not -len(tables) <= table_index < len(tables):
        logger.warning(
            "Table index %s not found. Document has %s tables.",
            table_index,
            len(tables),
        )
        return None

    table = tables[table_index]
    cell_indices = []

    for row in table["cells"]:
        row_indices = []
        for cell in row:
            cell_content = cell.get("content_elements", [])

            # Look for the first paragraph in the cell content.
            first_para = None
            for element in cell_content:
                if "paragraph" in element:
                    first_para = element["paragraph"]
                    break

            if first_para and first_para.get("elements"):
                # Use the span of the paragraph's first element, but only
                # when that element is a text run.
                first_text_element = first_para["elements"][0]
                if "textRun" in first_text_element:
                    start_idx = first_text_element.get(
                        "startIndex", cell["start_index"] + 1
                    )
                    end_idx = first_text_element.get("endIndex", start_idx + 1)
                    row_indices.append((start_idx, end_idx))
                    continue

            # Fallback: use cell boundaries with safe margins
            content_start = cell["start_index"] + 1
            content_end = cell["end_index"] - 1
            row_indices.append((content_start, content_end))
        cell_indices.append(row_indices)

    return cell_indices
|
||||
|
||||
|
||||
def find_element_at_index(
    doc_data: dict[str, Any], index: int
) -> Optional[dict[str, Any]]:
    """
    Find what element exists at a given index in the document.

    An element matches when ``start_index <= index < end_index``. The first
    matching body element wins.

    Args:
        doc_data: Raw document data from Google Docs API
        index: Position in the document

    Returns:
        A shallow copy of the parsed element at that position, or None when
        no body element covers the index. For a table, the copy additionally
        gets a ``containing_cell`` entry (``row``, ``column``, ``cell_start``,
        ``cell_end``) when one of its cells covers the index.
    """
    structure = parse_document_structure(doc_data)

    for element in structure["body"]:
        if not element["start_index"] <= index < element["end_index"]:
            continue

        element_copy = element.copy()

        # If it's a table, find which cell contains the index
        if element["type"] == "table" and "cells" in element:
            # Stop at the first matching cell. A nested for/break would only
            # leave the column loop and keep scanning the remaining rows,
            # where a later match could overwrite the result.
            hit = next(
                (
                    (row_idx, col_idx, cell)
                    for row_idx, row in enumerate(element["cells"])
                    for col_idx, cell in enumerate(row)
                    if cell["start_index"] <= index < cell["end_index"]
                ),
                None,
            )
            if hit is not None:
                row_idx, col_idx, cell = hit
                element_copy["containing_cell"] = {
                    "row": row_idx,
                    "column": col_idx,
                    "cell_start": cell["start_index"],
                    "cell_end": cell["end_index"],
                }

        return element_copy

    return None
|
||||
|
||||
|
||||
def get_next_paragraph_index(doc_data: dict[str, Any], after_index: int = 0) -> int:
    """
    Locate the next position at which content can be inserted.

    Args:
        doc_data: Raw document data from Google Docs API
        after_index: Index after which to find insertion point

    Returns:
        The ``start_index`` of the first parsed paragraph that starts strictly
        after ``after_index``. When there is no such paragraph, the parsed
        document length minus one, or 1 if that length is not positive.
    """
    structure = parse_document_structure(doc_data)

    paragraph_starts = (
        item["start_index"]
        for item in structure["body"]
        if item["type"] == "paragraph" and item["start_index"] > after_index
    )
    for start in paragraph_starts:
        # The first qualifying paragraph decides the answer.
        return start

    # No later paragraph: fall back to the end of the document.
    document_length = structure["total_length"]
    if document_length > 0:
        return document_length - 1
    return 1
|
||||
|
||||
|
||||
def analyze_document_complexity(doc_data: dict[str, Any]) -> dict[str, Any]:
    """
    Compute summary statistics for a document.

    Args:
        doc_data: Raw document data from Google Docs API

    Returns:
        Dictionary with ``total_elements``, ``tables``, ``paragraphs``,
        ``section_breaks``, ``total_length``, ``has_headers`` and
        ``has_footers``. When the document contains at least one table,
        ``total_table_cells`` and ``largest_table`` are added; both are based
        on each table's rows multiplied by its columns.
    """
    structure = parse_document_structure(doc_data)
    body_elements = structure["body"]
    table_entries = structure["tables"]

    element_kinds = [entry.get("type") for entry in body_elements]

    stats = {
        "total_elements": len(body_elements),
        "tables": len(table_entries),
        "paragraphs": element_kinds.count("paragraph"),
        "section_breaks": element_kinds.count("section_break"),
        "total_length": structure["total_length"],
        "has_headers": bool(structure["headers"]),
        "has_footers": bool(structure["footers"]),
    }

    # Table statistics are only reported for documents that contain tables.
    if table_entries:
        cell_counts = [entry["rows"] * entry["columns"] for entry in table_entries]
        stats["total_table_cells"] = sum(cell_counts)
        stats["largest_table"] = max(cell_counts)

    return stats
|
||||
Reference in New Issue
Block a user