feat: initial commit from workspace-mcp

2026-03-17 19:23:33 -05:00
commit 395f0e2029
138 changed files with 41691 additions and 0 deletions
--- a/gdocs/docs_structure.py
+++ b/gdocs/docs_structure.py
@@ -0,0 +1,357 @@
+"""
+Google Docs Document Structure Parsing and Analysis
+
+This module provides utilities for parsing and analyzing the structure
+of Google Docs documents, including finding tables, cells, and other elements.
+"""
+
+import logging
+from typing import Any, Optional
+
+# Module-level logger, named after this module via __name__.
+logger = logging.getLogger(__name__)
+
+
+def parse_document_structure(doc_data: dict[str, Any]) -> dict[str, Any]:
+    """
+    Build a navigable summary of a Google Docs document.
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+
+    Returns:
+        Dictionary with the keys "title", "body", "tables", "headers",
+        "footers" and "total_length". "body" lists every recognised
+        top-level element in document order, "tables" lists the subset of
+        those elements whose type is "table", and "total_length" is the end
+        index of the last recognised body element (0 when there is none).
+    """
+    raw_elements = doc_data.get("body", {}).get("content", [])
+
+    # Elements that _parse_element does not recognise come back as None and
+    # are dropped here.
+    body_elements = [
+        parsed for parsed in map(_parse_element, raw_elements) if parsed
+    ]
+    table_elements = [
+        parsed for parsed in body_elements if parsed["type"] == "table"
+    ]
+
+    # The document length is read off the last recognised body element.
+    if body_elements:
+        total_length = body_elements[-1].get("end_index", 0)
+    else:
+        total_length = 0
+
+    return {
+        "title": doc_data.get("title", ""),
+        "body": body_elements,
+        "tables": table_elements,
+        "headers": {
+            segment_id: _parse_segment(segment)
+            for segment_id, segment in doc_data.get("headers", {}).items()
+        },
+        "footers": {
+            segment_id: _parse_segment(segment)
+            for segment_id, segment in doc_data.get("footers", {}).items()
+        },
+        "total_length": total_length,
+    }
+
+
+def _parse_element(element: dict[str, Any]) -> Optional[dict[str, Any]]:
+    """
+    Parse a single top-level document element.
+
+    Recognised kinds are "paragraph", "table", "sectionBreak" and
+    "tableOfContents"; they are checked in that order and the first key
+    present in the element decides the result type.
+
+    Args:
+        element: Element data from document
+
+    Returns:
+        Dictionary with "start_index", "end_index", "type" and the
+        type-specific fields, or None when the element is of no recognised
+        kind.
+    """
+    element_info: dict[str, Any] = {
+        "start_index": element.get("startIndex", 0),
+        "end_index": element.get("endIndex", 0),
+    }
+
+    if "paragraph" in element:
+        paragraph = element["paragraph"]
+        element_info["type"] = "paragraph"
+        element_info["text"] = _extract_paragraph_text(paragraph)
+        element_info["style"] = paragraph.get("paragraphStyle", {})
+
+    elif "table" in element:
+        table = element["table"]
+        table_rows = table.get("tableRows", [])
+        element_info["type"] = "table"
+        element_info["rows"] = len(table_rows)
+        # The column count is taken from the first row. A table whose
+        # "tableRows" list is missing or empty has no first row, so it is
+        # reported with 0 columns instead of raising IndexError.
+        element_info["columns"] = (
+            len(table_rows[0].get("tableCells", [])) if table_rows else 0
+        )
+        element_info["cells"] = _parse_table_cells(table)
+        element_info["table_style"] = table.get("tableStyle", {})
+
+    elif "sectionBreak" in element:
+        element_info["type"] = "section_break"
+        element_info["section_style"] = element["sectionBreak"].get("sectionStyle", {})
+
+    elif "tableOfContents" in element:
+        element_info["type"] = "table_of_contents"
+
+    else:
+        # Unknown element kinds are reported as None so callers can skip them.
+        return None
+
+    return element_info
+
+
+def _parse_table_cells(table: dict[str, Any]) -> list[list[dict[str, Any]]]:
+    """
+    Describe every cell of a table: its position, index range and text.
+
+    Args:
+        table: Table element data
+
+    Returns:
+        2D list (rows, then columns) of cell dictionaries. Each dictionary
+        has the keys "row", "column", "start_index", "end_index",
+        "insertion_index", "content" and "content_elements".
+    """
+    grid = []
+    for row_number, table_row in enumerate(table.get("tableRows", [])):
+        parsed_row = []
+        for column_number, table_cell in enumerate(table_row.get("tableCells", [])):
+            cell_content = table_cell.get("content", [])
+            cell_start = table_cell.get("startIndex", 0)
+
+            # Candidate insertion points: the start index of the first
+            # element of each paragraph in the cell, skipping paragraphs
+            # that are empty or whose first element has no "startIndex".
+            paragraph_starts = (
+                item["paragraph"]["elements"][0]["startIndex"]
+                for item in cell_content
+                if "paragraph" in item
+                and item["paragraph"].get("elements", [])
+                and "startIndex" in item["paragraph"]["elements"][0]
+            )
+            # Take the first candidate; fall back to one past the cell start.
+            insert_at = next(paragraph_starts, cell_start + 1)
+
+            parsed_row.append(
+                {
+                    "row": row_number,
+                    "column": column_number,
+                    "start_index": cell_start,
+                    "end_index": table_cell.get("endIndex", 0),
+                    "insertion_index": insert_at,  # Where to insert text in this cell
+                    "content": _extract_cell_text(table_cell),
+                    "content_elements": cell_content,
+                }
+            )
+        grid.append(parsed_row)
+    return grid
+
+
+def _extract_paragraph_text(paragraph: dict[str, Any]) -> str:
+    """Concatenate the content of every text run in a paragraph.
+
+    Elements without a "textRun" key are skipped, and a text run without
+    "content" contributes an empty string.
+    """
+    return "".join(
+        piece["textRun"].get("content", "")
+        for piece in paragraph.get("elements", [])
+        if "textRun" in piece
+    )
+
+
+def _extract_cell_text(cell: dict[str, Any]) -> str:
+    """Concatenate the text of every paragraph directly inside a table cell.
+
+    Content entries without a "paragraph" key are skipped.
+    """
+    return "".join(
+        _extract_paragraph_text(item["paragraph"])
+        for item in cell.get("content", [])
+        if "paragraph" in item
+    )
+
+
+def _parse_segment(segment_data: dict[str, Any]) -> dict[str, Any]:
+    """Summarise a document segment (header/footer).
+
+    Returns the segment's raw content list together with the start index of
+    its first entry and the end index of its last entry; both indices are 0
+    when the segment has no content.
+    """
+    content = segment_data.get("content", [])
+
+    if content:
+        first_start = content[0].get("startIndex", 0)
+        last_end = content[-1].get("endIndex", 0)
+    else:
+        first_start = last_end = 0
+
+    return {
+        "content": content,
+        "start_index": first_start,
+        "end_index": last_end,
+    }
+
+
+def find_tables(doc_data: dict[str, Any]) -> list[dict[str, Any]]:
+    """
+    List every top-level table in the document with its position and size.
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+
+    Returns:
+        One dictionary per table, in document order, with the keys "index"
+        (0-based position among the tables), "start_index", "end_index",
+        "rows", "columns" and "cells".
+    """
+    parsed_tables = parse_document_structure(doc_data)["tables"]
+    copied_keys = ("start_index", "end_index", "rows", "columns", "cells")
+
+    return [
+        {"index": position, **{key: parsed[key] for key in copied_keys}}
+        for position, parsed in enumerate(parsed_tables)
+    ]
+
+
+def get_table_cell_indices(
+    doc_data: dict[str, Any], table_index: int = 0
+) -> Optional[list[list[tuple[int, int]]]]:
+    """
+    Get content indices for all cells in a specific table.
+
+    For each cell, the range of the first element of the cell's first
+    paragraph is used when that element is a text run. Otherwise the cell's
+    own boundaries, shrunk by one on each side, are used.
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+        table_index: Index of the table (0-based)
+
+    Returns:
+        2D list of (start_index, end_index) tuples for each cell, or None if
+        table_index does not refer to a table in the document (a warning is
+        logged in that case).
+    """
+    tables = find_tables(doc_data)
+
+    # Negative indices are rejected explicitly: list indexing would otherwise
+    # count from the end and silently return a different table, or raise
+    # IndexError when the document has no tables at all.
+    if not 0 <= table_index < len(tables):
+        logger.warning(
+            "Table index %s not found. Document has %s tables.",
+            table_index,
+            len(tables),
+        )
+        return None
+
+    cell_indices = []
+    for row in tables[table_index]["cells"]:
+        row_indices = []
+        for cell in row:
+            # First paragraph among the cell's content elements, if any.
+            first_para = next(
+                (
+                    item["paragraph"]
+                    for item in cell.get("content_elements", [])
+                    if "paragraph" in item
+                ),
+                None,
+            )
+
+            if first_para and first_para.get("elements"):
+                first_text_element = first_para["elements"][0]
+                if "textRun" in first_text_element:
+                    # Use the range of the first text run in the paragraph.
+                    start_idx = first_text_element.get(
+                        "startIndex", cell["start_index"] + 1
+                    )
+                    end_idx = first_text_element.get("endIndex", start_idx + 1)
+                    row_indices.append((start_idx, end_idx))
+                    continue
+
+            # Fallback: use cell boundaries with a margin of one on each side.
+            row_indices.append((cell["start_index"] + 1, cell["end_index"] - 1))
+        cell_indices.append(row_indices)
+
+    return cell_indices
+
+
+def find_element_at_index(
+    doc_data: dict[str, Any], index: int
+) -> Optional[dict[str, Any]]:
+    """
+    Find what element exists at a given index in the document.
+
+    An element matches when start_index <= index < end_index. Only the
+    recognised top-level body elements are searched.
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+        index: Position in the document
+
+    Returns:
+        A shallow copy of the matching element's information, or None when
+        no element covers the index. For a table, the copy also carries a
+        "containing_cell" entry (row, column, cell_start, cell_end) when one
+        of its cells covers the index.
+    """
+    structure = parse_document_structure(doc_data)
+
+    for element in structure["body"]:
+        if not (element["start_index"] <= index < element["end_index"]):
+            continue
+
+        # Shallow copy so the extra key is not written into the parsed element.
+        element_copy = element.copy()
+
+        if element["type"] == "table" and "cells" in element:
+            # Stop at the first cell covering the index; later rows and
+            # columns are not scanned once a match has been found.
+            containing_cell = next(
+                (
+                    {
+                        "row": row_idx,
+                        "column": col_idx,
+                        "cell_start": cell["start_index"],
+                        "cell_end": cell["end_index"],
+                    }
+                    for row_idx, row in enumerate(element["cells"])
+                    for col_idx, cell in enumerate(row)
+                    if cell["start_index"] <= index < cell["end_index"]
+                ),
+                None,
+            )
+            if containing_cell is not None:
+                element_copy["containing_cell"] = containing_cell
+
+        return element_copy
+
+    return None
+
+
+def get_next_paragraph_index(doc_data: dict[str, Any], after_index: int = 0) -> int:
+    """
+    Find the next safe position to insert content after a given index.
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+        after_index: Index after which to find insertion point
+
+    Returns:
+        The start index of the first paragraph that begins strictly after
+        after_index. When there is no such paragraph, one less than the
+        document's total length, but never less than 1.
+    """
+    structure = parse_document_structure(doc_data)
+
+    # Find the first paragraph element that starts after the given index
+    for element in structure["body"]:
+        if element["type"] == "paragraph" and element["start_index"] > after_index:
+            return element["start_index"]
+
+    # No paragraph found: fall back to just before the end of the document.
+    # The result is clamped to 1, the value already used for an empty
+    # document, so a total length of 1 no longer yields index 0.
+    return max(structure["total_length"] - 1, 1)
+
+
+def analyze_document_complexity(doc_data: dict[str, Any]) -> dict[str, Any]:
+    """
+    Summarise the size and makeup of a document.
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+
+    Returns:
+        Dictionary with element, table, paragraph and section-break counts,
+        the total length, and flags for the presence of headers and footers.
+        When the document has at least one table, "total_table_cells" and
+        "largest_table" (both computed as rows * columns) are included too.
+    """
+    structure = parse_document_structure(doc_data)
+    body_elements = structure["body"]
+    table_elements = structure["tables"]
+
+    # One pass over the body to collect each element's type label.
+    type_labels = [item.get("type") for item in body_elements]
+
+    stats = {
+        "total_elements": len(body_elements),
+        "tables": len(table_elements),
+        "paragraphs": type_labels.count("paragraph"),
+        "section_breaks": type_labels.count("section_break"),
+        "total_length": structure["total_length"],
+        "has_headers": bool(structure["headers"]),
+        "has_footers": bool(structure["footers"]),
+    }
+
+    # Table statistics are only reported when the document has tables.
+    if table_elements:
+        cell_counts = [
+            item["rows"] * item["columns"] for item in table_elements
+        ]
+        stats["total_table_cells"] = sum(cell_counts)
+        stats["largest_table"] = max(cell_counts)
+
+    return stats