add helpers

2025-08-10 14:54:18 -04:00
parent d67e1fcf0f
commit 8ca2c4e83b
2 changed files with 782 additions and 0 deletions
--- a/gdocs/docs_structure.py
+++ b/gdocs/docs_structure.py
@@ -0,0 +1,340 @@
+"""
+Google Docs Document Structure Parsing and Analysis
+
+This module provides utilities for parsing and analyzing the structure
+of Google Docs documents, including finding tables, cells, and other elements.
+"""
+import logging
+from typing import Dict, Any, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+def parse_document_structure(doc_data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Build a navigable summary of a Google Docs document.
+
+    Body elements the element parser does not recognise are dropped. Tables
+    appear both in ``body`` and in ``tables`` (the same dict objects).
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+
+    Returns:
+        Dictionary with the keys ``title``, ``body``, ``tables``, ``headers``,
+        ``footers`` and ``total_length`` (end index of the last parsed body
+        element, or 0 when nothing was parsed)
+    """
+    raw_elements = doc_data.get('body', {}).get('content', [])
+
+    # Keep only the elements the parser understood (it returns None otherwise).
+    parsed_body = [info for info in map(_parse_element, raw_elements) if info]
+    parsed_tables = [info for info in parsed_body if info['type'] == 'table']
+
+    # Headers and footers are keyed by their segment id.
+    parsed_headers = {
+        segment_id: _parse_segment(segment)
+        for segment_id, segment in doc_data.get('headers', {}).items()
+    }
+    parsed_footers = {
+        segment_id: _parse_segment(segment)
+        for segment_id, segment in doc_data.get('footers', {}).items()
+    }
+
+    # Length is the end index of the last parsed body element, if any.
+    total_length = parsed_body[-1].get('end_index', 0) if parsed_body else 0
+
+    return {
+        'title': doc_data.get('title', ''),
+        'body': parsed_body,
+        'tables': parsed_tables,
+        'headers': parsed_headers,
+        'footers': parsed_footers,
+        'total_length': total_length,
+    }
+
+
+def _parse_element(element: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+    """
+    Parse a single document element (paragraph, table, section break or
+    table of contents) into a flat info dict.
+
+    Args:
+        element: Element data from document
+
+    Returns:
+        Parsed element information, or None for any other element kind
+    """
+    element_info = {
+        'start_index': element.get('startIndex', 0),
+        'end_index': element.get('endIndex', 0)
+    }
+
+    if 'paragraph' in element:
+        paragraph = element['paragraph']
+        element_info['type'] = 'paragraph'
+        element_info['text'] = _extract_paragraph_text(paragraph)
+        element_info['style'] = paragraph.get('paragraphStyle', {})
+    elif 'table' in element:
+        table = element['table']
+        rows = table.get('tableRows', [])
+        element_info['type'] = 'table'
+        element_info['rows'] = len(rows)
+        # Column count comes from the first row; with no rows it is 0 rather
+        # than an IndexError from indexing an empty ``tableRows`` list.
+        element_info['columns'] = len(rows[0].get('tableCells', [])) if rows else 0
+        element_info['cells'] = _parse_table_cells(table)
+        element_info['table_style'] = table.get('tableStyle', {})
+    elif 'sectionBreak' in element:
+        element_info['type'] = 'section_break'
+        element_info['section_style'] = element['sectionBreak'].get('sectionStyle', {})
+    elif 'tableOfContents' in element:
+        element_info['type'] = 'table_of_contents'
+    else:
+        return None
+
+    return element_info
+
+
+def _parse_table_cells(table: Dict[str, Any]) -> List[List[Dict[str, Any]]]:
+    """
+    Describe every cell of a table, row by row.
+
+    Each cell dict records its ``row``/``column`` position, its index span,
+    an ``insertion_index`` for writing text into the cell, the extracted
+    text (``content``) and the untouched API elements (``content_elements``).
+    The insertion index is the ``startIndex`` of the first paragraph element
+    that has one, else the cell's own ``startIndex`` plus one.
+
+    Args:
+        table: Table element data
+
+    Returns:
+        2D list of cell information (outer list: rows, inner list: cells)
+    """
+    def first_text_position(cell: Dict[str, Any]) -> int:
+        # Start of the first paragraph element that reports a startIndex;
+        # if no paragraph offers one, fall back to one past the cell start.
+        fallback = cell.get('startIndex', 0) + 1
+        for item in cell.get('content', []):
+            parts = item['paragraph'].get('elements', []) if 'paragraph' in item else []
+            if parts and 'startIndex' in parts[0]:
+                return parts[0]['startIndex']
+        return fallback
+
+    # One dict per cell; row/column numbers are 0-based enumerate positions.
+    return [
+        [
+            {
+                'row': row_no,
+                'column': col_no,
+                'start_index': cell.get('startIndex', 0),
+                'end_index': cell.get('endIndex', 0),
+                'insertion_index': first_text_position(cell),
+                'content': _extract_cell_text(cell),
+                'content_elements': cell.get('content', []),
+            }
+            for col_no, cell in enumerate(row.get('tableCells', []))
+        ]
+        for row_no, row in enumerate(table.get('tableRows', []))
+    ]
+
+
+def _extract_paragraph_text(paragraph: Dict[str, Any]) -> str:
+    """Concatenate the ``content`` of every text run in a paragraph, in order."""
+    return ''.join(
+        piece['textRun'].get('content', '')
+        for piece in paragraph.get('elements', [])
+        if 'textRun' in piece  # elements without a text run contribute nothing
+    )
+
+
+def _extract_cell_text(cell: Dict[str, Any]) -> str:
+    """Join the text of all paragraphs found directly in a table cell's content."""
+    return ''.join(
+        _extract_paragraph_text(item['paragraph'])
+        for item in cell.get('content', [])
+        if 'paragraph' in item  # non-paragraph content is ignored
+    )
+
+
+def _parse_segment(segment_data: Dict[str, Any]) -> Dict[str, Any]:
+    """Return a segment's content plus its start/end index (both 0 when empty)."""
+    items = segment_data.get('content', [])
+    first, last = (items[0], items[-1]) if items else ({}, {})
+    return {'content': items,
+            'start_index': first.get('startIndex', 0),
+            'end_index': last.get('endIndex', 0)}
+
+
+def find_tables(doc_data: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    List every table of the document body in the order it was parsed.
+
+    The document is run through ``parse_document_structure`` and each entry
+    of its ``tables`` list is reduced to position and dimension fields.
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+
+    Returns:
+        List of dictionaries, one per table, with the keys ``index``
+        (0-based position among the tables), ``start_index``, ``end_index``,
+        ``rows``, ``columns`` and ``cells``
+    """
+    # Fields taken over unchanged from each parsed table entry; only
+    # ``index`` is added on top.
+    copied_fields = ('start_index', 'end_index', 'rows', 'columns', 'cells')
+    parsed_tables = parse_document_structure(doc_data)['tables']
+
+    return [
+        {'index': position, **{field: entry[field] for field in copied_fields}}
+        for position, entry in enumerate(parsed_tables)
+    ]
+
+
+def get_table_cell_indices(doc_data: Dict[str, Any], table_index: int = 0) -> Optional[List[List[Tuple[int, int]]]]:
+    """
+    Get content indices for all cells in a specific table.
+
+    For a cell whose first paragraph starts with a text run, the range is
+    that text run's start/end index; any other cell falls back to the cell
+    boundaries shrunk by one on each side.
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+        table_index: Index of the table (0-based; negative values count
+            back from the last table, like list indexing)
+
+    Returns:
+        2D list of (start_index, end_index) tuples for each cell, or None if table not found
+    """
+    tables = find_tables(doc_data)
+
+    # Validate both bounds. An upper-bound-only check lets a negative index
+    # beyond the table count (e.g. -1 with no tables) raise IndexError below
+    # instead of taking the "return None" path.
+    if not -len(tables) <= table_index < len(tables):
+        logger.warning(f"Table index {table_index} not found. Document has {len(tables)} tables.")
+        return None
+
+    cell_indices = []
+    for row in tables[table_index]['cells']:
+        row_indices = []
+        for cell in row:
+            # First paragraph among the cell's content elements, if any.
+            first_para = next(
+                (item['paragraph'] for item in cell.get('content_elements') or []
+                 if 'paragraph' in item),
+                None,
+            )
+            elements = first_para.get('elements') if first_para else None
+
+            if elements and 'textRun' in elements[0]:
+                # Use the span of the paragraph's leading text run.
+                start_idx = elements[0].get('startIndex', cell['start_index'] + 1)
+                end_idx = elements[0].get('endIndex', start_idx + 1)
+                row_indices.append((start_idx, end_idx))
+            else:
+                # Fallback: use cell boundaries with safe margins
+                content_start = cell['start_index'] + 1
+                content_end = cell['end_index'] - 1
+                row_indices.append((content_start, content_end))
+        cell_indices.append(row_indices)
+
+    return cell_indices
+
+
+def find_element_at_index(doc_data: Dict[str, Any], index: int) -> Optional[Dict[str, Any]]:
+    """
+    Find what element exists at a given index in the document.
+
+    An element or cell matches when ``start_index <= index < end_index``; for
+    a table, the matching cell is reported under ``containing_cell``.
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+        index: Position in the document
+
+    Returns:
+        Shallow copy of the parsed element at that position, or None
+    """
+    for element in parse_document_structure(doc_data)['body']:
+        if not element['start_index'] <= index < element['end_index']:
+            continue
+        element_copy = element.copy()
+        if element['type'] == 'table' and 'cells' in element:
+            # Lazily scan the cells and stop at the first hit; nested loops
+            # with a plain ``break`` would leave one row only and scan on.
+            first_hit = next((
+                {'row': r, 'column': c,
+                 'cell_start': cell['start_index'], 'cell_end': cell['end_index']}
+                for r, row in enumerate(element['cells'])
+                for c, cell in enumerate(row)
+                if cell['start_index'] <= index < cell['end_index']
+            ), None)
+            if first_hit is not None:
+                element_copy['containing_cell'] = first_hit
+        return element_copy
+
+    return None
+
+
+def get_next_paragraph_index(doc_data: Dict[str, Any], after_index: int = 0) -> int:
+    """
+    Find a position for inserting content after a given index.
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+        after_index: Index after which to find insertion point
+
+    Returns:
+        Start index of the first body paragraph beginning after ``after_index``;
+        otherwise the total length minus one (or 1 if that length is not positive)
+    """
+    structure = parse_document_structure(doc_data)
+    later_starts = (item['start_index'] for item in structure['body']
+                    if item['type'] == 'paragraph' and item['start_index'] > after_index)
+    first_start = next(later_starts, None)
+    if first_start is not None:
+        return first_start
+    # No paragraph starts after the given index: fall back to the document end.
+    total = structure['total_length']
+    return total - 1 if total > 0 else 1
+
+
+def analyze_document_complexity(doc_data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Summarise how large and how structured a document is.
+
+    Args:
+        doc_data: Raw document data from Google Docs API
+
+    Returns:
+        Dictionary with element, table, paragraph and section-break counts,
+        the total length and header/footer flags; when the document has at
+        least one table it also holds ``total_table_cells`` and
+        ``largest_table`` (both computed as rows * columns)
+    """
+    structure = parse_document_structure(doc_data)
+    body, tables = structure['body'], structure['tables']
+
+    def count_of(kind: str) -> int:
+        return len([item for item in body if item.get('type') == kind])
+
+    stats = {
+        'total_elements': len(body),
+        'tables': len(tables),
+        'paragraphs': count_of('paragraph'),
+        'section_breaks': count_of('section_break'),
+        'total_length': structure['total_length'],
+        'has_headers': bool(structure['headers']),
+        'has_footers': bool(structure['footers'])
+    }
+
+    if tables:
+        cell_counts = [entry['rows'] * entry['columns'] for entry in tables]
+        stats['total_table_cells'] = sum(cell_counts)
+        stats['largest_table'] = max(cell_counts)
+
+    return stats