add helpers
This commit is contained in:
340
gdocs/docs_structure.py
Normal file
340
gdocs/docs_structure.py
Normal file
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
Google Docs Document Structure Parsing and Analysis
|
||||
|
||||
This module provides utilities for parsing and analyzing the structure
|
||||
of Google Docs documents, including finding tables, cells, and other elements.
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_document_structure(doc_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build a navigable summary of a Google Docs document.

    Args:
        doc_data: Raw document data from Google Docs API

    Returns:
        Dictionary with the keys ``title``, ``body`` (parsed body elements in
        document order), ``tables`` (the table entries of ``body``),
        ``headers`` and ``footers`` (parsed segments keyed by their id) and
        ``total_length`` (``end_index`` of the last parsed body element, or 0
        when nothing was parsed).
    """
    title = doc_data.get('title', '')

    parsed_body = []
    for raw_element in doc_data.get('body', {}).get('content', []):
        parsed = _parse_element(raw_element)
        # Elements the parser does not recognise come back as None; skip them.
        if parsed:
            parsed_body.append(parsed)

    # The table list shares its dict objects with the body list.
    table_entries = [item for item in parsed_body if item['type'] == 'table']

    # The document length is read off the last element that was parsed.
    total_length = parsed_body[-1].get('end_index', 0) if parsed_body else 0

    header_segments = {
        segment_id: _parse_segment(segment)
        for segment_id, segment in doc_data.get('headers', {}).items()
    }
    footer_segments = {
        segment_id: _parse_segment(segment)
        for segment_id, segment in doc_data.get('footers', {}).items()
    }

    return {
        'title': title,
        'body': parsed_body,
        'tables': table_entries,
        'headers': header_segments,
        'footers': footer_segments,
        'total_length': total_length,
    }
|
||||
|
||||
|
||||
def _parse_element(element: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Parse a single document element.

    Args:
        element: Element data from document

    Returns:
        A dictionary with ``start_index``, ``end_index`` and ``type`` plus
        type-specific keys, or None when the element is not a paragraph,
        table, section break or table of contents:

        * ``paragraph``: ``text`` and ``style``
        * ``table``: ``rows``, ``columns``, ``cells`` and ``table_style``
        * ``section_break``: ``section_style``
        * ``table_of_contents``: no extra keys
    """
    element_info = {
        'start_index': element.get('startIndex', 0),
        'end_index': element.get('endIndex', 0),
    }

    if 'paragraph' in element:
        paragraph = element['paragraph']
        element_info['type'] = 'paragraph'
        element_info['text'] = _extract_paragraph_text(paragraph)
        element_info['style'] = paragraph.get('paragraphStyle', {})

    elif 'table' in element:
        table = element['table']
        table_rows = table.get('tableRows', [])
        element_info['type'] = 'table'
        element_info['rows'] = len(table_rows)
        # The column count is taken from the first row. A table whose
        # 'tableRows' is present but empty has no first row, so report 0
        # columns instead of indexing into the empty list.
        element_info['columns'] = (
            len(table_rows[0].get('tableCells', [])) if table_rows else 0
        )
        element_info['cells'] = _parse_table_cells(table)
        element_info['table_style'] = table.get('tableStyle', {})

    elif 'sectionBreak' in element:
        element_info['type'] = 'section_break'
        element_info['section_style'] = element['sectionBreak'].get('sectionStyle', {})

    elif 'tableOfContents' in element:
        element_info['type'] = 'table_of_contents'

    else:
        # Unrecognised element kinds are reported as "nothing parsed".
        return None

    return element_info
|
||||
|
||||
|
||||
def _parse_table_cells(table: Dict[str, Any]) -> List[List[Dict[str, Any]]]:
    """
    Describe every cell of a table: position, index range and text.

    Args:
        table: Table element data

    Returns:
        One list per table row, each holding one dictionary per cell with the
        keys ``row``, ``column``, ``start_index``, ``end_index``,
        ``insertion_index``, ``content`` (the cell text) and
        ``content_elements`` (the cell's raw content list).
    """

    def _locate_insertion_index(cell_elements, fallback):
        # Use the start index of the first element of the first paragraph
        # that provides one; otherwise keep the caller's fallback.
        for item in cell_elements:
            if 'paragraph' not in item:
                continue
            runs = item['paragraph'].get('elements', [])
            if runs and 'startIndex' in runs[0]:
                return runs[0]['startIndex']
        return fallback

    grid = []
    for row_no, table_row in enumerate(table.get('tableRows', [])):
        parsed_row = []
        for col_no, table_cell in enumerate(table_row.get('tableCells', [])):
            # Default insertion point: one past the cell's own start index.
            default_insertion = table_cell.get('startIndex', 0) + 1
            cell_elements = table_cell.get('content', [])
            insertion_index = _locate_insertion_index(cell_elements, default_insertion)

            parsed_row.append({
                'row': row_no,
                'column': col_no,
                'start_index': table_cell.get('startIndex', 0),
                'end_index': table_cell.get('endIndex', 0),
                'insertion_index': insertion_index,  # Where to insert text in this cell
                'content': _extract_cell_text(table_cell),
                'content_elements': cell_elements,
            })
        grid.append(parsed_row)
    return grid
|
||||
|
||||
|
||||
def _extract_paragraph_text(paragraph: Dict[str, Any]) -> str:
    """Concatenate the content of every text run in a paragraph, in order."""
    return ''.join(
        piece['textRun'].get('content', '')
        for piece in paragraph.get('elements', [])
        if 'textRun' in piece  # non-text elements contribute nothing
    )
|
||||
|
||||
|
||||
def _extract_cell_text(cell: Dict[str, Any]) -> str:
    """Concatenate the text of every paragraph found in a table cell's content."""
    return ''.join(
        _extract_paragraph_text(item['paragraph'])
        for item in cell.get('content', [])
        if 'paragraph' in item  # only paragraphs contribute text
    )
|
||||
|
||||
|
||||
def _parse_segment(segment_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Summarise a document segment (header/footer).

    Returns the segment's raw ``content`` list together with the
    ``startIndex`` of its first element and the ``endIndex`` of its last
    element; both indices are 0 when the segment has no content.
    """
    segment_content = segment_data.get('content', [])

    if segment_content:
        first_start = segment_content[0].get('startIndex', 0)
        last_end = segment_content[-1].get('endIndex', 0)
    else:
        first_start = 0
        last_end = 0

    return {
        'content': segment_content,
        'start_index': first_start,
        'end_index': last_end,
    }
|
||||
|
||||
|
||||
def find_tables(doc_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    List every table of the document with its position and dimensions.

    Args:
        doc_data: Raw document data from Google Docs API

    Returns:
        One dictionary per table, in document order, with the keys ``index``
        (0-based position among the tables), ``start_index``, ``end_index``,
        ``rows``, ``columns`` and ``cells``.
    """
    parsed_tables = parse_document_structure(doc_data)['tables']
    return [
        {
            'index': position,
            'start_index': entry['start_index'],
            'end_index': entry['end_index'],
            'rows': entry['rows'],
            'columns': entry['columns'],
            'cells': entry['cells'],
        }
        for position, entry in enumerate(parsed_tables)
    ]
|
||||
|
||||
|
||||
def get_table_cell_indices(doc_data: Dict[str, Any], table_index: int = 0) -> Optional[List[List[Tuple[int, int]]]]:
    """
    Get content indices for all cells in a specific table.

    For each cell the tuple is the index range of the first element of the
    cell's first paragraph when that element is a text run. Otherwise the
    cell's own boundaries, shrunk by one on each side, are used.

    Args:
        doc_data: Raw document data from Google Docs API
        table_index: Index of the table (0-based). Negative values index from
            the end, as with list indexing.

    Returns:
        2D list of (start_index, end_index) tuples for each cell, or None if
        the table index is out of range (a warning is logged in that case).
    """
    tables = find_tables(doc_data)

    # Reject out-of-range indices in both directions. Checking only the upper
    # bound let an out-of-range negative index (e.g. -1 on a document without
    # tables) escape as an IndexError instead of the documented None.
    if not -len(tables) <= table_index < len(tables):
        logger.warning(f"Table index {table_index} not found. Document has {len(tables)} tables.")
        return None

    table = tables[table_index]
    cell_indices = []

    for row in table['cells']:
        row_indices = []
        for cell in row:
            # Locate the first paragraph in the cell, if any.
            first_para = next(
                (
                    element['paragraph']
                    for element in (cell.get('content_elements') or [])
                    if 'paragraph' in element
                ),
                None,
            )
            para_elements = first_para.get('elements') if first_para else None

            if para_elements and 'textRun' in para_elements[0]:
                # Use the range of the first text run in the paragraph.
                first_text_element = para_elements[0]
                start_idx = first_text_element.get('startIndex', cell['start_index'] + 1)
                end_idx = first_text_element.get('endIndex', start_idx + 1)
                row_indices.append((start_idx, end_idx))
                continue

            # Fallback: use cell boundaries with safe margins
            content_start = cell['start_index'] + 1
            content_end = cell['end_index'] - 1
            row_indices.append((content_start, content_end))
        cell_indices.append(row_indices)

    return cell_indices
|
||||
|
||||
|
||||
def find_element_at_index(doc_data: Dict[str, Any], index: int) -> Optional[Dict[str, Any]]:
    """
    Look up the body element whose index range covers a position.

    Args:
        doc_data: Raw document data from Google Docs API
        index: Position in the document

    Returns:
        A shallow copy of the first parsed body element with
        ``start_index <= index < end_index``, or None when no element covers
        the position. For a table, the copy additionally carries a
        ``containing_cell`` entry (``row``, ``column``, ``cell_start``,
        ``cell_end``) when a cell covers the position.
    """
    body_elements = parse_document_structure(doc_data)['body']

    for body_element in body_elements:
        if not (body_element['start_index'] <= index < body_element['end_index']):
            continue

        found = dict(body_element)

        # For tables, also report which cell the position falls into.
        if body_element['type'] == 'table' and 'cells' in body_element:
            for row_no, row_cells in enumerate(body_element['cells']):
                for col_no, cell in enumerate(row_cells):
                    if not (cell['start_index'] <= index < cell['end_index']):
                        continue
                    found['containing_cell'] = {
                        'row': row_no,
                        'column': col_no,
                        'cell_start': cell['start_index'],
                        'cell_end': cell['end_index'],
                    }
                    # Only the scan of the current row stops here.
                    break

        return found

    return None
|
||||
|
||||
|
||||
def get_next_paragraph_index(doc_data: Dict[str, Any], after_index: int = 0) -> int:
    """
    Pick an insertion position located after a given index.

    Args:
        doc_data: Raw document data from Google Docs API
        after_index: Index after which to find insertion point

    Returns:
        The start index of the first body paragraph that begins after
        ``after_index``. When there is none, ``total_length - 1`` of the
        parsed document, or 1 if that length is not positive.
    """
    structure = parse_document_structure(doc_data)

    # Start index of the first paragraph beginning past the given position.
    upcoming_start = next(
        (
            entry['start_index']
            for entry in structure['body']
            if entry['type'] == 'paragraph' and entry['start_index'] > after_index
        ),
        None,
    )
    if upcoming_start is not None:
        return upcoming_start

    # No later paragraph: fall back to the end of the document.
    document_length = structure['total_length']
    if document_length > 0:
        return document_length - 1
    return 1
|
||||
|
||||
|
||||
def analyze_document_complexity(doc_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Collect summary statistics about a document's structure.

    Args:
        doc_data: Raw document data from Google Docs API

    Returns:
        Dictionary with ``total_elements``, ``tables``, ``paragraphs``,
        ``section_breaks``, ``total_length``, ``has_headers`` and
        ``has_footers``. When the document contains tables it also holds
        ``total_table_cells`` and ``largest_table``, both computed as
        ``rows * columns`` per table.
    """
    structure = parse_document_structure(doc_data)
    body_elements = structure['body']
    table_entries = structure['tables']

    element_types = [entry.get('type') for entry in body_elements]

    stats = {
        'total_elements': len(body_elements),
        'tables': len(table_entries),
        'paragraphs': element_types.count('paragraph'),
        'section_breaks': element_types.count('section_break'),
        'total_length': structure['total_length'],
        'has_headers': bool(structure['headers']),
        'has_footers': bool(structure['footers']),
    }

    # Table statistics are only reported when at least one table exists.
    if table_entries:
        cells_per_table = [entry['rows'] * entry['columns'] for entry in table_entries]
        stats['total_table_cells'] = sum(cells_per_table)
        stats['largest_table'] = max(cells_per_table)

    return stats
|
||||
Reference in New Issue
Block a user