feat(docs): add get_doc_as_markdown tool with comment context

Adds a new `get_doc_as_markdown` tool that converts Google Docs to clean Markdown preserving formatting (headings, bold/italic/strikethrough, links, code spans, ordered/unordered lists with nesting, and tables). Optionally overlays comments with their anchor text (quotedFileContent) — the specific text each comment is attached to — in two modes: - inline: footnote-style references placed at the anchor text location - appendix: all comments grouped at the bottom with blockquoted anchors This gives AI agents full document context in a single tool call, unlike get_doc_content which strips all formatting to plain text. New files: - gdocs/docs_markdown.py: Converter + comment formatting logic - tests/gdocs/test_docs_markdown.py: 18 tests Tool tier: extended (alongside search_docs, export_doc_to_pdf, etc.) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 19:15:56 -08:00
parent 860bc4c16f
commit 08ad7ad308
4 changed files with 667 additions and 0 deletions
--- a/gdocs/docs_tools.py
+++ b/gdocs/docs_tools.py
@@ -36,6 +36,12 @@ from gdocs.docs_structure import (
    analyze_document_complexity,
 )
 from gdocs.docs_tables import extract_table_as_data
+from gdocs.docs_markdown import (
+    convert_doc_to_markdown,
+    format_comments_inline,
+    format_comments_appendix,
+    parse_drive_comments,
+)

 # Import operation managers for complex business logic
 from gdocs.managers import (
@@ -1563,6 +1569,101 @@ async def update_paragraph_style(
    return f"Applied paragraph formatting ({', '.join(summary_parts)}) to range {start_index}-{end_index} in document {document_id}. Link: {link}"


+@server.tool()
+@handle_http_errors("get_doc_as_markdown", is_read_only=True, service_type="docs")
+@require_multiple_services(
+    [
+        {
+            "service_type": "drive",
+            "scopes": "drive_read",
+            "param_name": "drive_service",
+        },
+        {"service_type": "docs", "scopes": "docs_read", "param_name": "docs_service"},
+    ]
+)
+async def get_doc_as_markdown(
+    drive_service: Any,
+    docs_service: Any,
+    user_google_email: str,
+    document_id: str,
+    include_comments: bool = True,
+    comment_mode: str = "inline",
+    include_resolved: bool = False,
+) -> str:
+    """
+    Reads a Google Doc and returns it as clean Markdown with optional comment context.
+
+    Unlike get_doc_content which returns plain text, this tool preserves document
+    formatting as Markdown: headings, bold/italic/strikethrough, links, code spans,
+    ordered/unordered lists with nesting, and tables.
+
+    When comments are included (the default), each comment's anchor text — the specific
+    text the comment was attached to — is preserved, giving full context for the discussion.
+
+    Args:
+        user_google_email: User's Google email address
+        document_id: ID of the Google Doc (or full URL)
+        include_comments: Whether to include comments (default: True)
+        comment_mode: How to display comments:
+            - "inline": Footnote-style references placed at the anchor text location (default)
+            - "appendix": All comments grouped at the bottom with blockquoted anchor text
+            - "none": No comments included
+        include_resolved: Whether to include resolved comments (default: False)
+
+    Returns:
+        str: The document content as Markdown, optionally with comments
+    """
+    logger.info(
+        f"[get_doc_as_markdown] Doc={document_id}, comments={include_comments}, mode={comment_mode}"
+    )
+
+    # Fetch document content via Docs API
+    doc = await asyncio.to_thread(
+        docs_service.documents().get(documentId=document_id).execute
+    )
+
+    markdown = convert_doc_to_markdown(doc)
+
+    if not include_comments or comment_mode == "none":
+        return markdown
+
+    # Fetch comments via Drive API
+    all_comments = []
+    page_token = None
+
+    while True:
+        response = await asyncio.to_thread(
+            drive_service.comments()
+            .list(
+                fileId=document_id,
+                fields="comments(id,content,author,createdTime,modifiedTime,"
+                "resolved,quotedFileContent,"
+                "replies(id,content,author,createdTime,modifiedTime)),"
+                "nextPageToken",
+                includeDeleted=False,
+                pageToken=page_token,
+            )
+            .execute
+        )
+        all_comments.extend(response.get("comments", []))
+        page_token = response.get("nextPageToken")
+        if not page_token:
+            break
+
+    comments = parse_drive_comments(
+        {"comments": all_comments}, include_resolved=include_resolved
+    )
+
+    if not comments:
+        return markdown
+
+    if comment_mode == "inline":
+        return format_comments_inline(markdown, comments)
+    else:
+        appendix = format_comments_appendix(comments)
+        return markdown.rstrip("\n") + "\n\n" + appendix
+
+
 # Create comment management tools for documents
 _comment_tools = create_comment_tools("document", "document_id")