Add Google Docs checklist/checkbox recognition in markdown output

Detect checklist items in Google Docs API responses and render them as
- [ ] (unchecked) and - [x] (checked) in markdown, so LLMs understand
checkbox state instead of writing literal [x] or DONE text.

Closes #516

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Taylor Wilsdon
2026-03-01 09:30:05 -05:00
parent ca00c74634
commit df9640b321
2 changed files with 120 additions and 8 deletions

View File

@@ -5,6 +5,7 @@ Converts Google Docs API JSON responses to clean Markdown, preserving:
- Headings (H1-H6, Title, Subtitle)
- Bold, italic, strikethrough, code, links
- Ordered and unordered lists with nesting
- Checklists with checked/unchecked state
- Tables with header row separators
"""
@@ -60,9 +61,20 @@ def convert_doc_to_markdown(doc: dict[str, Any]) -> str:
if bullet:
list_id = bullet["listId"]
nesting = bullet.get("nestingLevel", 0)
is_ordered = _is_ordered_list(lists_meta, list_id, nesting)
if is_ordered:
if _is_checklist(lists_meta, list_id, nesting):
checked = _is_checked(para)
checkbox = "[x]" if checked else "[ ]"
indent = " " * nesting
# Re-render text without strikethrough for checked items
# to avoid redundant ~~text~~ alongside [x]
cb_text = (
_convert_paragraph_text(para, skip_strikethrough=True)
if checked
else text
)
lines.append(f"{indent}- {checkbox} {cb_text}")
elif _is_ordered_list(lists_meta, list_id, nesting):
key = (list_id, nesting)
ordered_counters[key] = ordered_counters.get(key, 0) + 1
counter = ordered_counters[key]
@@ -102,16 +114,20 @@ def convert_doc_to_markdown(doc: dict[str, Any]) -> str:
return result
def _convert_paragraph_text(para: dict[str, Any]) -> str:
def _convert_paragraph_text(
para: dict[str, Any], skip_strikethrough: bool = False
) -> str:
"""Convert paragraph elements to inline markdown text."""
parts: list[str] = []
for elem in para.get("elements", []):
if "textRun" in elem:
parts.append(_convert_text_run(elem["textRun"]))
parts.append(_convert_text_run(elem["textRun"], skip_strikethrough))
return "".join(parts).strip()
def _convert_text_run(text_run: dict[str, Any]) -> str:
def _convert_text_run(
text_run: dict[str, Any], skip_strikethrough: bool = False
) -> str:
"""Convert a single text run to markdown."""
content = text_run.get("content", "")
style = text_run.get("textStyle", {})
@@ -120,10 +136,12 @@ def _convert_text_run(text_run: dict[str, Any]) -> str:
if not text:
return ""
return _apply_text_style(text, style)
return _apply_text_style(text, style, skip_strikethrough)
def _apply_text_style(text: str, style: dict[str, Any]) -> str:
def _apply_text_style(
text: str, style: dict[str, Any], skip_strikethrough: bool = False
) -> str:
"""Apply markdown formatting based on text style."""
link = style.get("link", {})
url = link.get("url")
@@ -143,7 +161,7 @@ def _apply_text_style(text: str, style: dict[str, Any]) -> str:
elif italic:
text = f"*{text}*"
if strikethrough:
if strikethrough and not skip_strikethrough:
text = f"~~{text}~~"
if url:
@@ -163,6 +181,37 @@ def _is_ordered_list(lists_meta: dict[str, Any], list_id: str, nesting: int) ->
return False
def _is_checklist(lists_meta: dict[str, Any], list_id: str, nesting: int) -> bool:
"""Check if a list at a given nesting level is a checklist.
Google Docs checklists are distinguished from regular bullet lists by having
GLYPH_TYPE_UNSPECIFIED with no glyphSymbol — the Docs UI renders interactive
checkboxes rather than a static glyph character.
"""
list_info = lists_meta.get(list_id, {})
nesting_levels = list_info.get("listProperties", {}).get("nestingLevels", [])
if nesting < len(nesting_levels):
level = nesting_levels[nesting]
glyph_type = level.get("glyphType", "")
has_glyph_symbol = "glyphSymbol" in level
return glyph_type in ("", "GLYPH_TYPE_UNSPECIFIED") and not has_glyph_symbol
return False
def _is_checked(para: dict[str, Any]) -> bool:
"""Check if a checklist item is checked.
Google Docs marks checked checklist items by applying strikethrough
formatting to the paragraph text.
"""
for elem in para.get("elements", []):
if "textRun" in elem:
content = elem["textRun"].get("content", "").strip()
if content:
return elem["textRun"].get("textStyle", {}).get("strikethrough", False)
return False
def _convert_table(table: dict[str, Any]) -> str:
"""Convert a table element to markdown."""
rows = table.get("tableRows", [])

View File

@@ -270,6 +270,69 @@ class TestLists:
assert "- Item two" in md
def _checklist_item(content: str, text_style: dict) -> dict:
    """Build one level-0 checklist paragraph element for CHECKLIST_DOC."""
    return {
        "paragraph": {
            "elements": [{"textRun": {"content": content, "textStyle": text_style}}],
            "paragraphStyle": {"namedStyleType": "NORMAL_TEXT"},
            "bullet": {"listId": "kix.checklist001", "nestingLevel": 0},
        }
    }


# Fixture: a document with a single checklist list holding two items, one
# plain ("Buy groceries") and one with strikethrough text ("Walk the dog").
CHECKLIST_DOC = {
    "title": "Checklist Test",
    "lists": {
        "kix.checklist001": {
            "listProperties": {
                # Unspecified glyph type and no glyphSymbol key.
                "nestingLevels": [{"glyphType": "GLYPH_TYPE_UNSPECIFIED"}]
            }
        }
    },
    "body": {
        "content": [
            {"sectionBreak": {"sectionStyle": {}}},
            _checklist_item("Buy groceries\n", {}),
            _checklist_item("Walk the dog\n", {"strikethrough": True}),
        ]
    },
}
class TestChecklists:
    """Checklist items render as markdown task-list entries."""

    def test_unchecked(self):
        """An item without strikethrough gets an empty checkbox."""
        rendered = convert_doc_to_markdown(CHECKLIST_DOC)
        assert "- [ ] Buy groceries" in rendered

    def test_checked(self):
        """An item with strikethrough text gets a ticked checkbox."""
        rendered = convert_doc_to_markdown(CHECKLIST_DOC)
        assert "- [x] Walk the dog" in rendered

    def test_checked_no_strikethrough(self):
        """Checked items should not have redundant ~~strikethrough~~ markdown."""
        rendered = convert_doc_to_markdown(CHECKLIST_DOC)
        assert "~~Walk the dog~~" not in rendered

    def test_regular_bullet_not_checklist(self):
        """Bullet lists with glyphSymbol should remain as plain bullets."""
        rendered = convert_doc_to_markdown(LIST_DOC)
        # Neither checkbox marker may appear anywhere in a plain bullet list.
        for marker in ("[ ]", "[x]"):
            assert marker not in rendered
class TestEmptyDoc:
def test_empty(self):
md = convert_doc_to_markdown({"title": "Empty", "body": {"content": []}})