Add Google Docs checklist/checkbox recognition in markdown output
Detect checklist items in Google Docs API responses and render them as `- [ ]` (unchecked) and `- [x]` (checked) in Markdown, so that LLMs can read the checkbox state instead of writing a literal `[x]` or "DONE" text.

Closes #516

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ Converts Google Docs API JSON responses to clean Markdown, preserving:
|
|||||||
- Headings (H1-H6, Title, Subtitle)
|
- Headings (H1-H6, Title, Subtitle)
|
||||||
- Bold, italic, strikethrough, code, links
|
- Bold, italic, strikethrough, code, links
|
||||||
- Ordered and unordered lists with nesting
|
- Ordered and unordered lists with nesting
|
||||||
|
- Checklists with checked/unchecked state
|
||||||
- Tables with header row separators
|
- Tables with header row separators
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -60,9 +61,20 @@ def convert_doc_to_markdown(doc: dict[str, Any]) -> str:
|
|||||||
if bullet:
|
if bullet:
|
||||||
list_id = bullet["listId"]
|
list_id = bullet["listId"]
|
||||||
nesting = bullet.get("nestingLevel", 0)
|
nesting = bullet.get("nestingLevel", 0)
|
||||||
is_ordered = _is_ordered_list(lists_meta, list_id, nesting)
|
|
||||||
|
|
||||||
if is_ordered:
|
if _is_checklist(lists_meta, list_id, nesting):
|
||||||
|
checked = _is_checked(para)
|
||||||
|
checkbox = "[x]" if checked else "[ ]"
|
||||||
|
indent = " " * nesting
|
||||||
|
# Re-render text without strikethrough for checked items
|
||||||
|
# to avoid redundant ~~text~~ alongside [x]
|
||||||
|
cb_text = (
|
||||||
|
_convert_paragraph_text(para, skip_strikethrough=True)
|
||||||
|
if checked
|
||||||
|
else text
|
||||||
|
)
|
||||||
|
lines.append(f"{indent}- {checkbox} {cb_text}")
|
||||||
|
elif _is_ordered_list(lists_meta, list_id, nesting):
|
||||||
key = (list_id, nesting)
|
key = (list_id, nesting)
|
||||||
ordered_counters[key] = ordered_counters.get(key, 0) + 1
|
ordered_counters[key] = ordered_counters.get(key, 0) + 1
|
||||||
counter = ordered_counters[key]
|
counter = ordered_counters[key]
|
||||||
@@ -102,16 +114,20 @@ def convert_doc_to_markdown(doc: dict[str, Any]) -> str:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def _convert_paragraph_text(
    para: dict[str, Any], skip_strikethrough: bool = False
) -> str:
    """Render the text runs of a paragraph as one inline markdown string.

    Args:
        para: Paragraph dict; only its ``"elements"`` list is read, and only
            elements carrying a ``"textRun"`` key contribute to the output.
        skip_strikethrough: Forwarded unchanged to ``_convert_text_run`` for
            every text run.

    Returns:
        The converted runs concatenated in order, with leading and trailing
        whitespace stripped.
    """
    rendered_runs = [
        _convert_text_run(element["textRun"], skip_strikethrough)
        for element in para.get("elements", [])
        if "textRun" in element
    ]
    return "".join(rendered_runs).strip()
|
||||||
|
|
||||||
|
|
||||||
def _convert_text_run(text_run: dict[str, Any]) -> str:
|
def _convert_text_run(
|
||||||
|
text_run: dict[str, Any], skip_strikethrough: bool = False
|
||||||
|
) -> str:
|
||||||
"""Convert a single text run to markdown."""
|
"""Convert a single text run to markdown."""
|
||||||
content = text_run.get("content", "")
|
content = text_run.get("content", "")
|
||||||
style = text_run.get("textStyle", {})
|
style = text_run.get("textStyle", {})
|
||||||
@@ -120,10 +136,12 @@ def _convert_text_run(text_run: dict[str, Any]) -> str:
|
|||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
return _apply_text_style(text, style)
|
return _apply_text_style(text, style, skip_strikethrough)
|
||||||
|
|
||||||
|
|
||||||
def _apply_text_style(text: str, style: dict[str, Any]) -> str:
|
def _apply_text_style(
|
||||||
|
text: str, style: dict[str, Any], skip_strikethrough: bool = False
|
||||||
|
) -> str:
|
||||||
"""Apply markdown formatting based on text style."""
|
"""Apply markdown formatting based on text style."""
|
||||||
link = style.get("link", {})
|
link = style.get("link", {})
|
||||||
url = link.get("url")
|
url = link.get("url")
|
||||||
@@ -143,7 +161,7 @@ def _apply_text_style(text: str, style: dict[str, Any]) -> str:
|
|||||||
elif italic:
|
elif italic:
|
||||||
text = f"*{text}*"
|
text = f"*{text}*"
|
||||||
|
|
||||||
if strikethrough:
|
if strikethrough and not skip_strikethrough:
|
||||||
text = f"~~{text}~~"
|
text = f"~~{text}~~"
|
||||||
|
|
||||||
if url:
|
if url:
|
||||||
@@ -163,6 +181,37 @@ def _is_ordered_list(lists_meta: dict[str, Any], list_id: str, nesting: int) ->
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _is_checklist(lists_meta: dict[str, Any], list_id: str, nesting: int) -> bool:
|
||||||
|
"""Check if a list at a given nesting level is a checklist.
|
||||||
|
|
||||||
|
Google Docs checklists are distinguished from regular bullet lists by having
|
||||||
|
GLYPH_TYPE_UNSPECIFIED with no glyphSymbol — the Docs UI renders interactive
|
||||||
|
checkboxes rather than a static glyph character.
|
||||||
|
"""
|
||||||
|
list_info = lists_meta.get(list_id, {})
|
||||||
|
nesting_levels = list_info.get("listProperties", {}).get("nestingLevels", [])
|
||||||
|
if nesting < len(nesting_levels):
|
||||||
|
level = nesting_levels[nesting]
|
||||||
|
glyph_type = level.get("glyphType", "")
|
||||||
|
has_glyph_symbol = "glyphSymbol" in level
|
||||||
|
return glyph_type in ("", "GLYPH_TYPE_UNSPECIFIED") and not has_glyph_symbol
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _is_checked(para: dict[str, Any]) -> bool:
|
||||||
|
"""Check if a checklist item is checked.
|
||||||
|
|
||||||
|
Google Docs marks checked checklist items by applying strikethrough
|
||||||
|
formatting to the paragraph text.
|
||||||
|
"""
|
||||||
|
for elem in para.get("elements", []):
|
||||||
|
if "textRun" in elem:
|
||||||
|
content = elem["textRun"].get("content", "").strip()
|
||||||
|
if content:
|
||||||
|
return elem["textRun"].get("textStyle", {}).get("strikethrough", False)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _convert_table(table: dict[str, Any]) -> str:
|
def _convert_table(table: dict[str, Any]) -> str:
|
||||||
"""Convert a table element to markdown."""
|
"""Convert a table element to markdown."""
|
||||||
rows = table.get("tableRows", [])
|
rows = table.get("tableRows", [])
|
||||||
|
|||||||
@@ -270,6 +270,69 @@ class TestLists:
|
|||||||
assert "- Item two" in md
|
assert "- Item two" in md
|
||||||
|
|
||||||
|
|
||||||
|
# Fixture: a minimal document dict containing a single checklist with one
# unchecked and one checked item. Consumed by TestChecklists.
CHECKLIST_DOC = {
    "title": "Checklist Test",
    "lists": {
        "kix.checklist001": {
            "listProperties": {
                "nestingLevels": [
                    # Unspecified glyph type and no glyphSymbol key: the shape
                    # the converter's checklist heuristic looks for.
                    {"glyphType": "GLYPH_TYPE_UNSPECIFIED"},
                ]
            }
        }
    },
    "body": {
        "content": [
            {"sectionBreak": {"sectionStyle": {}}},
            # Unchecked item: empty text style, i.e. no strikethrough.
            {
                "paragraph": {
                    "elements": [
                        {"textRun": {"content": "Buy groceries\n", "textStyle": {}}}
                    ],
                    "paragraphStyle": {"namedStyleType": "NORMAL_TEXT"},
                    "bullet": {"listId": "kix.checklist001", "nestingLevel": 0},
                }
            },
            # Checked item: the text run carries strikethrough.
            {
                "paragraph": {
                    "elements": [
                        {
                            "textRun": {
                                "content": "Walk the dog\n",
                                "textStyle": {"strikethrough": True},
                            }
                        }
                    ],
                    "paragraphStyle": {"namedStyleType": "NORMAL_TEXT"},
                    "bullet": {"listId": "kix.checklist001", "nestingLevel": 0},
                }
            },
        ]
    },
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestChecklists:
    """Checklist paragraphs render as markdown task-list items."""

    def test_unchecked(self):
        """The item without strikethrough gets an empty checkbox."""
        rendered = convert_doc_to_markdown(CHECKLIST_DOC)
        assert "- [ ] Buy groceries" in rendered

    def test_checked(self):
        """The struck-through item gets a ticked checkbox."""
        rendered = convert_doc_to_markdown(CHECKLIST_DOC)
        assert "- [x] Walk the dog" in rendered

    def test_checked_no_strikethrough(self):
        """Checked items should not have redundant ~~strikethrough~~ markdown."""
        rendered = convert_doc_to_markdown(CHECKLIST_DOC)
        assert "~~Walk the dog~~" not in rendered

    def test_regular_bullet_not_checklist(self):
        """Bullet lists with glyphSymbol should remain as plain bullets."""
        rendered = convert_doc_to_markdown(LIST_DOC)
        # Neither checkbox marker may leak into a plain bullet list.
        for marker in ("[ ]", "[x]"):
            assert marker not in rendered
|
||||||
|
|
||||||
|
|
||||||
class TestEmptyDoc:
|
class TestEmptyDoc:
|
||||||
def test_empty(self):
|
def test_empty(self):
|
||||||
md = convert_doc_to_markdown({"title": "Empty", "body": {"content": []}})
|
md = convert_doc_to_markdown({"title": "Empty", "body": {"content": []}})
|
||||||
|
|||||||
Reference in New Issue
Block a user