Skip to content

Commit 4851c4a

Browse files
authored
feat: support GFM table of markdown (#345)
1 parent 7d660c6 commit 4851c4a

File tree

6 files changed

+372
-4
lines changed

6 files changed

+372
-4
lines changed

.vscode/settings.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
"frombytes",
3636
"gundam",
3737
"LOCALAPPDATA",
38+
"markdownify",
3839
"MATHML",
3940
"mediabox",
4041
"metas",
@@ -55,6 +56,7 @@
5556
"pypdf",
5657
"pytest",
5758
"REGX",
59+
"rowspan",
5860
"SSOHH'mm",
5961
"tagfilter",
6062
"tiktoken",

pdf_craft/markdown/render/layouts.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
RefIdMap,
1414
)
1515
from ..paragraph import render_markdown_paragraph
16+
from .table import render_table_content
1617

1718
_MAX_TOC_LEVELS = 3
1819
_MAX_TITLE_LEVELS = 6
@@ -183,9 +184,13 @@ def _render_asset_content(
183184
if asset.content:
184185
if has_content_before:
185186
yield "\n\n"
186-
yield from render_markdown_paragraph(
187-
children=asset.content,
188-
render_payload=render_member,
187+
yield render_table_content(
188+
html_string="".join(
189+
render_markdown_paragraph(
190+
children=asset.content,
191+
render_payload=render_member,
192+
)
193+
)
189194
)
190195

191196
elif asset.ref == "image":

pdf_craft/markdown/render/table.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
from typing import TYPE_CHECKING
2+
3+
from markdownify import MarkdownConverter
4+
5+
if TYPE_CHECKING:
6+
from bs4 import Tag
7+
8+
9+
class _TableComplexityException(Exception):
    """Signal that an HTML table cannot be expressed as a GFM pipe table."""
11+
12+
13+
class _GFMTableConverter(MarkdownConverter):
    """
    Custom converter that detects complex table features.

    The base markdownify library will silently convert complex HTML tables to
    GFM pipe tables, but this loses information (e.g., merged cells become
    empty cells). This custom converter detects such cases and raises an
    exception to allow fallback to HTML.

    Background:
    - GFM markdown tables don't support colspan/rowspan
    - markdownify doesn't provide rowspan support (issue #121):
      https://github.com/matthewwithanm/python-markdownify/issues/121
    - Instead of silent data loss, we detect complexity and preserve HTML

    Raises _TableComplexityException when encountering:
    - colspan > 1
    - rowspan > 1, or rowspan == 0 (in HTML, "span all remaining rows of
      the row group", i.e. a merged cell)
    - colspan/rowspan values that are not integers
    - Multiple tbody sections
    """

    def __init__(self, **options):
        """Forward all options to MarkdownConverter and reset the tbody counter."""
        super().__init__(**options)
        # Number of direct <tbody> children of the most recently checked table.
        self._tbody_count = 0

    def convert_td(self, el: "Tag", text: str, parent_tags: set[str]) -> str:
        """Convert a <td> cell, rejecting it first if it spans several cells."""
        self._check_cell_complexity(el)
        return super().convert_td(el, text, parent_tags)  # type: ignore[attr-defined]

    def convert_th(self, el: "Tag", text: str, parent_tags: set[str]) -> str:
        """Convert a <th> cell, rejecting it first if it spans several cells."""
        self._check_cell_complexity(el)
        return super().convert_th(el, text, parent_tags)  # type: ignore[attr-defined]

    def convert_table(self, el: "Tag", text: str, parent_tags: set[str]) -> str:
        """Convert a <table>, rejecting it if it has more than one <tbody>."""
        # Only direct children are counted, so nested tables do not inflate it.
        self._tbody_count = len(el.find_all("tbody", recursive=False))
        if self._tbody_count > 1:
            raise _TableComplexityException(
                f"Table has {self._tbody_count} tbody sections (GFM only supports 1)"
            )
        return super().convert_table(el, text, parent_tags)  # type: ignore[attr-defined]

    def _check_cell_complexity(self, el: "Tag") -> None:
        """
        Raise _TableComplexityException if the cell merges rows or columns.

        A missing or empty colspan/rowspan attribute counts as 1.
        """
        colspan = self._parse_span(el, "colspan")
        if colspan > 1:
            raise _TableComplexityException(
                f"Table has colspan={colspan} (GFM doesn't support colspan)"
            )

        rowspan = self._parse_span(el, "rowspan")
        # rowspan="0" is not "no span": per the HTML Standard it extends the
        # cell to the end of its row group, so it is a merged cell as well.
        if rowspan > 1 or rowspan == 0:
            raise _TableComplexityException(
                f"Table has rowspan={rowspan} (GFM doesn't support rowspan)"
            )

    @staticmethod
    def _parse_span(el: "Tag", attribute: str) -> int:
        """Return the integer value of a span attribute, defaulting to 1."""
        raw = el.get(attribute, "1")
        if not raw:
            return 1
        try:
            return int(str(raw))
        except ValueError as error:
            raise _TableComplexityException(
                "Table has invalid colspan/rowspan values"
            ) from error
74+
75+
76+
def render_table_content(html_string: str) -> str:
    """
    Render HTML table markup as a GFM pipe table when that is lossless.

    The markup is run through _GFMTableConverter with ATX-style headings and
    the stripped result is returned. If the converter raises
    _TableComplexityException, the original HTML string is returned unchanged.
    """
    try:
        markdown = _GFMTableConverter(heading_style="ATX").convert(html_string)
    except _TableComplexityException:
        # Keep the raw HTML rather than emit a pipe table that drops cells.
        return html_string
    return markdown.strip()

poetry.lock

Lines changed: 52 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ doc-page-extractor = "==1.0.12"
4040
epub-generator = "==0.1.7"
4141
pylatexenc = "^2.10"
4242
pyahocorasick = "^2.2.0"
43+
markdownify = "^1.2.2"
4344

4445
[tool.poetry.group.dev.dependencies]
4546
pylint = "^3.3.7"

0 commit comments

Comments
 (0)