Skip to content

Commit caa8aee

Browse files
maxmnemonic and Maksym Lysak authored
feat: Chart tabular data serialization for HTML serializer (#258)
* Add chart tabular data serialization to HTML serializer

  Signed-off-by: Maksym Lysak <[email protected]>

* Fixing pre-commit issues

  Signed-off-by: Maksym Lysak <[email protected]>

* Fixed table serialization for tabular chart data

  Signed-off-by: Maksym Lysak <[email protected]>

* test for loading doctags with chart data

  Signed-off-by: Maksym Lysak <[email protected]>

* Improved loading doctags test with chart example, added tests for chart serialization into html and md

  Signed-off-by: Maksym Lysak <[email protected]>

---------

Signed-off-by: Maksym Lysak <[email protected]>
Co-authored-by: Maksym Lysak <[email protected]>
1 parent 64bafa1 commit caa8aee

File tree

9 files changed

+212
-1
lines changed

9 files changed

+212
-1
lines changed

docling_core/experimental/serializer/html.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
NodeItem,
5858
OrderedList,
5959
PictureItem,
60+
PictureTabularChartData,
6061
SectionHeaderItem,
6162
TableCell,
6263
TableItem,
@@ -104,6 +105,9 @@ class HTMLParams(CommonParams):
104105
# Allow for different output styles
105106
output_style: HTMLOutputStyle = HTMLOutputStyle.SINGLE_COLUMN
106107

108+
# Enable charts to be printed into HTML as tables
109+
enable_chart_tables: bool = True
110+
107111

108112
class HTMLTextSerializer(BaseModel, BaseTextSerializer):
109113
"""HTML-specific text item serializer."""
@@ -402,9 +406,28 @@ def serialize(
402406
and item.image.uri.scheme == "data"
403407
):
404408
img_text = f'<img src="{quote(str(item.image.uri))}">'
409+
405410
if img_text:
406411
res_parts.append(create_ser_result(text=img_text, span_source=item))
407412

413+
if params.enable_chart_tables:
414+
# Check if picture has attached PictureTabularChartData
415+
tabular_chart_annotations = [
416+
ann
417+
for ann in item.annotations
418+
if isinstance(ann, PictureTabularChartData)
419+
]
420+
if len(tabular_chart_annotations) > 0:
421+
temp_doc = DoclingDocument(name="temp")
422+
temp_table = temp_doc.add_table(
423+
data=tabular_chart_annotations[0].chart_data
424+
)
425+
html_table_content = temp_table.export_to_html(temp_doc)
426+
if len(html_table_content) > 0:
427+
res_parts.append(
428+
create_ser_result(text=html_table_content, span_source=item)
429+
)
430+
408431
text_res = "".join([r.text for r in res_parts])
409432
if text_res:
410433
text_res = f"<figure>{text_res}</figure>"

docling_core/types/doc/document.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3143,6 +3143,7 @@ def export_to_html( # noqa: C901
31433143
from_element: int = 0,
31443144
to_element: int = sys.maxsize,
31453145
labels: Optional[set[DocItemLabel]] = None,
3146+
enable_chart_tables: bool = True,
31463147
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
31473148
formula_to_mathml: bool = True,
31483149
page_no: Optional[int] = None,
@@ -3176,6 +3177,7 @@ def export_to_html( # noqa: C901
31763177
start_idx=from_element,
31773178
stop_idx=to_element,
31783179
image_mode=image_mode,
3180+
enable_chart_tables=enable_chart_tables,
31793181
formula_to_mathml=formula_to_mathml,
31803182
html_head=html_head,
31813183
html_lang=html_lang,

test/data/doc/barchart.dt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<doctag><page_header><loc_71><loc_14><loc_217><loc_20>Probability, Combinatorics and Control</page_header>
2+
<chart><loc_102><loc_37><loc_392><loc_148><bar_chart><ched>Number of impellers<ched>single-frequency<ched>multi-frequency<nl><fcel>1<fcel>0.06<fcel>0.16<nl><fcel>2<fcel>0.12<fcel>0.26<nl><fcel>3<fcel>0.16<fcel>0.27<nl><fcel>4<fcel>0.14<fcel>0.26<nl><fcel>5<fcel>0.16<fcel>0.25<nl><fcel>6<fcel>0.24<fcel>0.24<nl></chart>
3+
</doctag>

test/data/doc/barchart.gt.html

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Document</title>
6+
<meta name="generator" content="Docling HTML Serializer">
7+
<style>
8+
html {
9+
background-color: #f5f5f5;
10+
font-family: Arial, sans-serif;
11+
line-height: 1.6;
12+
}
13+
body {
14+
max-width: 800px;
15+
margin: 0 auto;
16+
padding: 2rem;
17+
background-color: white;
18+
box-shadow: 0 0 10px rgba(0,0,0,0.1);
19+
}
20+
h1, h2, h3, h4, h5, h6 {
21+
color: #333;
22+
margin-top: 1.5em;
23+
margin-bottom: 0.5em;
24+
}
25+
h1 {
26+
font-size: 2em;
27+
border-bottom: 1px solid #eee;
28+
padding-bottom: 0.3em;
29+
}
30+
table {
31+
border-collapse: collapse;
32+
margin: 1em 0;
33+
width: 100%;
34+
}
35+
th, td {
36+
border: 1px solid #ddd;
37+
padding: 8px;
38+
text-align: left;
39+
}
40+
th {
41+
background-color: #f2f2f2;
42+
font-weight: bold;
43+
}
44+
figure {
45+
margin: 1.5em 0;
46+
text-align: center;
47+
}
48+
figcaption {
49+
color: #666;
50+
font-style: italic;
51+
margin-top: 0.5em;
52+
}
53+
img {
54+
max-width: 100%;
55+
height: auto;
56+
}
57+
pre {
58+
background-color: #f6f8fa;
59+
border-radius: 3px;
60+
padding: 1em;
61+
overflow: auto;
62+
}
63+
code {
64+
font-family: monospace;
65+
background-color: #f6f8fa;
66+
padding: 0.2em 0.4em;
67+
border-radius: 3px;
68+
}
69+
pre code {
70+
background-color: transparent;
71+
padding: 0;
72+
}
73+
.formula {
74+
text-align: center;
75+
padding: 0.5em;
76+
margin: 1em 0;
77+
background-color: #f9f9f9;
78+
}
79+
.formula-not-decoded {
80+
text-align: center;
81+
padding: 0.5em;
82+
margin: 1em 0;
83+
background: repeating-linear-gradient(
84+
45deg,
85+
#f0f0f0,
86+
#f0f0f0 10px,
87+
#f9f9f9 10px,
88+
#f9f9f9 20px
89+
);
90+
}
91+
.page-break {
92+
page-break-after: always;
93+
border-top: 1px dashed #ccc;
94+
margin: 2em 0;
95+
}
96+
.key-value-region {
97+
background-color: #f9f9f9;
98+
padding: 1em;
99+
border-radius: 4px;
100+
margin: 1em 0;
101+
}
102+
.key-value-region dt {
103+
font-weight: bold;
104+
}
105+
.key-value-region dd {
106+
margin-left: 1em;
107+
margin-bottom: 0.5em;
108+
}
109+
.form-container {
110+
border: 1px solid #ddd;
111+
padding: 1em;
112+
border-radius: 4px;
113+
margin: 1em 0;
114+
}
115+
.form-item {
116+
margin-bottom: 0.5em;
117+
}
118+
.image-classification {
119+
font-size: 0.9em;
120+
color: #666;
121+
margin-top: 0.5em;
122+
}
123+
</style>
124+
</head>
125+
<body>
126+
<div class='page'>
127+
<figure><table><tbody><tr><td>Number of impellers</td><td>single-frequency</td><td>multi-frequency</td></tr><tr><td>1</td><td>0.06</td><td>0.16</td></tr><tr><td>2</td><td>0.12</td><td>0.26</td></tr><tr><td>3</td><td>0.16</td><td>0.27</td></tr><tr><td>4</td><td>0.14</td><td>0.26</td></tr><tr><td>5</td><td>0.16</td><td>0.25</td></tr><tr><td>6</td><td>0.24</td><td>0.24</td></tr></tbody></table></figure>
128+
</div>
129+
</body>
130+
</html>

test/data/doc/barchart.gt.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<!-- image -->
2+
3+
| Number of impellers | single-frequency | multi-frequency |
4+
|-----------------------|--------------------|-------------------|
5+
| 1 | 0.06 | 0.16 |
6+
| 2 | 0.12 | 0.26 |
7+
| 3 | 0.16 | 0.27 |
8+
| 4 | 0.14 | 0.26 |
9+
| 5 | 0.16 | 0.25 |
10+
| 6 | 0.24 | 0.24 |

test/data/doc/barchart.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

test/data/doc/barchart.png

120 KB
Loading

test/test_doctags_load.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from PIL import Image as PILImage
44

55
from docling_core.types.doc import DoclingDocument
6-
from docling_core.types.doc.document import DocTagsDocument
6+
from docling_core.types.doc.document import DocTagsDocument, PictureTabularChartData
77

88

99
def test_doctags_load_from_files():
@@ -55,6 +55,20 @@ def test_multipage_doctags_load():
5555
# print(doc.export_to_html())
5656

5757

58+
def test_doctags_chart():
    """Check that a DocTags chart is loaded with tabular chart data attached.

    Loads the ``barchart.dt`` / ``barchart.png`` pair into a fresh document,
    then asserts that at least one picture was produced and that every
    picture carries at least one ``PictureTabularChartData`` annotation.
    """
    doc = DoclingDocument(name="Document")
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
        [Path("test/data/doc/barchart.dt")],
        [Path("test/data/doc/barchart.png")],
    )
    doc.load_from_doctags(doctags_doc)

    # Guard against a vacuous pass: with no pictures the loop below would
    # never execute and the test would succeed without checking anything.
    assert len(doc.pictures) > 0, "no pictures were loaded from the chart doctags"

    for pic in doc.pictures:
        tabular_chart_annotations = [
            ann for ann in pic.annotations if isinstance(ann, PictureTabularChartData)
        ]
        assert len(tabular_chart_annotations) > 0
70+
71+
5872
def test_doctags_table_provenances_and_captions():
5973
doc = DoclingDocument(name="Document")
6074
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(

test/test_serialization.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,34 @@ def test_md_cross_page_list_page_break_p2():
6464
verify(exp_file=src.parent / f"{src.stem}_p2.gt.md", actual=actual)
6565

6666

67+
def test_html_charts():
    """Serialize the bar-chart fixture to HTML and check it against ``barchart.gt.html``.

    The document is loaded from ``barchart.json`` and serialized with
    placeholder image mode; the resulting text is handed to ``verify``
    together with the sibling ``.gt.html`` file.
    """
    json_path = Path("./test/data/doc/barchart.json")
    chart_doc = DoclingDocument.load_from_json(json_path)

    html_params = HTMLParams(image_mode=ImageRefMode.PLACEHOLDER)
    serializer = HTMLDocSerializer(doc=chart_doc, params=html_params)
    html_text = serializer.serialize().text

    expected_path = json_path.parent / f"{json_path.stem}.gt.html"
    verify(exp_file=expected_path, actual=html_text)
79+
80+
81+
def test_md_charts():
    """Serialize the bar-chart fixture to Markdown and check it against ``barchart.gt.md``.

    The document is loaded from ``barchart.json`` and serialized with
    placeholder image mode; the resulting text is handed to ``verify``
    together with the sibling ``.gt.md`` file.
    """
    json_path = Path("./test/data/doc/barchart.json")
    chart_doc = DoclingDocument.load_from_json(json_path)

    md_params = MarkdownParams(image_mode=ImageRefMode.PLACEHOLDER)
    serializer = MarkdownDocSerializer(doc=chart_doc, params=md_params)
    md_text = serializer.serialize().text

    expected_path = json_path.parent / f"{json_path.stem}.gt.md"
    verify(exp_file=expected_path, actual=md_text)
93+
94+
6795
def test_html_cross_page_list_page_break():
6896
src = Path("./test/data/doc/activities.json")
6997
doc = DoclingDocument.load_from_json(src)

0 commit comments

Comments
 (0)