Skip to content

Commit 9705f40

Browse files
maxmnemonic and Maksym Lysak authored
fix: Proper heading support in rich tables for HTML backend (#2394)
* Fix proper heading support in rich tables in the HTML backend

  Signed-off-by: Maksym Lysak <[email protected]>

* Clean up

  Signed-off-by: Maksym Lysak <[email protected]>

* Restore compatibility with older Python versions

  Signed-off-by: Maksym Lysak <[email protected]>

* Fix the "furniture before the first heading" rule

  Signed-off-by: Maksym Lysak <[email protected]>

* Add a minimal test case

  Signed-off-by: Maksym Lysak <[email protected]>

* Add the HTML file for the test

  Signed-off-by: Maksym Lysak <[email protected]>

---------

Signed-off-by: Maksym Lysak <[email protected]>
Co-authored-by: Maksym Lysak <[email protected]>
1 parent 8a4b946 commit 9705f40

File tree

5 files changed

+267
-15
lines changed

5 files changed

+267
-15
lines changed

docling/backend/html_backend.py

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -272,9 +272,19 @@ def convert(self) -> DoclingDocument:
272272
for br in content("br"):
273273
br.replace_with(NavigableString("\n"))
274274
# set default content layer
275-
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
275+
276+
# Furniture before the first heading rule, except for headers in tables
277+
header = None
278+
# Find all headers first
279+
all_headers = content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
280+
# Keep only those that do NOT have a <table> in a parent chain
281+
clean_headers = [h for h in all_headers if not h.find_parent("table")]
282+
# Pick the first header from the remaining
283+
if len(clean_headers):
284+
header = clean_headers[0]
285+
# Set starting content layer
276286
self.content_layer = (
277-
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
287+
ContentLayer.BODY if header is None else ContentLayer.FURNITURE
278288
)
279289
# reset context
280290
self.ctx = _Context()
@@ -309,9 +319,11 @@ def process_rich_table_cells(
309319
group_name: str,
310320
doc: DoclingDocument,
311321
docling_table: TableItem,
312-
) -> tuple[bool, RefItem]:
322+
) -> tuple[bool, Union[RefItem, None]]:
313323
rich_table_cell = False
314-
ref_for_rich_cell = provs_in_cell[0]
324+
ref_for_rich_cell = None
325+
if len(provs_in_cell) > 0:
326+
ref_for_rich_cell = provs_in_cell[0]
315327
if len(provs_in_cell) > 1:
316328
# Cell has multiple elements, we need to group them
317329
rich_table_cell = True
@@ -324,7 +336,10 @@ def process_rich_table_cells(
324336
if isinstance(pr_item, TextItem):
325337
# Cell has only one element and it's just a text
326338
rich_table_cell = False
327-
doc.delete_items(node_items=[pr_item])
339+
try:
340+
doc.delete_items(node_items=[pr_item])
341+
except Exception as e:
342+
_log.error(f"Error while making rich table: {e}.")
328343
else:
329344
rich_table_cell = True
330345
ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
@@ -391,17 +406,19 @@ def parse_table_data(
391406

392407
provs_in_cell: list[RefItem] = []
393408
# Parse table cell sub-tree for Rich Cells content:
409+
table_level = self.level
394410
provs_in_cell = self._walk(html_cell, doc)
411+
# After walking sub-tree in cell, restore previously set level
412+
self.level = table_level
395413

396414
rich_table_cell = False
397415
ref_for_rich_cell = None
398-
if len(provs_in_cell) > 0:
399-
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
400-
rich_table_cell, ref_for_rich_cell = (
401-
HTMLDocumentBackend.process_rich_table_cells(
402-
provs_in_cell, group_name, doc, docling_table
403-
)
416+
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
417+
rich_table_cell, ref_for_rich_cell = (
418+
HTMLDocumentBackend.process_rich_table_cells(
419+
provs_in_cell, group_name, doc, docling_table
404420
)
421+
)
405422

406423
# Extracting text
407424
text = self.get_text(html_cell).strip()
@@ -774,13 +791,15 @@ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
774791
for key in self.parents.keys():
775792
self.parents[key] = None
776793
self.level = 0
777-
docling_title = self.parents[self.level + 1] = doc.add_title(
794+
self.parents[self.level + 1] = doc.add_title(
778795
text_clean,
779796
content_layer=self.content_layer,
780797
formatting=annotated_text.formatting,
781798
hyperlink=annotated_text.hyperlink,
782799
)
783-
added_ref = [docling_title.get_ref()]
800+
p1 = self.parents[self.level + 1]
801+
if p1 is not None:
802+
added_ref = [p1.get_ref()]
784803
# the other levels need to be lowered by 1 if a title was set
785804
else:
786805
level -= 1
@@ -802,7 +821,7 @@ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
802821
_log.debug(f"Remove the tail of level {key}")
803822
self.parents[key] = None
804823
self.level = level
805-
docling_heading = self.parents[self.level + 1] = doc.add_heading(
824+
self.parents[self.level + 1] = doc.add_heading(
806825
parent=self.parents[self.level],
807826
text=text_clean,
808827
orig=annotated_text.text,
@@ -811,7 +830,9 @@ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
811830
formatting=annotated_text.formatting,
812831
hyperlink=annotated_text.hyperlink,
813832
)
814-
added_ref = [docling_heading.get_ref()]
833+
p2 = self.parents[self.level + 1]
834+
if p2 is not None:
835+
added_ref = [p2.get_ref()]
815836
self.level += 1
816837
for img_tag in tag("img"):
817838
if isinstance(img_tag, Tag):
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
item-0 at level 0: unspecified: group _root_
2+
item-1 at level 1: text: Before tha table
3+
item-2 at level 1: table with [2x2]
4+
item-3 at level 1: text: After the table
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
{
2+
"schema_name": "DoclingDocument",
3+
"version": "1.7.0",
4+
"name": "table_with_heading",
5+
"origin": {
6+
"mimetype": "text/html",
7+
"binary_hash": 5578561753677933781,
8+
"filename": "table_with_heading.html"
9+
},
10+
"furniture": {
11+
"self_ref": "#/furniture",
12+
"children": [],
13+
"content_layer": "furniture",
14+
"name": "_root_",
15+
"label": "unspecified"
16+
},
17+
"body": {
18+
"self_ref": "#/body",
19+
"children": [
20+
{
21+
"$ref": "#/texts/0"
22+
},
23+
{
24+
"$ref": "#/tables/0"
25+
},
26+
{
27+
"$ref": "#/texts/1"
28+
}
29+
],
30+
"content_layer": "body",
31+
"name": "_root_",
32+
"label": "unspecified"
33+
},
34+
"groups": [],
35+
"texts": [
36+
{
37+
"self_ref": "#/texts/0",
38+
"parent": {
39+
"$ref": "#/body"
40+
},
41+
"children": [],
42+
"content_layer": "body",
43+
"label": "text",
44+
"prov": [],
45+
"orig": "Before tha table",
46+
"text": "Before tha table"
47+
},
48+
{
49+
"self_ref": "#/texts/1",
50+
"parent": {
51+
"$ref": "#/body"
52+
},
53+
"children": [],
54+
"content_layer": "body",
55+
"label": "text",
56+
"prov": [],
57+
"orig": "After the table",
58+
"text": "After the table"
59+
}
60+
],
61+
"pictures": [],
62+
"tables": [
63+
{
64+
"self_ref": "#/tables/0",
65+
"parent": {
66+
"$ref": "#/body"
67+
},
68+
"children": [],
69+
"content_layer": "body",
70+
"label": "table",
71+
"prov": [],
72+
"captions": [],
73+
"references": [],
74+
"footnotes": [],
75+
"data": {
76+
"table_cells": [
77+
{
78+
"row_span": 1,
79+
"col_span": 1,
80+
"start_row_offset_idx": 0,
81+
"end_row_offset_idx": 1,
82+
"start_col_offset_idx": 0,
83+
"end_col_offset_idx": 1,
84+
"text": "A",
85+
"column_header": false,
86+
"row_header": false,
87+
"row_section": false,
88+
"fillable": false
89+
},
90+
{
91+
"row_span": 1,
92+
"col_span": 1,
93+
"start_row_offset_idx": 0,
94+
"end_row_offset_idx": 1,
95+
"start_col_offset_idx": 1,
96+
"end_col_offset_idx": 2,
97+
"text": "B",
98+
"column_header": false,
99+
"row_header": false,
100+
"row_section": false,
101+
"fillable": false
102+
},
103+
{
104+
"row_span": 1,
105+
"col_span": 1,
106+
"start_row_offset_idx": 1,
107+
"end_row_offset_idx": 2,
108+
"start_col_offset_idx": 0,
109+
"end_col_offset_idx": 1,
110+
"text": "1...",
111+
"column_header": false,
112+
"row_header": false,
113+
"row_section": false,
114+
"fillable": false
115+
},
116+
{
117+
"row_span": 1,
118+
"col_span": 1,
119+
"start_row_offset_idx": 1,
120+
"end_row_offset_idx": 2,
121+
"start_col_offset_idx": 1,
122+
"end_col_offset_idx": 2,
123+
"text": "2...",
124+
"column_header": false,
125+
"row_header": false,
126+
"row_section": false,
127+
"fillable": false
128+
}
129+
],
130+
"num_rows": 2,
131+
"num_cols": 2,
132+
"grid": [
133+
[
134+
{
135+
"row_span": 1,
136+
"col_span": 1,
137+
"start_row_offset_idx": 0,
138+
"end_row_offset_idx": 1,
139+
"start_col_offset_idx": 0,
140+
"end_col_offset_idx": 1,
141+
"text": "A",
142+
"column_header": false,
143+
"row_header": false,
144+
"row_section": false,
145+
"fillable": false
146+
},
147+
{
148+
"row_span": 1,
149+
"col_span": 1,
150+
"start_row_offset_idx": 0,
151+
"end_row_offset_idx": 1,
152+
"start_col_offset_idx": 1,
153+
"end_col_offset_idx": 2,
154+
"text": "B",
155+
"column_header": false,
156+
"row_header": false,
157+
"row_section": false,
158+
"fillable": false
159+
}
160+
],
161+
[
162+
{
163+
"row_span": 1,
164+
"col_span": 1,
165+
"start_row_offset_idx": 1,
166+
"end_row_offset_idx": 2,
167+
"start_col_offset_idx": 0,
168+
"end_col_offset_idx": 1,
169+
"text": "1...",
170+
"column_header": false,
171+
"row_header": false,
172+
"row_section": false,
173+
"fillable": false
174+
},
175+
{
176+
"row_span": 1,
177+
"col_span": 1,
178+
"start_row_offset_idx": 1,
179+
"end_row_offset_idx": 2,
180+
"start_col_offset_idx": 1,
181+
"end_col_offset_idx": 2,
182+
"text": "2...",
183+
"column_header": false,
184+
"row_header": false,
185+
"row_section": false,
186+
"fillable": false
187+
}
188+
]
189+
]
190+
},
191+
"annotations": []
192+
}
193+
],
194+
"key_value_items": [],
195+
"form_items": [],
196+
"pages": {}
197+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Before tha table
2+
3+
| A | B |
4+
|------|------|
5+
| 1... | 2... |
6+
7+
After the table
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<html>
2+
<head>
3+
<style>
4+
table, th, td {border: 1px solid black; border-collapse: collapse;}
5+
td {padding:30px;}
6+
table {margin: 30px;}
7+
</style>
8+
</head>
9+
<body>
10+
<p>Before tha table</p>
11+
<table>
12+
<tr>
13+
<td><h1>A</h1></td>
14+
<td>B</td>
15+
</tr>
16+
<tr>
17+
<td>1...</td>
18+
<td>2...</td>
19+
</tr>
20+
</table>
21+
After the table
22+
</body>
23+
</html>

0 commit comments

Comments
 (0)