Skip to content

Commit 2dc57c1

Browse files
authored
fix: fix handling of generic groups in rich table cells (#383)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent b60ac19 commit 2dc57c1

File tree

10 files changed: 203 additions and 21 deletions.

docling_core/transforms/serializer/common.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -394,6 +394,7 @@ def serialize(
394394
item=item,
395395
doc_serializer=self,
396396
doc=self.doc,
397+
visited=my_visited,
397398
**my_kwargs,
398399
)
399400
return part

docling_core/transforms/serializer/doctags.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
DoclingDocument,
3333
FloatingItem,
3434
FormItem,
35+
GroupItem,
3536
InlineGroup,
3637
KeyValueItem,
3738
ListGroup,
@@ -516,7 +517,12 @@ def serialize(
516517
**kwargs: Any,
517518
) -> SerializationResult:
518519
"""Serializes the passed item."""
519-
return create_ser_result()
520+
if isinstance(item, GroupItem):
521+
parts = doc_serializer.get_parts(item=item, **kwargs)
522+
text_res = "\n".join([p.text for p in parts if p.text])
523+
return create_ser_result(text=text_res, span_source=parts)
524+
else:
525+
return create_ser_result()
520526

521527

522528
class DocTagsAnnotationSerializer(BaseAnnotationSerializer):

docling_core/transforms/serializer/html.py

Lines changed: 16 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@
5555
FormItem,
5656
FormulaItem,
5757
GraphData,
58+
GroupItem,
5859
ImageRef,
5960
InlineGroup,
6061
KeyValueItem,
@@ -787,21 +788,30 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
787788
"""HTML-specific fallback serializer."""
788789

789790
@override
790-
def serialize(self, *, item: NodeItem, **kwargs: Any) -> SerializationResult:
791+
def serialize(
792+
self,
793+
*,
794+
item: NodeItem,
795+
doc_serializer: "BaseDocSerializer",
796+
doc: DoclingDocument,
797+
**kwargs: Any,
798+
) -> SerializationResult:
791799
"""Fallback serializer for items not handled by other serializers."""
792-
if isinstance(item, DocItem):
800+
if isinstance(item, GroupItem):
801+
parts = doc_serializer.get_parts(item=item, **kwargs)
802+
text_res = "\n".join([p.text for p in parts if p.text])
803+
return create_ser_result(text=text_res, span_source=parts)
804+
else:
793805
return create_ser_result(
794806
text=f"<!-- Unhandled item type: {item.__class__.__name__} -->",
795-
span_source=item,
807+
span_source=item if isinstance(item, DocItem) else [],
796808
)
797-
else:
798-
# For group items, we don't generate any markup
799-
return create_ser_result()
800809

801810

802811
class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
803812
"""HTML-specific annotation serializer."""
804813

814+
@override
805815
def serialize(
806816
self,
807817
*,

docling_core/transforms/serializer/markdown.py

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@
4545
Formatting,
4646
FormItem,
4747
FormulaItem,
48+
GroupItem,
4849
ImageRef,
4950
InlineGroup,
5051
KeyValueItem,
@@ -599,13 +600,15 @@ def serialize(
599600
**kwargs: Any,
600601
) -> SerializationResult:
601602
"""Serializes the passed item."""
602-
if isinstance(item, DocItem):
603+
if isinstance(item, GroupItem):
604+
parts = doc_serializer.get_parts(item=item, **kwargs)
605+
text_res = "\n\n".join([p.text for p in parts if p.text])
606+
return create_ser_result(text=text_res, span_source=parts)
607+
else:
603608
return create_ser_result(
604609
text="<!-- missing-text -->",
605-
span_source=item,
610+
span_source=item if isinstance(item, DocItem) else [],
606611
)
607-
else:
608-
return create_ser_result()
609612

610613

611614
class MarkdownDocSerializer(DocSerializer):

test/data/doc/rich_table.gt.html

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,8 @@ <h1>Rich tables</h1>
128128
<table><tbody><tr><td>cell 0,0</td><td>cell 0,1</td></tr><tr><td>cell 1,0</td><td><em><p>text in italic</p></em></td></tr><tr><td><ul>
129129
<li>list item 1</li>
130130
<li>list item 2</li>
131-
</ul></td><td>cell 2,1</td></tr><tr><td>cell 3,0</td><td><table><tbody><tr><td>inner cell 0,0</td><td>inner cell 0,1</td><td>inner cell 0,2</td></tr><tr><td>inner cell 1,0</td><td>inner cell 1,1</td><td>inner cell 1,2</td></tr></tbody></table></td></tr></tbody></table>
131+
</ul></td><td>cell 2,1</td></tr><tr><td>cell 3,0</td><td><table><tbody><tr><td>inner cell 0,0</td><td>inner cell 0,1</td><td>inner cell 0,2</td></tr><tr><td>inner cell 1,0</td><td>inner cell 1,1</td><td>inner cell 1,2</td></tr></tbody></table></td></tr><tr><td><p>Some text in a generic group.</p>
132+
<p>More text in the group.</p></td><td>cell 4,1</td></tr></tbody></table>
132133
</div>
133134
</body>
134135
</html>

test/data/doc/rich_table.gt.md

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
11
# Rich tables
22

3-
| cell 0,0 | cell 0,1 |
4-
|-----------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
5-
| cell 1,0 | *text in italic* |
6-
| - list item 1 - list item 2 | cell 2,1 |
7-
| cell 3,0 | | inner cell 0,0 | inner cell 0,1 | inner cell 0,2 | |------------------|------------------|------------------| | inner cell 1,0 | inner cell 1,1 | inner cell 1,2 | |
3+
| cell 0,0 | cell 0,1 |
4+
|--------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
5+
| cell 1,0 | *text in italic* |
6+
| - list item 1 - list item 2 | cell 2,1 |
7+
| cell 3,0 | | inner cell 0,0 | inner cell 0,1 | inner cell 0,2 | |------------------|------------------|------------------| | inner cell 1,0 | inner cell 1,1 | inner cell 1,2 | |
8+
| Some text in a generic group. More text in the group. | cell 4,1 |

test/data/doc/rich_table.out.dt

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
<doctag><title>Rich tables</title>
22
<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel><text>text in italic</text><nl><fcel><unordered_list><list_item>list item 1</list_item>
33
<list_item>list item 2</list_item>
4-
</unordered_list><fcel>cell 2,1<nl><fcel>cell 3,0<fcel><otsl><fcel>inner cell 0,0<fcel>inner cell 0,1<fcel>inner cell 0,2<nl><fcel>inner cell 1,0<fcel>inner cell 1,1<fcel>inner cell 1,2<nl></otsl><nl></otsl>
4+
</unordered_list><fcel>cell 2,1<nl><fcel>cell 3,0<fcel><otsl><fcel>inner cell 0,0<fcel>inner cell 0,1<fcel>inner cell 0,2<nl><fcel>inner cell 1,0<fcel>inner cell 1,1<fcel>inner cell 1,2<nl></otsl><nl><fcel><text>Some text in a generic group.</text>
5+
<text>More text in the group.</text><fcel>cell 4,1<nl></otsl>
56
</doctag>

test/data/doc/rich_table.out.yaml

Lines changed: 71 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,15 @@ groups:
2323
parent:
2424
$ref: '#/tables/0'
2525
self_ref: '#/groups/0'
26+
- children:
27+
- $ref: '#/texts/4'
28+
- $ref: '#/texts/5'
29+
content_layer: body
30+
label: unspecified
31+
name: group
32+
parent:
33+
$ref: '#/tables/0'
34+
self_ref: '#/groups/1'
2635
key_value_items: []
2736
name: ''
2837
pages: {}
@@ -35,6 +44,7 @@ tables:
3544
- $ref: '#/texts/1'
3645
- $ref: '#/groups/0'
3746
- $ref: '#/tables/1'
47+
- $ref: '#/groups/1'
3848
content_layer: body
3949
data:
4050
grid:
@@ -118,8 +128,28 @@ tables:
118128
start_col_offset_idx: 1
119129
start_row_offset_idx: 3
120130
text: ''
131+
- - col_span: 1
132+
column_header: false
133+
end_col_offset_idx: 1
134+
end_row_offset_idx: 5
135+
row_header: false
136+
row_section: false
137+
row_span: 1
138+
start_col_offset_idx: 0
139+
start_row_offset_idx: 4
140+
text: ''
141+
- col_span: 1
142+
column_header: false
143+
end_col_offset_idx: 2
144+
end_row_offset_idx: 5
145+
row_header: false
146+
row_section: false
147+
row_span: 1
148+
start_col_offset_idx: 1
149+
start_row_offset_idx: 4
150+
text: cell 4,1
121151
num_cols: 2
122-
num_rows: 4
152+
num_rows: 5
123153
table_cells:
124154
- col_span: 1
125155
column_header: false
@@ -207,6 +237,28 @@ tables:
207237
start_col_offset_idx: 1
208238
start_row_offset_idx: 3
209239
text: ''
240+
- col_span: 1
241+
column_header: false
242+
end_col_offset_idx: 1
243+
end_row_offset_idx: 5
244+
ref:
245+
$ref: '#/groups/1'
246+
row_header: false
247+
row_section: false
248+
row_span: 1
249+
start_col_offset_idx: 0
250+
start_row_offset_idx: 4
251+
text: ''
252+
- col_span: 1
253+
column_header: false
254+
end_col_offset_idx: 2
255+
end_row_offset_idx: 5
256+
row_header: false
257+
row_section: false
258+
row_span: 1
259+
start_col_offset_idx: 1
260+
start_row_offset_idx: 4
261+
text: cell 4,1
210262
footnotes: []
211263
label: table
212264
parent:
@@ -397,4 +449,22 @@ texts:
397449
prov: []
398450
self_ref: '#/texts/3'
399451
text: list item 2
452+
- children: []
453+
content_layer: body
454+
label: text
455+
orig: Some text in a generic group.
456+
parent:
457+
$ref: '#/groups/1'
458+
prov: []
459+
self_ref: '#/texts/4'
460+
text: Some text in a generic group.
461+
- children: []
462+
content_layer: body
463+
label: text
464+
orig: More text in the group.
465+
parent:
466+
$ref: '#/groups/1'
467+
prov: []
468+
self_ref: '#/texts/5'
469+
text: More text in the group.
400470
version: 1.6.0

test/data/doc/rich_table_post_text_del.out.yaml

Lines changed: 71 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,15 @@ groups:
2222
parent:
2323
$ref: '#/tables/0'
2424
self_ref: '#/groups/0'
25+
- children:
26+
- $ref: '#/texts/3'
27+
- $ref: '#/texts/4'
28+
content_layer: body
29+
label: unspecified
30+
name: group
31+
parent:
32+
$ref: '#/tables/0'
33+
self_ref: '#/groups/1'
2534
key_value_items: []
2635
name: ''
2736
pages: {}
@@ -34,6 +43,7 @@ tables:
3443
- $ref: '#/texts/0'
3544
- $ref: '#/groups/0'
3645
- $ref: '#/tables/1'
46+
- $ref: '#/groups/1'
3747
content_layer: body
3848
data:
3949
grid:
@@ -117,8 +127,28 @@ tables:
117127
start_col_offset_idx: 1
118128
start_row_offset_idx: 3
119129
text: ''
130+
- - col_span: 1
131+
column_header: false
132+
end_col_offset_idx: 1
133+
end_row_offset_idx: 5
134+
row_header: false
135+
row_section: false
136+
row_span: 1
137+
start_col_offset_idx: 0
138+
start_row_offset_idx: 4
139+
text: ''
140+
- col_span: 1
141+
column_header: false
142+
end_col_offset_idx: 2
143+
end_row_offset_idx: 5
144+
row_header: false
145+
row_section: false
146+
row_span: 1
147+
start_col_offset_idx: 1
148+
start_row_offset_idx: 4
149+
text: cell 4,1
120150
num_cols: 2
121-
num_rows: 4
151+
num_rows: 5
122152
table_cells:
123153
- col_span: 1
124154
column_header: false
@@ -206,6 +236,28 @@ tables:
206236
start_col_offset_idx: 1
207237
start_row_offset_idx: 3
208238
text: ''
239+
- col_span: 1
240+
column_header: false
241+
end_col_offset_idx: 1
242+
end_row_offset_idx: 5
243+
ref:
244+
$ref: '#/groups/1'
245+
row_header: false
246+
row_section: false
247+
row_span: 1
248+
start_col_offset_idx: 0
249+
start_row_offset_idx: 4
250+
text: ''
251+
- col_span: 1
252+
column_header: false
253+
end_col_offset_idx: 2
254+
end_row_offset_idx: 5
255+
row_header: false
256+
row_section: false
257+
row_span: 1
258+
start_col_offset_idx: 1
259+
start_row_offset_idx: 4
260+
text: cell 4,1
209261
footnotes: []
210262
label: table
211263
parent:
@@ -387,4 +439,22 @@ texts:
387439
prov: []
388440
self_ref: '#/texts/2'
389441
text: list item 2
442+
- children: []
443+
content_layer: body
444+
label: text
445+
orig: Some text in a generic group.
446+
parent:
447+
$ref: '#/groups/1'
448+
prov: []
449+
self_ref: '#/texts/3'
450+
text: Some text in a generic group.
451+
- children: []
452+
content_layer: body
453+
label: text
454+
orig: More text in the group.
455+
parent:
456+
$ref: '#/groups/1'
457+
prov: []
458+
self_ref: '#/texts/4'
459+
text: More text in the group.
390460
version: 1.6.0

test/test_docling_doc.py

Lines changed: 20 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2102,7 +2102,7 @@ def _construct_rich_table_doc():
21022102

21032103
table_item = doc.add_table(
21042104
data=TableData(
2105-
num_rows=4,
2105+
num_rows=5,
21062106
num_cols=2,
21072107
),
21082108
)
@@ -2121,6 +2121,17 @@ def _construct_rich_table_doc():
21212121
rich_item_3 = doc.add_table(
21222122
data=TableData(num_rows=2, num_cols=3), parent=table_item
21232123
)
2124+
2125+
rich_item_4 = doc.add_group(parent=table_item, label=GroupLabel.UNSPECIFIED)
2126+
doc.add_text(
2127+
parent=rich_item_4,
2128+
text="Some text in a generic group.",
2129+
label=DocItemLabel.TEXT,
2130+
)
2131+
doc.add_text(
2132+
parent=rich_item_4, text="More text in the group.", label=DocItemLabel.TEXT
2133+
)
2134+
21242135
for i in range(rich_item_3.data.num_rows):
21252136
for j in range(rich_item_3.data.num_cols):
21262137
cell = TableCell(
@@ -2158,6 +2169,14 @@ def _construct_rich_table_doc():
21582169
end_col_offset_idx=j + 1,
21592170
ref=rich_item_3.get_ref(),
21602171
)
2172+
elif i == 4 and j == 0:
2173+
cell = RichTableCell(
2174+
start_row_offset_idx=i,
2175+
end_row_offset_idx=i + 1,
2176+
start_col_offset_idx=j,
2177+
end_col_offset_idx=j + 1,
2178+
ref=rich_item_4.get_ref(),
2179+
)
21612180
else:
21622181
cell = TableCell(
21632182
start_row_offset_idx=i,

0 commit comments

Comments
 (0)