Skip to content

Commit ca2be7f

Browse files
cau-git and dolfim-ibm authored
fix: Empty table handling (#2365)
* add table raw cells when no table structure model was used

  Signed-off-by: Michele Dolfi <[email protected]>

* Add RichTableCell instance for tables with missing structure.

  Signed-off-by: Christoph Auer <[email protected]>

* Update test GT

  Signed-off-by: Christoph Auer <[email protected]>

* update test results

  Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
1 parent e6c3b05 commit ca2be7f

File tree

4 files changed: 19 additions (+19) and 60 deletions (-60).

4 files changed: 19 additions (+19) and 60 deletions (-60).

docling/models/readingorder_model.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -216,8 +216,13 @@ def _readingorder_elements_to_docling_doc(
216216
elif isinstance(element, Table):
217217
# Check if table has no structure prediction
218218
if element.num_rows == 0 and element.num_cols == 0:
219-
# Create minimal 1x1 table with rich cell containing all children
220-
tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
219+
# Only create 1x1 table if there are children to put in it
220+
if element.cluster.children:
221+
# Create minimal 1x1 table with rich cell containing all children
222+
tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
223+
else:
224+
# Create empty table with no structure
225+
tbl_data = TableData(num_rows=0, num_cols=0, table_cells=[])
221226
else:
222227
tbl_data = TableData(
223228
num_rows=element.num_rows,
@@ -253,8 +258,12 @@ def _readingorder_elements_to_docling_doc(
253258

254259
tbl.footnotes.append(new_footnote_item.get_ref())
255260

256-
# Handle case where table has no structure prediction
257-
if element.num_rows == 0 and element.num_cols == 0:
261+
# Handle case where table has no structure prediction but has children
262+
if (
263+
element.num_rows == 0
264+
and element.num_cols == 0
265+
and element.cluster.children
266+
):
258267
# Create rich cell containing all child elements
259268
rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)
260269

tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -218,7 +218,7 @@
218218
<picture><loc_67><loc_92><loc_205><loc_308></picture>
219219
<caption><loc_52><loc_317><loc_223><loc_323>Figure 8: Example of a table with multi-line header.</caption>
220220
<picture><loc_252><loc_59><loc_455><loc_185><caption><loc_252><loc_194><loc_445><loc_207>Figure 9: Example of a table with big empty distance between cells.</caption></picture>
221-
<otsl><loc_274><loc_286><loc_400><loc_317><ecel><nl></otsl>
221+
<otsl><loc_274><loc_286><loc_400><loc_317></otsl>
222222
<picture><loc_273><loc_239><loc_424><loc_420><caption><loc_255><loc_430><loc_443><loc_435>Figure 10: Example of a complex table with empty cells.</caption></picture>
223223
<page_footer><loc_239><loc_464><loc_247><loc_469>13</page_footer>
224224
<page_break>

tests/data/groundtruth/docling_v2/2203.01017v2.json

Lines changed: 5 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -829,16 +829,6 @@
829829
"content_layer": "body",
830830
"name": "list",
831831
"label": "list"
832-
},
833-
{
834-
"self_ref": "#/groups/12",
835-
"parent": {
836-
"$ref": "#/tables/6"
837-
},
838-
"children": [],
839-
"content_layer": "body",
840-
"name": "rich_cell_group_7_0_0",
841-
"label": "unspecified"
842832
}
843833
],
844834
"texts": [
@@ -25418,11 +25408,7 @@
2541825408
"parent": {
2541925409
"$ref": "#/body"
2542025410
},
25421-
"children": [
25422-
{
25423-
"$ref": "#/groups/12"
25424-
}
25425-
],
25411+
"children": [],
2542625412
"content_layer": "body",
2542725413
"label": "table",
2542825414
"prov": [
@@ -25445,43 +25431,10 @@
2544525431
"references": [],
2544625432
"footnotes": [],
2544725433
"data": {
25448-
"table_cells": [
25449-
{
25450-
"row_span": 1,
25451-
"col_span": 1,
25452-
"start_row_offset_idx": 0,
25453-
"end_row_offset_idx": 1,
25454-
"start_col_offset_idx": 0,
25455-
"end_col_offset_idx": 1,
25456-
"text": "",
25457-
"column_header": false,
25458-
"row_header": false,
25459-
"row_section": false,
25460-
"fillable": false,
25461-
"ref": {
25462-
"$ref": "#/groups/12"
25463-
}
25464-
}
25465-
],
25466-
"num_rows": 1,
25467-
"num_cols": 1,
25468-
"grid": [
25469-
[
25470-
{
25471-
"row_span": 1,
25472-
"col_span": 1,
25473-
"start_row_offset_idx": 0,
25474-
"end_row_offset_idx": 1,
25475-
"start_col_offset_idx": 0,
25476-
"end_col_offset_idx": 1,
25477-
"text": "",
25478-
"column_header": false,
25479-
"row_header": false,
25480-
"row_section": false,
25481-
"fillable": false
25482-
}
25483-
]
25484-
]
25434+
"table_cells": [],
25435+
"num_rows": 0,
25436+
"num_cols": 0,
25437+
"grid": []
2548525438
},
2548625439
"annotations": []
2548725440
}

tests/data/groundtruth/docling_v2/2203.01017v2.md

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -367,9 +367,6 @@ Figure 9: Example of a table with big empty distance between cells.
367367

368368
<!-- image -->
369369

370-
| |
371-
|----|
372-
373370
Figure 10: Example of a complex table with empty cells.
374371

375372
<!-- image -->

0 commit comments

Comments
 (0)