Skip to content

Commit bba05d1

Browse files
committed
fix(layout,table): orientation-aware layout and table detection
Signed-off-by: Clément Doumouro <[email protected]>
1 parent 8ffa01b commit bba05d1

24 files changed

+38557
-2377
lines changed

tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,10 +213,10 @@
213213
"prov": [
214214
{
215215
"bbox": [
216-
139.66741943359375,
216+
139.66746520996094,
217217
322.5054626464844,
218-
475.00927734375,
219-
454.45458984375
218+
475.0093078613281,
219+
454.4546203613281
220220
],
221221
"page": 1,
222222
"span": [

tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2705,7 +2705,7 @@
27052705
"b": 102.78223000000003,
27062706
"coord_origin": "TOPLEFT"
27072707
},
2708-
"confidence": 0.9373534917831421,
2708+
"confidence": 0.9373531937599182,
27092709
"cells": [
27102710
{
27112711
"index": 0,
@@ -2745,7 +2745,7 @@
27452745
"b": 102.78223000000003,
27462746
"coord_origin": "TOPLEFT"
27472747
},
2748-
"confidence": 0.8858680725097656,
2748+
"confidence": 0.8858677744865417,
27492749
"cells": [
27502750
{
27512751
"index": 1,
@@ -2785,7 +2785,7 @@
27852785
"b": 152.90697999999998,
27862786
"coord_origin": "TOPLEFT"
27872787
},
2788-
"confidence": 0.9806433916091919,
2788+
"confidence": 0.9806435108184814,
27892789
"cells": [
27902790
{
27912791
"index": 2,
@@ -3155,7 +3155,7 @@
31553155
"b": 327.98218,
31563156
"coord_origin": "TOPLEFT"
31573157
},
3158-
"confidence": 0.9591909050941467,
3158+
"confidence": 0.9591910243034363,
31593159
"cells": [
31603160
{
31613161
"index": 15,
@@ -3339,9 +3339,9 @@
33393339
"id": 0,
33403340
"label": "table",
33413341
"bbox": {
3342-
"l": 139.66741943359375,
3343-
"t": 337.54541015625,
3344-
"r": 475.00927734375,
3342+
"l": 139.66746520996094,
3343+
"t": 337.5453796386719,
3344+
"r": 475.0093078613281,
33453345
"b": 469.4945373535156,
33463346
"coord_origin": "TOPLEFT"
33473347
},
@@ -7846,7 +7846,7 @@
78467846
"b": 518.17419,
78477847
"coord_origin": "TOPLEFT"
78487848
},
7849-
"confidence": 0.9589294195175171,
7849+
"confidence": 0.9589295387268066,
78507850
"cells": [
78517851
{
78527852
"index": 91,
@@ -8243,9 +8243,9 @@
82438243
"id": 0,
82448244
"label": "table",
82458245
"bbox": {
8246-
"l": 139.66741943359375,
8247-
"t": 337.54541015625,
8248-
"r": 475.00927734375,
8246+
"l": 139.66746520996094,
8247+
"t": 337.5453796386719,
8248+
"r": 475.0093078613281,
82498249
"b": 469.4945373535156,
82508250
"coord_origin": "TOPLEFT"
82518251
},
@@ -13641,7 +13641,7 @@
1364113641
"b": 102.78223000000003,
1364213642
"coord_origin": "TOPLEFT"
1364313643
},
13644-
"confidence": 0.9373534917831421,
13644+
"confidence": 0.9373531937599182,
1364513645
"cells": [
1364613646
{
1364713647
"index": 0,
@@ -13687,7 +13687,7 @@
1368713687
"b": 102.78223000000003,
1368813688
"coord_origin": "TOPLEFT"
1368913689
},
13690-
"confidence": 0.8858680725097656,
13690+
"confidence": 0.8858677744865417,
1369113691
"cells": [
1369213692
{
1369313693
"index": 1,
@@ -13733,7 +13733,7 @@
1373313733
"b": 152.90697999999998,
1373413734
"coord_origin": "TOPLEFT"
1373513735
},
13736-
"confidence": 0.9806433916091919,
13736+
"confidence": 0.9806435108184814,
1373713737
"cells": [
1373813738
{
1373913739
"index": 2,
@@ -14121,7 +14121,7 @@
1412114121
"b": 327.98218,
1412214122
"coord_origin": "TOPLEFT"
1412314123
},
14124-
"confidence": 0.9591909050941467,
14124+
"confidence": 0.9591910243034363,
1412514125
"cells": [
1412614126
{
1412714127
"index": 15,
@@ -14311,9 +14311,9 @@
1431114311
"id": 0,
1431214312
"label": "table",
1431314313
"bbox": {
14314-
"l": 139.66741943359375,
14315-
"t": 337.54541015625,
14316-
"r": 475.00927734375,
14314+
"l": 139.66746520996094,
14315+
"t": 337.5453796386719,
14316+
"r": 475.0093078613281,
1431714317
"b": 469.4945373535156,
1431814318
"coord_origin": "TOPLEFT"
1431914319
},
@@ -19701,7 +19701,7 @@
1970119701
"b": 518.17419,
1970219702
"coord_origin": "TOPLEFT"
1970319703
},
19704-
"confidence": 0.9589294195175171,
19704+
"confidence": 0.9589295387268066,
1970519705
"cells": [
1970619706
{
1970719707
"index": 91,
@@ -20116,7 +20116,7 @@
2011620116
"b": 152.90697999999998,
2011720117
"coord_origin": "TOPLEFT"
2011820118
},
20119-
"confidence": 0.9806433916091919,
20119+
"confidence": 0.9806435108184814,
2012020120
"cells": [
2012120121
{
2012220122
"index": 2,
@@ -20504,7 +20504,7 @@
2050420504
"b": 327.98218,
2050520505
"coord_origin": "TOPLEFT"
2050620506
},
20507-
"confidence": 0.9591909050941467,
20507+
"confidence": 0.9591910243034363,
2050820508
"cells": [
2050920509
{
2051020510
"index": 15,
@@ -20694,9 +20694,9 @@
2069420694
"id": 0,
2069520695
"label": "table",
2069620696
"bbox": {
20697-
"l": 139.66741943359375,
20698-
"t": 337.54541015625,
20699-
"r": 475.00927734375,
20697+
"l": 139.66746520996094,
20698+
"t": 337.5453796386719,
20699+
"r": 475.0093078613281,
2070020700
"b": 469.4945373535156,
2070120701
"coord_origin": "TOPLEFT"
2070220702
},
@@ -26084,7 +26084,7 @@
2608426084
"b": 518.17419,
2608526085
"coord_origin": "TOPLEFT"
2608626086
},
26087-
"confidence": 0.9589294195175171,
26087+
"confidence": 0.9589295387268066,
2608826088
"cells": [
2608926089
{
2609026090
"index": 91,
@@ -26499,7 +26499,7 @@
2649926499
"b": 102.78223000000003,
2650026500
"coord_origin": "TOPLEFT"
2650126501
},
26502-
"confidence": 0.9373534917831421,
26502+
"confidence": 0.9373531937599182,
2650326503
"cells": [
2650426504
{
2650526505
"index": 0,
@@ -26545,7 +26545,7 @@
2654526545
"b": 102.78223000000003,
2654626546
"coord_origin": "TOPLEFT"
2654726547
},
26548-
"confidence": 0.8858680725097656,
26548+
"confidence": 0.8858677744865417,
2654926549
"cells": [
2655026550
{
2655126551
"index": 1,
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
<doctag><text><loc_59><loc_46><loc_424><loc_90>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
1+
<doctag><text><loc_60><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
22
</doctag>
Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
<document>
2-
<table>
3-
<location><page_1><loc_12><loc_39><loc_67><loc_87></location>
4-
<row_0><col_0><body></col_0><col_1><col_header>Column 0</col_1><col_2><col_header>Column 1</col_2><col_3><col_header>Column 2</col_3></row_0>
5-
<row_1><col_0><row_header>this is row 0</col_0><col_1><body>some cells</col_1><col_2><body>have content</col_2><col_3><body>and</col_3></row_1>
6-
<row_2><col_0><row_header>and row 1</col_0><col_1><body></col_1><col_2><body>other</col_2><col_3><body>have</col_3></row_2>
7-
<row_3><col_0><row_header>and last row 2</col_0><col_1><body>nothing</col_1><col_2><body></col_2><col_3><body>inside</col_3></row_3>
8-
</table>
2+
<subtitle-level-1><location><page_1><loc_33><loc_87><loc_68><loc_91></location>This is a table test</subtitle-level-1>
3+
<paragraph><location><page_1><loc_12><loc_83><loc_61><loc_84></location>The test starts with some random text and then a table image:</paragraph>
4+
<paragraph><location><page_1><loc_45><loc_76><loc_56><loc_77></location>Some column</paragraph>
5+
<paragraph><location><page_1><loc_62><loc_76><loc_78><loc_77></location>Some other column</paragraph>
6+
<paragraph><location><page_1><loc_29><loc_70><loc_37><loc_71></location>Some row</paragraph>
7+
<paragraph><location><page_1><loc_47><loc_70><loc_54><loc_71></location>some cell</paragraph>
8+
<paragraph><location><page_1><loc_65><loc_70><loc_76><loc_71></location>have content</paragraph>
9+
<paragraph><location><page_1><loc_26><loc_64><loc_39><loc_65></location>Some other row</paragraph>
10+
<paragraph><location><page_1><loc_46><loc_64><loc_55><loc_65></location>other don't</paragraph>
911
</document>

0 commit comments

Comments
 (0)