Skip to content

Commit 8b4af6f

Browse files
committed
fix(layout,table): orientation-aware layout and table detection
Signed-off-by: Clément Doumouro <[email protected]>
1 parent f43d885 commit 8b4af6f

26 files changed

+593
-124
lines changed

docling/models/layout_model.py

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import copy
22
import logging
33
import warnings
4-
from copy import deepcopy
54
from collections.abc import Iterable
5+
from copy import deepcopy
66
from pathlib import Path
77
from typing import Optional
88

@@ -18,7 +18,7 @@
1818
from docling.models.utils.hf_model_download import download_hf_model
1919
from docling.utils.accelerator_utils import decide_device
2020
from docling.utils.layout_postprocessor import LayoutPostprocessor
21-
from docling.utils.orientation import detect_orientation
21+
from docling.utils.orientation import detect_orientation, rotate_bounding_box
2222
from docling.utils.profiling import TimeRecorder
2323
from docling.utils.visualization import draw_clusters
2424

@@ -99,7 +99,6 @@ def draw_clusters_and_cells_side_by_side(
9999
self,
100100
conv_res,
101101
page,
102-
page_orientation: int,
103102
clusters,
104103
mode_prefix: str,
105104
show: bool = False,
@@ -113,10 +112,6 @@ def draw_clusters_and_cells_side_by_side(
113112
page_image = deepcopy(page.image)
114113
scale_x = page_image.width / page.size.width
115114
scale_y = page_image.height / page.size.height
116-
if page_orientation:
117-
page_image = page_image.rotate(-page_orientation, expand=True)
118-
if abs(page_orientation) in [90, 270]:
119-
scale_x, scale_y = scale_y, scale_x
120115
# Filter clusters for left and right images
121116
exclude_labels = {
122117
DocItemLabel.FORM,
@@ -132,9 +127,6 @@ def draw_clusters_and_cells_side_by_side(
132127
# Draw clusters on both images
133128
draw_clusters(left_image, left_clusters, scale_x, scale_y)
134129
draw_clusters(right_image, right_clusters, scale_x, scale_y)
135-
if page_orientation:
136-
left_image = left_image.rotate(page_orientation, expand=True)
137-
right_image = right_image.rotate(page_orientation, expand=True)
138130
# Combine the images side by side
139131
combined_width = left_image.width * 2
140132
combined_height = left_image.height
@@ -177,11 +169,16 @@ def __call__(
177169
.replace(" ", "_")
178170
.replace("-", "_")
179171
) # Temporary, until docling-ibm-model uses docling-core types
172+
bbox = BoundingBox.model_validate(pred_item)
173+
if page_orientation:
174+
bbox = rotate_bounding_box(
175+
bbox, page_orientation, page_image.size
176+
).to_bounding_box()
180177
cluster = Cluster(
181178
id=ix,
182179
label=label,
183180
confidence=pred_item["confidence"],
184-
bbox=BoundingBox.model_validate(pred_item),
181+
bbox=bbox,
185182
cells=[],
186183
)
187184
clusters.append(cluster)
@@ -190,7 +187,6 @@ def __call__(
190187
self.draw_clusters_and_cells_side_by_side(
191188
conv_res,
192189
page,
193-
page_orientation,
194190
clusters,
195191
mode_prefix="raw",
196192
)
@@ -228,7 +224,6 @@ def __call__(
228224
self.draw_clusters_and_cells_side_by_side(
229225
conv_res,
230226
page,
231-
page_orientation,
232227
processed_clusters,
233228
mode_prefix="postprocessed",
234229
)

docling/models/table_structure_model.py

Lines changed: 54 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
import copy
22
import warnings
3-
from collections.abc import Iterable
43
from pathlib import Path
5-
from typing import Optional
4+
from typing import Iterable, Optional, Tuple, cast
65

76
import numpy
87
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -11,6 +10,7 @@
1110
TextCellUnit,
1211
)
1312
from PIL import ImageDraw
13+
from PIL.Image import Image
1414

1515
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
1616
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
@@ -23,13 +23,16 @@
2323
from docling.models.base_model import BasePageModel
2424
from docling.models.utils.hf_model_download import download_hf_model
2525
from docling.utils.accelerator_utils import decide_device
26+
from docling.utils.orientation import detect_orientation, rotate_bounding_box
2627
from docling.utils.profiling import TimeRecorder
2728

2829

2930
class TableStructureModel(BasePageModel):
3031
_model_repo_folder = "ds4sd--docling-models"
3132
_model_path = "model_artifacts/tableformer"
3233

34+
_table_labels = {DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX}
35+
3336
def __init__(
3437
self,
3538
enabled: bool,
@@ -186,31 +189,48 @@ def __call__(
186189
page.predictions.tablestructure = (
187190
TableStructurePrediction()
188191
) # dummy
192+
cells_orientation = detect_orientation(page.cells)
193+
# Keep only the layout clusters labelled as tables
194+
in_tables_clusters = [
195+
cluster
196+
for cluster in page.predictions.layout.clusters
197+
if cluster.label in self._table_labels
198+
]
189199

200+
if not len(in_tables_clusters):
201+
yield page
202+
continue
203+
# Rotate and scale the page image
204+
page_im = cast(Image, page.get_image())
205+
scaled_page_im: Image = cast(
206+
Image, page.get_image(scale=self.scale)
207+
)
208+
if cells_orientation:
209+
scaled_page_im = scaled_page_im.rotate(
210+
-cells_orientation, expand=True
211+
)
212+
page_input = {
213+
"width": scaled_page_im.size[0],
214+
"height": scaled_page_im.size[1],
215+
"image": numpy.asarray(scaled_page_im),
216+
}
217+
# Rotate and scale the table bounding boxes
190218
in_tables = [
191219
(
192-
cluster,
220+
c,
193221
[
194-
round(cluster.bbox.l) * self.scale,
195-
round(cluster.bbox.t) * self.scale,
196-
round(cluster.bbox.r) * self.scale,
197-
round(cluster.bbox.b) * self.scale,
222+
round(x) * self.scale
223+
for x in _rotate_bbox(
224+
c.bbox,
225+
orientation=-cells_orientation,
226+
im_size=page_im.size,
227+
)
228+
.to_top_left_origin(page_im.size[1])
229+
.as_tuple()
198230
],
199231
)
200-
for cluster in page.predictions.layout.clusters
201-
if cluster.label
202-
in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
232+
for c in in_tables_clusters
203233
]
204-
if not len(in_tables):
205-
yield page
206-
continue
207-
208-
page_input = {
209-
"width": page.size.width * self.scale,
210-
"height": page.size.height * self.scale,
211-
"image": numpy.asarray(page.get_image(scale=self.scale)),
212-
}
213-
214234
table_clusters, table_bboxes = zip(*in_tables)
215235

216236
if len(table_bboxes):
@@ -238,11 +258,16 @@ def __call__(
238258
scale=self.scale
239259
)
240260
)
261+
new_bbox = _rotate_bbox(
262+
new_cell.to_bounding_box(),
263+
orientation=-cells_orientation,
264+
im_size=scaled_page_im.size,
265+
).model_dump()
241266
tokens.append(
242267
{
243268
"id": new_cell.index,
244269
"text": new_cell.text,
245-
"bbox": new_cell.rect.to_bounding_box().model_dump(),
270+
"bbox": new_bbox,
246271
}
247272
)
248273
page_input["tokens"] = tokens
@@ -302,3 +327,11 @@ def __call__(
302327
)
303328

304329
yield page
330+
331+
332+
def _rotate_bbox(
333+
bbox: BoundingBox, *, orientation: int, im_size: Tuple[int, int]
334+
) -> BoundingBox:
335+
if orientation:
336+
return rotate_bounding_box(bbox, orientation, im_size).to_bounding_box()
337+
return bbox

docling/models/tesseract_ocr_cli_model.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
parse_tesseract_orientation,
2828
tesseract_box_to_bounding_rectangle,
2929
)
30-
from docling.utils.orientation import Box
3130
from docling.utils.profiling import TimeRecorder
3231

3332
_log = logging.getLogger(__name__)

docling/utils/ocr_utils.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Optional
1+
from typing import Optional, Tuple
22

33
from docling_core.types.doc import BoundingBox, CoordOrigin
44
from docling_core.types.doc.page import BoundingRectangle
@@ -43,8 +43,10 @@ def tesseract_box_to_bounding_rectangle(
4343
orientation: int,
4444
im_size: Tuple[int, int],
4545
) -> BoundingRectangle:
46-
# box is in the top, left, height, width format, top left coordinates
47-
rect = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
46+
# bbox is in the top, left, height, width format, top left coordinates
47+
# Tesseract detection ran on the document rotated by minus the orientation, so we
48+
# have to rotate the box by the orientation angle
49+
rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
4850
rect = BoundingRectangle(
4951
r_x0=rect.r_x0 / scale,
5052
r_y0=rect.r_y0 / scale,
@@ -54,7 +56,7 @@ def tesseract_box_to_bounding_rectangle(
5456
r_y2=rect.r_y2 / scale,
5557
r_x3=rect.r_x3 / scale,
5658
r_y3=rect.r_y3 / scale,
57-
coord_origin=CoordOrigin.TOPLEFT,
59+
coord_origin=rect.coord_origin,
5860
)
5961
if original_offset is not None:
6062
if original_offset.coord_origin is not CoordOrigin.TOPLEFT:

docling/utils/orientation.py

Lines changed: 27 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,22 @@
11
from collections import Counter
22
from operator import itemgetter
3+
from typing import Tuple
34

4-
from docling_core.types.doc.page import TextCell
5+
from docling_core.types.doc import BoundingBox, CoordOrigin
6+
from docling_core.types.doc.page import BoundingRectangle, TextCell
57

6-
_ORIENTATIONS = [0, 90, 180, 270]
8+
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
79

810

911
def _clipped_orientation(angle: float) -> int:
10-
return min((abs(angle - o) % 360, o) for o in _ORIENTATIONS)[1]
12+
return min((abs(angle - o) % 360, o) for o in CLIPPED_ORIENTATIONS)[1]
1113

1214

1315
def detect_orientation(cells: list[TextCell]) -> int:
1416
if not cells:
1517
return 0
1618
orientation_counter = Counter(_clipped_orientation(c.rect.angle_360) for c in cells)
1719
return max(orientation_counter.items(), key=itemgetter(1))[0]
18-
from typing import Tuple
19-
20-
from docling_core.types.doc import BoundingBox, CoordOrigin
21-
from docling_core.types.doc.page import BoundingRectangle
22-
23-
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
2420

2521

2622
def rotate_bounding_box(
@@ -31,51 +27,44 @@ def rotate_bounding_box(
3127
# coordinate system. Then other corners are found rotating counterclockwise
3228
bbox = bbox.to_top_left_origin(im_size[1])
3329
left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
34-
im_h, im_w = im_size
30+
im_w, im_h = im_size
3531
angle = angle % 360
3632
if angle == 0:
37-
r_x0 = left
38-
r_y0 = top + height
39-
r_x1 = r_x0 + width
40-
r_y1 = r_y0
41-
r_x2 = r_x0 + width
42-
r_y2 = r_y0 - height
43-
r_x3 = r_x0
44-
r_y3 = r_y0 - height
33+
return BoundingRectangle.from_bounding_box(bbox)
4534
elif angle == 90:
46-
r_x0 = im_w - (top + height)
47-
r_y0 = left
35+
r_x0 = top + height
36+
r_y0 = im_w - left
4837
r_x1 = r_x0
49-
r_y1 = r_y0 + width
50-
r_x2 = r_x0 + height
51-
r_y2 = r_y0 + width
52-
r_x3 = r_x0
53-
r_y3 = r_y0 + width
38+
r_y1 = r_y0 - width
39+
r_x2 = r_x1 - height
40+
r_y2 = r_y1
41+
r_x3 = r_x2
42+
r_y3 = r_y0
5443
elif angle == 180:
55-
r_x0 = im_h - left
56-
r_y0 = im_w - (top + height)
44+
r_x0 = width + left
45+
r_y0 = im_h - (top + height)
5746
r_x1 = r_x0 - width
5847
r_y1 = r_y0
59-
r_x2 = r_x0 - width
60-
r_y2 = r_y0 + height
48+
r_x2 = r_x1
49+
r_y2 = r_x2 + height
6150
r_x3 = r_x0
62-
r_y3 = r_y0 + height
51+
r_y3 = r_y2
6352
elif angle == 270:
64-
r_x0 = top + height
65-
r_y0 = im_h - left
53+
r_x0 = im_h - (top + height)
54+
r_y0 = left
6655
r_x1 = r_x0
67-
r_y1 = r_y0 - width
68-
r_x2 = r_x0 - height
69-
r_y2 = r_y0 - width
70-
r_x3 = r_x0 - height
56+
r_y1 = r_y0 + width
57+
r_x2 = r_x1 + height
58+
r_y2 = r_y1
59+
r_x3 = r_x2
7160
r_y3 = r_y0
7261
else:
7362
msg = (
7463
f"invalid orientation {angle}, expected values in:"
7564
f" {sorted(CLIPPED_ORIENTATIONS)}"
7665
)
7766
raise ValueError(msg)
78-
return BoundingRectangle(
67+
rectangle = BoundingRectangle(
7968
r_x0=r_x0,
8069
r_y0=r_y0,
8170
r_x1=r_x1,
@@ -86,3 +75,4 @@ def rotate_bounding_box(
8675
r_y3=r_y3,
8776
coord_origin=CoordOrigin.TOPLEFT,
8877
)
78+
return rectangle
Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
11
<document>
2-
<paragraph><location><page_1><loc_12><loc_82><loc_85><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
2+
<table>
3+
<location><page_1><loc_12><loc_39><loc_67><loc_87></location>
4+
<row_0><col_0><body></col_0><col_1><col_header>Column 0</col_1><col_2><col_header>Column 1</col_2><col_3><col_header>Column 2</col_3></row_0>
5+
<row_1><col_0><row_header>this is row 0</col_0><col_1><body>some cells</col_1><col_2><body>have content</col_2><col_3><body>and</col_3></row_1>
6+
<row_2><col_0><row_header>and row 1</col_0><col_1><body></col_1><col_2><body>other</col_2><col_3><body>have</col_3></row_2>
7+
<row_3><col_0><row_header>and last row 2</col_0><col_1><body>nothing</col_1><col_2><body></col_2><col_3><body>inside</col_3></row_3>
8+
</table>
39
</document>
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
1-
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
1+
| | Column 0 | Column 1 | Column 2 |
2+
|----------------|------------|--------------|------------|
3+
| this is row 0 | some cells | have content | and |
4+
| and row 1 | | other | have |
5+
| and last row 2 | nothing | | inside |

tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md

Lines changed: 0 additions & 1 deletion
This file was deleted.

0 commit comments

Comments
 (0)