Skip to content

Commit 8ffa01b

Browse files
committed
fix(layout,table): orientation-aware layout and table detection
Signed-off-by: Clément Doumouro <[email protected]>
1 parent a47fd83 commit 8ffa01b

26 files changed

+570
-95
lines changed

docling/models/layout_model.py

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import copy
22
import logging
33
import warnings
4-
from copy import deepcopy
54
from collections.abc import Iterable
5+
from copy import deepcopy
66
from pathlib import Path
77
from typing import Optional
88

@@ -19,7 +19,7 @@
1919
from docling.models.utils.hf_model_download import download_hf_model
2020
from docling.utils.accelerator_utils import decide_device
2121
from docling.utils.layout_postprocessor import LayoutPostprocessor
22-
from docling.utils.orientation import detect_orientation
22+
from docling.utils.orientation import detect_orientation, rotate_bounding_box
2323
from docling.utils.profiling import TimeRecorder
2424
from docling.utils.visualization import draw_clusters
2525

@@ -105,7 +105,6 @@ def draw_clusters_and_cells_side_by_side(
105105
self,
106106
conv_res,
107107
page,
108-
page_orientation: int,
109108
clusters,
110109
mode_prefix: str,
111110
show: bool = False,
@@ -119,10 +118,6 @@ def draw_clusters_and_cells_side_by_side(
119118
page_image = deepcopy(page.image)
120119
scale_x = page_image.width / page.size.width
121120
scale_y = page_image.height / page.size.height
122-
if page_orientation:
123-
page_image = page_image.rotate(-page_orientation, expand=True)
124-
if abs(page_orientation) in [90, 270]:
125-
scale_x, scale_y = scale_y, scale_x
126121
# Filter clusters for left and right images
127122
exclude_labels = {
128123
DocItemLabel.FORM,
@@ -138,9 +133,6 @@ def draw_clusters_and_cells_side_by_side(
138133
# Draw clusters on both images
139134
draw_clusters(left_image, left_clusters, scale_x, scale_y)
140135
draw_clusters(right_image, right_clusters, scale_x, scale_y)
141-
if page_orientation:
142-
left_image = left_image.rotate(page_orientation, expand=True)
143-
right_image = right_image.rotate(page_orientation, expand=True)
144136
# Combine the images side by side
145137
combined_width = left_image.width * 2
146138
combined_height = left_image.height
@@ -183,11 +175,16 @@ def __call__(
183175
.replace(" ", "_")
184176
.replace("-", "_")
185177
) # Temporary, until docling-ibm-model uses docling-core types
178+
bbox = BoundingBox.model_validate(pred_item)
179+
if page_orientation:
180+
bbox = rotate_bounding_box(
181+
bbox, page_orientation, page_image.size
182+
).to_bounding_box()
186183
cluster = Cluster(
187184
id=ix,
188185
label=label,
189186
confidence=pred_item["confidence"],
190-
bbox=BoundingBox.model_validate(pred_item),
187+
bbox=bbox,
191188
cells=[],
192189
)
193190
clusters.append(cluster)
@@ -196,7 +193,6 @@ def __call__(
196193
self.draw_clusters_and_cells_side_by_side(
197194
conv_res,
198195
page,
199-
page_orientation,
200196
clusters,
201197
mode_prefix="raw",
202198
)
@@ -234,7 +230,6 @@ def __call__(
234230
self.draw_clusters_and_cells_side_by_side(
235231
conv_res,
236232
page,
237-
page_orientation,
238233
processed_clusters,
239234
mode_prefix="postprocessed",
240235
)

docling/models/table_structure_model.py

Lines changed: 54 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
import copy
22
import warnings
3-
from collections.abc import Iterable
43
from pathlib import Path
5-
from typing import Optional
4+
from typing import Iterable, Optional, Tuple, cast
65

76
import numpy
87
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -11,6 +10,7 @@
1110
TextCellUnit,
1211
)
1312
from PIL import ImageDraw
13+
from PIL.Image import Image
1414

1515
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
1616
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
@@ -23,13 +23,16 @@
2323
from docling.models.base_model import BasePageModel
2424
from docling.models.utils.hf_model_download import download_hf_model
2525
from docling.utils.accelerator_utils import decide_device
26+
from docling.utils.orientation import detect_orientation, rotate_bounding_box
2627
from docling.utils.profiling import TimeRecorder
2728

2829

2930
class TableStructureModel(BasePageModel):
3031
_model_repo_folder = "ds4sd--docling-models"
3132
_model_path = "model_artifacts/tableformer"
3233

34+
_table_labels = {DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX}
35+
3336
def __init__(
3437
self,
3538
enabled: bool,
@@ -186,31 +189,48 @@ def __call__(
186189
page.predictions.tablestructure = (
187190
TableStructurePrediction()
188191
) # dummy
192+
cells_orientation = detect_orientation(page.cells)
193+
# Keep only table bboxes
194+
in_tables_clusters = [
195+
cluster
196+
for cluster in page.predictions.layout.clusters
197+
if cluster.label in self._table_labels
198+
]
189199

200+
if not len(in_tables_clusters):
201+
yield page
202+
continue
203+
# Rotate and scale table image
204+
page_im = cast(Image, page.get_image())
205+
scaled_page_im: Image = cast(
206+
Image, page.get_image(scale=self.scale)
207+
)
208+
if cells_orientation:
209+
scaled_page_im = scaled_page_im.rotate(
210+
-cells_orientation, expand=True
211+
)
212+
page_input = {
213+
"width": scaled_page_im.size[0],
214+
"height": scaled_page_im.size[1],
215+
"image": numpy.asarray(scaled_page_im),
216+
}
217+
# Rotate and scale table cells
190218
in_tables = [
191219
(
192-
cluster,
220+
c,
193221
[
194-
round(cluster.bbox.l) * self.scale,
195-
round(cluster.bbox.t) * self.scale,
196-
round(cluster.bbox.r) * self.scale,
197-
round(cluster.bbox.b) * self.scale,
222+
round(x) * self.scale
223+
for x in _rotate_bbox(
224+
c.bbox,
225+
orientation=-cells_orientation,
226+
im_size=page_im.size,
227+
)
228+
.to_top_left_origin(page_im.size[1])
229+
.as_tuple()
198230
],
199231
)
200-
for cluster in page.predictions.layout.clusters
201-
if cluster.label
202-
in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
232+
for c in in_tables_clusters
203233
]
204-
if not len(in_tables):
205-
yield page
206-
continue
207-
208-
page_input = {
209-
"width": page.size.width * self.scale,
210-
"height": page.size.height * self.scale,
211-
"image": numpy.asarray(page.get_image(scale=self.scale)),
212-
}
213-
214234
table_clusters, table_bboxes = zip(*in_tables)
215235

216236
if len(table_bboxes):
@@ -238,11 +258,16 @@ def __call__(
238258
scale=self.scale
239259
)
240260
)
261+
new_bbox = _rotate_bbox(
262+
new_cell.to_bounding_box(),
263+
orientation=-cells_orientation,
264+
im_size=scaled_page_im.size,
265+
).model_dump()
241266
tokens.append(
242267
{
243268
"id": new_cell.index,
244269
"text": new_cell.text,
245-
"bbox": new_cell.rect.to_bounding_box().model_dump(),
270+
"bbox": new_bbox,
246271
}
247272
)
248273
page_input["tokens"] = tokens
@@ -302,3 +327,11 @@ def __call__(
302327
)
303328

304329
yield page
330+
331+
332+
def _rotate_bbox(
333+
bbox: BoundingBox, *, orientation: int, im_size: Tuple[int, int]
334+
) -> BoundingBox:
335+
if orientation:
336+
return rotate_bounding_box(bbox, orientation, im_size).to_bounding_box()
337+
return bbox

docling/models/tesseract_ocr_cli_model.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
parse_tesseract_orientation,
2828
tesseract_box_to_bounding_rectangle,
2929
)
30-
from docling.utils.orientation import Box
3130
from docling.utils.profiling import TimeRecorder
3231

3332
_log = logging.getLogger(__name__)

docling/utils/ocr_utils.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Optional
1+
from typing import Optional, Tuple
22

33
from docling_core.types.doc import BoundingBox, CoordOrigin
44
from docling_core.types.doc.page import BoundingRectangle
@@ -43,7 +43,9 @@ def tesseract_box_to_bounding_rectangle(
4343
orientation: int,
4444
im_size: Tuple[int, int],
4545
) -> BoundingRectangle:
46-
# box is in the top, left, height, width format, top left coordinates
46+
# bbox is given in (top, left, height, width) format, in top-left-origin coordinates
47+
# Tesseract ran on the image rotated by minus the orientation, so the detected
48+
# box has to be rotated back by the orientation angle
4749
rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
4850
rect = BoundingRectangle(
4951
r_x0=rect.r_x0 / scale,
@@ -54,7 +56,7 @@ def tesseract_box_to_bounding_rectangle(
5456
r_y2=rect.r_y2 / scale,
5557
r_x3=rect.r_x3 / scale,
5658
r_y3=rect.r_y3 / scale,
57-
coord_origin=CoordOrigin.TOPLEFT,
59+
coord_origin=rect.coord_origin,
5860
)
5961
if original_offset is not None:
6062
if original_offset.coord_origin is not CoordOrigin.TOPLEFT:

docling/utils/orientation.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,22 @@
11
from collections import Counter
22
from operator import itemgetter
3+
from typing import Tuple
34

4-
from docling_core.types.doc.page import TextCell
5+
from docling_core.types.doc import BoundingBox, CoordOrigin
6+
from docling_core.types.doc.page import BoundingRectangle, TextCell
57

6-
_ORIENTATIONS = [0, 90, 180, 270]
8+
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
79

810

911
def _clipped_orientation(angle: float) -> int:
10-
return min((abs(angle - o) % 360, o) for o in _ORIENTATIONS)[1]
12+
return min((abs(angle - o) % 360, o) for o in CLIPPED_ORIENTATIONS)[1]
1113

1214

1315
def detect_orientation(cells: list[TextCell]) -> int:
1416
if not cells:
1517
return 0
1618
orientation_counter = Counter(_clipped_orientation(c.rect.angle_360) for c in cells)
1719
return max(orientation_counter.items(), key=itemgetter(1))[0]
18-
from typing import Tuple
19-
20-
from docling_core.types.doc import BoundingBox, CoordOrigin
21-
from docling_core.types.doc.page import BoundingRectangle
22-
23-
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
2420

2521

2622
def rotate_bounding_box(
Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
11
<document>
2-
<paragraph><location><page_1><loc_12><loc_82><loc_85><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
2+
<table>
3+
<location><page_1><loc_12><loc_39><loc_67><loc_87></location>
4+
<row_0><col_0><body></col_0><col_1><col_header>Column 0</col_1><col_2><col_header>Column 1</col_2><col_3><col_header>Column 2</col_3></row_0>
5+
<row_1><col_0><row_header>this is row 0</col_0><col_1><body>some cells</col_1><col_2><body>have content</col_2><col_3><body>and</col_3></row_1>
6+
<row_2><col_0><row_header>and row 1</col_0><col_1><body></col_1><col_2><body>other</col_2><col_3><body>have</col_3></row_2>
7+
<row_3><col_0><row_header>and last row 2</col_0><col_1><body>nothing</col_1><col_2><body></col_2><col_3><body>inside</col_3></row_3>
8+
</table>
39
</document>
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
1-
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
1+
| | Column 0 | Column 1 | Column 2 |
2+
|----------------|------------|--------------|------------|
3+
| this is row 0 | some cells | have content | and |
4+
| and row 1 | | other | have |
5+
| and last row 2 | nothing | | inside |

tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md

Lines changed: 0 additions & 1 deletion
This file was deleted.

0 commit comments

Comments
 (0)