Skip to content

Commit d43af71

Browse files
divekarsccau-gitSamved Divekar
authored
Add layout information to AWS to Docling Predictor (#77)
* Add README for Docling-DPBench Signed-off-by: Christoph Auer <[email protected]> * Draft commit for adding layout information to AWS Predictor Signed-off-by: Samved Divekar <[email protected]> * Draft commit for adding layout information to AWS Predictor Signed-off-by: Samved Divekar <[email protected]> * Add layout predictor to response object, add tests Signed-off-by: Samved Divekar <[email protected]> * Map Key-Value regions to TEXT, add call to download GT in tests Signed-off-by: Samved Divekar <[email protected]> * Add tests for reading order on AWS predictions Signed-off-by: Samved Divekar <[email protected]> * Add tests for reading order on AWS predictions, fix directory structure Signed-off-by: Samved Divekar <[email protected]> * Add tests for markdown for AWS, add missing label Signed-off-by: Samved Divekar <[email protected]> * fix formatting Signed-off-by: Samved Divekar <[email protected]> * Minor refactoring to resolve review comments Signed-off-by: Samved Divekar <[email protected]> * Establish SegmentedPage support in DatasetRecord and DatasetRecordWithPrediction Signed-off-by: Christoph Auer <[email protected]> * Add SegmentedPage usage to PixParse dataset provider Signed-off-by: Christoph Auer <[email protected]> * Use segmented page for populating word level information, remove words from DoclingDocument Signed-off-by: Samved Divekar <[email protected]> * Fix isort failure Signed-off-by: Samved Divekar <[email protected]> * Fix precommit errors Signed-off-by: Samved Divekar <[email protected]> * Do not add a word if text or bbox is missing Signed-off-by: Samved Divekar <[email protected]> --------- Signed-off-by: Christoph Auer <[email protected]> Signed-off-by: Samved Divekar <[email protected]> Co-authored-by: Christoph Auer <[email protected]> Co-authored-by: Samved Divekar <[email protected]>
1 parent be0ff6a commit d43af71

File tree

4 files changed

+583
-28
lines changed

4 files changed

+583
-28
lines changed

docling_eval/prediction_providers/aws_prediction_provider.py

Lines changed: 276 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,10 @@
22
import json
33
import logging
44
import os
5-
from io import BytesIO
6-
from pathlib import Path
7-
from typing import Dict, List, Optional, Set
5+
from typing import Dict, Optional, Set, Tuple
86

97
import boto3
108
from docling.datamodel.base_models import ConversionStatus
11-
from docling_core.types import DoclingDocument
129
from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
1310
from docling_core.types.doc.document import (
1411
DoclingDocument,
@@ -19,8 +16,14 @@
1916
TableData,
2017
)
2118
from docling_core.types.doc.labels import DocItemLabel
19+
from docling_core.types.doc.page import (
20+
BoundingRectangle,
21+
PageGeometry,
22+
SegmentedPage,
23+
TextCell,
24+
)
2225
from docling_core.types.io import DocumentStream
23-
from PIL import Image
26+
from skimage.draw import polygon
2427

2528
from docling_eval.datamodels.dataset_record import (
2629
DatasetRecord,
@@ -186,13 +189,14 @@ def process_table(self, table, blocks_map, page_no):
186189

187190
def convert_aws_output_to_docling(
188191
self, analyze_result, record: DatasetRecord, file_bytes
189-
) -> DoclingDocument:
192+
) -> Tuple[DoclingDocument, Dict[int, SegmentedPage]]:
190193
"""Converts AWS Textract output to DoclingDocument format."""
191194
doc = DoclingDocument(name=record.doc_id)
192195

193196
blocks_map = {block["Id"]: block for block in analyze_result.get("Blocks", [])}
194197

195198
processed_pages = set()
199+
segmented_pages: Dict[int, SegmentedPage] = {}
196200

197201
# Get page dimensions from page block
198202
# AWS provides normalized coordinates, so we need to multiply by a typical page size
@@ -201,7 +205,6 @@ def convert_aws_output_to_docling(
201205
im = record.ground_truth_page_images[0]
202206
width, height = im.size
203207

204-
# TODO: Can we get more detail than just "Text blocks" from AWS Textract? If they provide layout labels, let's use it here.
205208
for block in analyze_result.get("Blocks", []):
206209
if block["BlockType"] == "PAGE":
207210
page_no = int(block.get("Page", 1))
@@ -223,33 +226,276 @@ def convert_aws_output_to_docling(
223226

224227
doc.pages[page_no] = page_item
225228

226-
if block["BlockType"] == "WORD" and block.get("Page", 1) == page_no:
229+
# Create SegmentedPage Entry if not already present for the page number
230+
if page_no not in segmented_pages.keys():
231+
seg_page = SegmentedPage(
232+
dimension=PageGeometry(
233+
angle=0,
234+
rect=BoundingRectangle.from_bounding_box(
235+
BoundingBox(
236+
l=0,
237+
t=0,
238+
r=page_item.size.width,
239+
b=page_item.size.height,
240+
)
241+
),
242+
)
243+
)
244+
segmented_pages[page_no] = seg_page
245+
246+
elif block["BlockType"] == "WORD" and block.get("Page", 1) == page_no:
247+
text_content = block.get("Text", None)
248+
geometry = block.get("Geometry", None)
249+
250+
if text_content is not None and geometry is not None:
251+
bbox = self.extract_bbox_from_geometry(geometry)
252+
# Scale normalized coordinates to the page dimensions
253+
bbox_obj = BoundingBox(
254+
l=bbox["l"] * width,
255+
t=bbox["t"] * height,
256+
r=bbox["r"] * width,
257+
b=bbox["b"] * height,
258+
coord_origin=CoordOrigin.TOPLEFT,
259+
)
260+
261+
segmented_pages[page_no].word_cells.append(
262+
TextCell(
263+
rect=BoundingRectangle.from_bounding_box(bbox_obj),
264+
text=text_content,
265+
orig=text_content,
266+
# Keeping from_ocr flag False since AWS output doesn't indicate whether the given word is programmatic or OCR
267+
from_ocr=False,
268+
)
269+
)
270+
271+
elif block["BlockType"] == "LAYOUT_TITLE":
227272
text_content = block.get("Text", "")
228-
bbox = self.extract_bbox_from_geometry(block.get("Geometry", {}))
229-
230-
# Scale normalized coordinates to the page dimensions
231-
bbox_obj = BoundingBox(
232-
l=bbox["l"] * width,
233-
t=bbox["t"] * height,
234-
r=bbox["r"] * width,
235-
b=bbox["b"] * height,
236-
coord_origin=CoordOrigin.TOPLEFT,
237-
)
273+
self._add_title(block, doc, height, page_no, text_content, width)
238274

239-
prov = ProvenanceItem(
240-
page_no=page_no,
241-
bbox=bbox_obj,
242-
charspan=(0, len(text_content)),
243-
)
275+
elif block["BlockType"] == "LAYOUT_HEADER":
276+
self._add_page_header(block, doc, height, page_no, width)
277+
278+
elif block["BlockType"] == "LAYOUT_FOOTER":
279+
self._add_page_footer(block, doc, height, page_no, width)
280+
281+
elif block["BlockType"] == "LAYOUT_SECTION_HEADER":
282+
self._add_heading(block, doc, height, page_no, width)
283+
284+
elif block["BlockType"] == "LAYOUT_PAGE_NUMBER":
285+
self._add_page_number(block, doc, height, page_no, width)
286+
287+
elif block["BlockType"] == "LAYOUT_LIST":
288+
self._add_list(block, doc, height, page_no, width)
289+
290+
elif block["BlockType"] == "LAYOUT_FIGURE":
291+
self._add_figure(block, doc, height, page_no, width)
292+
293+
elif block["BlockType"] == "LAYOUT_KEY_VALUE":
294+
self._add_key_value(block, doc, height, page_no, width)
244295

245-
doc.add_text(label=DocItemLabel.TEXT, text=text_content, prov=prov)
296+
# This condition is to add only the layout of the table as predicted, doesn't contain the cell structure
297+
elif block["BlockType"] == "LAYOUT_TABLE":
298+
self._add_table_layout(block, doc, height, page_no, width)
246299

247-
if block["BlockType"] == "TABLE":
300+
elif block["BlockType"] == "LAYOUT_TEXT":
301+
self._add_text(block, doc, height, page_no, width)
302+
303+
# This condition is to add output from actual tables API which adds detailed table
304+
elif block["BlockType"] == "TABLE":
248305
page_no = int(block.get("Page", 1))
249306
table_prov, table_data = self.process_table(block, blocks_map, page_no)
250307
doc.add_table(prov=table_prov, data=table_data, caption=None)
251308

252-
return doc
309+
return doc, segmented_pages
310+
311+
def _add_text(self, block, doc, height, page_no, width):
    """Add an AWS text layout block to ``doc`` as a Docling TEXT item.

    Used for ``LAYOUT_TEXT`` blocks. The block geometry is read through
    ``extract_bbox_from_geometry`` and its normalized edges are multiplied by
    ``width`` / ``height`` to build a top-left-origin bounding box for the
    provenance on page ``page_no``. A block without a "Text" entry
    contributes an empty string.
    """
    content = block.get("Text", "")
    norm = self.extract_bbox_from_geometry(block.get("Geometry", {}))

    # Stretch the normalized edges to the page dimensions.
    left = norm["l"] * width
    top = norm["t"] * height
    right = norm["r"] * width
    bottom = norm["b"] * height
    region = BoundingBox(
        l=left,
        t=top,
        r=right,
        b=bottom,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    provenance = ProvenanceItem(
        page_no=page_no,
        bbox=region,
        charspan=(0, len(content)),
    )
    doc.add_text(label=DocItemLabel.TEXT, text=content, prov=provenance)
329+
330+
def _add_table_layout(self, block, doc, height, page_no, width):
    """Add an AWS table layout block to ``doc`` as an empty Docling table.

    Used for ``LAYOUT_TABLE`` blocks. Only the table's location is recorded:
    the table is added with a default ``TableData()`` and a provenance whose
    bounding box is the block geometry scaled by ``width`` / ``height``
    (top-left origin) on page ``page_no``.
    """
    content = block.get("Text", "")
    norm = self.extract_bbox_from_geometry(block.get("Geometry", {}))

    # Normalized edges -> page dimensions, in l, t, r, b order.
    left, top, right, bottom = (
        norm["l"] * width,
        norm["t"] * height,
        norm["r"] * width,
        norm["b"] * height,
    )
    region = BoundingBox(
        l=left,
        t=top,
        r=right,
        b=bottom,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    # The charspan is derived from the block's "Text" value even though the
    # table itself carries no text.
    provenance = ProvenanceItem(
        page_no=page_no,
        bbox=region,
        charspan=(0, len(content)),
    )
    doc.add_table(data=TableData(), prov=provenance)
348+
349+
def _add_key_value(self, block, doc, height, page_no, width):
    """Add an AWS key-value layout block to ``doc`` as a Docling TEXT item.

    Used for ``LAYOUT_KEY_VALUE`` blocks, which are stored with the generic
    ``DocItemLabel.TEXT`` label. The provenance bounding box is the block
    geometry scaled by ``width`` / ``height`` (top-left origin) on page
    ``page_no``; a missing "Text" entry yields an empty string.
    """
    content = block.get("Text", "")
    norm = self.extract_bbox_from_geometry(block.get("Geometry", {}))

    # Stretch the normalized edges to the page dimensions.
    left = norm["l"] * width
    top = norm["t"] * height
    right = norm["r"] * width
    bottom = norm["b"] * height
    region = BoundingBox(
        l=left,
        t=top,
        r=right,
        b=bottom,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    provenance = ProvenanceItem(
        page_no=page_no,
        bbox=region,
        charspan=(0, len(content)),
    )
    doc.add_text(label=DocItemLabel.TEXT, text=content, prov=provenance)
367+
368+
def _add_figure(self, block, doc, height, page_no, width):
    """Add an AWS figure layout block to ``doc`` as a Docling picture.

    Used for ``LAYOUT_FIGURE`` blocks. The picture is added with only a
    provenance: a top-left-origin bounding box obtained by scaling the block
    geometry by ``width`` / ``height``, on page ``page_no``.
    """
    content = block.get("Text", "")
    norm = self.extract_bbox_from_geometry(block.get("Geometry", {}))

    # Normalized edges -> page dimensions, in l, t, r, b order.
    left, top, right, bottom = (
        norm["l"] * width,
        norm["t"] * height,
        norm["r"] * width,
        norm["b"] * height,
    )
    region = BoundingBox(
        l=left,
        t=top,
        r=right,
        b=bottom,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    # The block's "Text" value is used solely for the charspan length.
    provenance = ProvenanceItem(
        page_no=page_no,
        bbox=region,
        charspan=(0, len(content)),
    )
    doc.add_picture(prov=provenance)
386+
387+
def _add_list(self, block, doc, height, page_no, width):
    """Add an AWS list layout block to ``doc`` as a Docling list item.

    Used for ``LAYOUT_LIST`` blocks. The whole block becomes one list item
    whose text is the block's "Text" value (empty string when absent) and
    whose provenance box is the block geometry scaled by ``width`` /
    ``height`` (top-left origin) on page ``page_no``.
    """
    content = block.get("Text", "")
    norm = self.extract_bbox_from_geometry(block.get("Geometry", {}))

    # Stretch the normalized edges to the page dimensions.
    left = norm["l"] * width
    top = norm["t"] * height
    right = norm["r"] * width
    bottom = norm["b"] * height
    region = BoundingBox(
        l=left,
        t=top,
        r=right,
        b=bottom,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    provenance = ProvenanceItem(
        page_no=page_no,
        bbox=region,
        charspan=(0, len(content)),
    )
    doc.add_list_item(text=content, prov=provenance)
405+
406+
def _add_page_number(self, block, doc, height, page_no, width):
    """Add an AWS page-number layout block to ``doc`` as a Docling TEXT item.

    Used for ``LAYOUT_PAGE_NUMBER`` blocks, which are stored with the generic
    ``DocItemLabel.TEXT`` label. The provenance bounding box is the block
    geometry scaled by ``width`` / ``height`` (top-left origin) on page
    ``page_no``; a missing "Text" entry yields an empty string.
    """
    content = block.get("Text", "")
    norm = self.extract_bbox_from_geometry(block.get("Geometry", {}))

    # Normalized edges -> page dimensions, in l, t, r, b order.
    left, top, right, bottom = (
        norm["l"] * width,
        norm["t"] * height,
        norm["r"] * width,
        norm["b"] * height,
    )
    region = BoundingBox(
        l=left,
        t=top,
        r=right,
        b=bottom,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    provenance = ProvenanceItem(
        page_no=page_no,
        bbox=region,
        charspan=(0, len(content)),
    )
    doc.add_text(label=DocItemLabel.TEXT, text=content, prov=provenance)
424+
425+
def _add_heading(self, block, doc, height, page_no, width):
    """Add an AWS section-header layout block to ``doc`` as a Docling heading.

    Used for ``LAYOUT_SECTION_HEADER`` blocks. The heading text is the
    block's "Text" value (empty string when absent); its provenance box is
    the block geometry scaled by ``width`` / ``height`` (top-left origin) on
    page ``page_no``.
    """
    content = block.get("Text", "")
    norm = self.extract_bbox_from_geometry(block.get("Geometry", {}))

    # Stretch the normalized edges to the page dimensions.
    left = norm["l"] * width
    top = norm["t"] * height
    right = norm["r"] * width
    bottom = norm["b"] * height
    region = BoundingBox(
        l=left,
        t=top,
        r=right,
        b=bottom,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    provenance = ProvenanceItem(
        page_no=page_no,
        bbox=region,
        charspan=(0, len(content)),
    )
    doc.add_heading(text=content, prov=provenance)
443+
444+
def _add_page_footer(self, block, doc, height, page_no, width):
    """Add an AWS footer layout block to ``doc`` as a Docling page footer.

    Used for ``LAYOUT_FOOTER`` blocks. The item is added as text labelled
    ``DocItemLabel.PAGE_FOOTER``; its provenance box is the block geometry
    scaled by ``width`` / ``height`` (top-left origin) on page ``page_no``.
    A missing "Text" entry yields an empty string.
    """
    content = block.get("Text", "")
    norm = self.extract_bbox_from_geometry(block.get("Geometry", {}))

    # Normalized edges -> page dimensions, in l, t, r, b order.
    left, top, right, bottom = (
        norm["l"] * width,
        norm["t"] * height,
        norm["r"] * width,
        norm["b"] * height,
    )
    region = BoundingBox(
        l=left,
        t=top,
        r=right,
        b=bottom,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    provenance = ProvenanceItem(
        page_no=page_no,
        bbox=region,
        charspan=(0, len(content)),
    )
    doc.add_text(label=DocItemLabel.PAGE_FOOTER, text=content, prov=provenance)
462+
463+
def _add_page_header(self, block, doc, height, page_no, width):
    """Add an AWS header layout block to ``doc`` as a Docling page header.

    Used for ``LAYOUT_HEADER`` blocks. The item is added as text labelled
    ``DocItemLabel.PAGE_HEADER``; its provenance box is the block geometry
    scaled by ``width`` / ``height`` (top-left origin) on page ``page_no``.
    A missing "Text" entry yields an empty string.
    """
    content = block.get("Text", "")
    norm = self.extract_bbox_from_geometry(block.get("Geometry", {}))

    # Stretch the normalized edges to the page dimensions.
    left = norm["l"] * width
    top = norm["t"] * height
    right = norm["r"] * width
    bottom = norm["b"] * height
    region = BoundingBox(
        l=left,
        t=top,
        r=right,
        b=bottom,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    provenance = ProvenanceItem(
        page_no=page_no,
        bbox=region,
        charspan=(0, len(content)),
    )
    doc.add_text(label=DocItemLabel.PAGE_HEADER, text=content, prov=provenance)
481+
482+
def _add_title(self, block, doc, height, page_no, text_content, width):
    """Add an AWS title layout block to ``doc`` as a Docling title.

    Used for ``LAYOUT_TITLE`` blocks. Unlike the sibling ``_add_*`` helpers,
    the text is supplied by the caller through ``text_content`` rather than
    read from ``block``. The provenance box is the block geometry scaled by
    ``width`` / ``height`` (top-left origin) on page ``page_no``.
    """
    norm = self.extract_bbox_from_geometry(block.get("Geometry", {}))

    # Normalized edges -> page dimensions, in l, t, r, b order.
    left, top, right, bottom = (
        norm["l"] * width,
        norm["t"] * height,
        norm["r"] * width,
        norm["b"] * height,
    )
    region = BoundingBox(
        l=left,
        t=top,
        r=right,
        b=bottom,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    provenance = ProvenanceItem(
        page_no=page_no,
        bbox=region,
        charspan=(0, len(text_content)),
    )
    doc.add_title(text=text_content, prov=provenance)
253499

254500
@property
255501
def prediction_format(self) -> PredictionFormats:
@@ -273,15 +519,16 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
273519

274520
file_bytes = record.original.stream.read()
275521
response = self.textract_client.analyze_document(
276-
Document={"Bytes": file_bytes}, FeatureTypes=["TABLES", "FORMS"]
522+
Document={"Bytes": file_bytes},
523+
FeatureTypes=["TABLES", "FORMS", "LAYOUT"],
277524
)
278525
result_orig = json.dumps(response, default=str)
279526
result_json = json.loads(result_orig)
280527
_log.info(
281528
f"Successfully processed [{record.doc_id}] using AWS Textract API!"
282529
)
283530

284-
pred_doc = self.convert_aws_output_to_docling(
531+
pred_doc, pred_segmented_pages = self.convert_aws_output_to_docling(
285532
result_json, record, file_bytes
286533
)
287534
else:
@@ -300,6 +547,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
300547
pred_record = self.create_dataset_record_with_prediction(
301548
record, pred_doc, result_orig
302549
)
550+
pred_record.predicted_segmented_pages = pred_segmented_pages
303551
pred_record.status = status
304552
return pred_record
305553

0 commit comments

Comments
 (0)