22import json
33import logging
44import os
5- from io import BytesIO
6- from pathlib import Path
7- from typing import Dict , List , Optional , Set
5+ from typing import Dict , Optional , Set , Tuple
86
97import boto3
108from docling .datamodel .base_models import ConversionStatus
11- from docling_core .types import DoclingDocument
129from docling_core .types .doc .base import BoundingBox , CoordOrigin , Size
1310from docling_core .types .doc .document import (
1411 DoclingDocument ,
1916 TableData ,
2017)
2118from docling_core .types .doc .labels import DocItemLabel
19+ from docling_core .types .doc .page import (
20+ BoundingRectangle ,
21+ PageGeometry ,
22+ SegmentedPage ,
23+ TextCell ,
24+ )
2225from docling_core .types .io import DocumentStream
23- from PIL import Image
26+ from skimage . draw import polygon
2427
2528from docling_eval .datamodels .dataset_record import (
2629 DatasetRecord ,
@@ -186,13 +189,14 @@ def process_table(self, table, blocks_map, page_no):
186189
187190 def convert_aws_output_to_docling (
188191 self , analyze_result , record : DatasetRecord , file_bytes
189- ) -> DoclingDocument :
192+ ) -> Tuple [ DoclingDocument , Dict [ int , SegmentedPage ]] :
190193 """Converts AWS Textract output to DoclingDocument format."""
191194 doc = DoclingDocument (name = record .doc_id )
192195
193196 blocks_map = {block ["Id" ]: block for block in analyze_result .get ("Blocks" , [])}
194197
195198 processed_pages = set ()
199+ segmented_pages : Dict [int , SegmentedPage ] = {}
196200
197201 # Get page dimensions from page block
198202 # AWS provides normalized coordinates, so we need to multiply by a typical page size
@@ -201,7 +205,6 @@ def convert_aws_output_to_docling(
201205 im = record .ground_truth_page_images [0 ]
202206 width , height = im .size
203207
204- # TODO: Can we get more detail than just "Text blocks" from AWS Textract? If they provide layout labels, let's use it here.
205208 for block in analyze_result .get ("Blocks" , []):
206209 if block ["BlockType" ] == "PAGE" :
207210 page_no = int (block .get ("Page" , 1 ))
@@ -223,33 +226,276 @@ def convert_aws_output_to_docling(
223226
224227 doc .pages [page_no ] = page_item
225228
226- if block ["BlockType" ] == "WORD" and block .get ("Page" , 1 ) == page_no :
229+ # Create SegmentedPage Entry if not already present for the page number
230+ if page_no not in segmented_pages .keys ():
231+ seg_page = SegmentedPage (
232+ dimension = PageGeometry (
233+ angle = 0 ,
234+ rect = BoundingRectangle .from_bounding_box (
235+ BoundingBox (
236+ l = 0 ,
237+ t = 0 ,
238+ r = page_item .size .width ,
239+ b = page_item .size .height ,
240+ )
241+ ),
242+ )
243+ )
244+ segmented_pages [page_no ] = seg_page
245+
246+ elif block ["BlockType" ] == "WORD" and block .get ("Page" , 1 ) == page_no :
247+ text_content = block .get ("Text" , None )
248+ geometry = block .get ("Geometry" , None )
249+
250+ if text_content is not None and geometry is not None :
251+ bbox = self .extract_bbox_from_geometry (geometry )
252+ # Scale normalized coordinates to the page dimensions
253+ bbox_obj = BoundingBox (
254+ l = bbox ["l" ] * width ,
255+ t = bbox ["t" ] * height ,
256+ r = bbox ["r" ] * width ,
257+ b = bbox ["b" ] * height ,
258+ coord_origin = CoordOrigin .TOPLEFT ,
259+ )
260+
261+ segmented_pages [page_no ].word_cells .append (
262+ TextCell (
263+ rect = BoundingRectangle .from_bounding_box (bbox_obj ),
264+ text = text_content ,
265+ orig = text_content ,
266+ # Keeping from_ocr flag False since AWS output doesn't indicate whether the given word is programmatic or OCR
267+ from_ocr = False ,
268+ )
269+ )
270+
271+ elif block ["BlockType" ] == "LAYOUT_TITLE" :
227272 text_content = block .get ("Text" , "" )
228- bbox = self .extract_bbox_from_geometry (block .get ("Geometry" , {}))
229-
230- # Scale normalized coordinates to the page dimensions
231- bbox_obj = BoundingBox (
232- l = bbox ["l" ] * width ,
233- t = bbox ["t" ] * height ,
234- r = bbox ["r" ] * width ,
235- b = bbox ["b" ] * height ,
236- coord_origin = CoordOrigin .TOPLEFT ,
237- )
273+ self ._add_title (block , doc , height , page_no , text_content , width )
238274
239- prov = ProvenanceItem (
240- page_no = page_no ,
241- bbox = bbox_obj ,
242- charspan = (0 , len (text_content )),
243- )
275+ elif block ["BlockType" ] == "LAYOUT_HEADER" :
276+ self ._add_page_header (block , doc , height , page_no , width )
277+
278+ elif block ["BlockType" ] == "LAYOUT_FOOTER" :
279+ self ._add_page_footer (block , doc , height , page_no , width )
280+
281+ elif block ["BlockType" ] == "LAYOUT_SECTION_HEADER" :
282+ self ._add_heading (block , doc , height , page_no , width )
283+
284+ elif block ["BlockType" ] == "LAYOUT_PAGE_NUMBER" :
285+ self ._add_page_number (block , doc , height , page_no , width )
286+
287+ elif block ["BlockType" ] == "LAYOUT_LIST" :
288+ self ._add_list (block , doc , height , page_no , width )
289+
290+ elif block ["BlockType" ] == "LAYOUT_FIGURE" :
291+ self ._add_figure (block , doc , height , page_no , width )
292+
293+ elif block ["BlockType" ] == "LAYOUT_KEY_VALUE" :
294+ self ._add_key_value (block , doc , height , page_no , width )
244295
245- doc .add_text (label = DocItemLabel .TEXT , text = text_content , prov = prov )
296+ # This condition is to add only the layout of the table as predicted, doesn't contain the cell structure
297+ elif block ["BlockType" ] == "LAYOUT_TABLE" :
298+ self ._add_table_layout (block , doc , height , page_no , width )
246299
247- if block ["BlockType" ] == "TABLE" :
300+ elif block ["BlockType" ] == "LAYOUT_TEXT" :
301+ self ._add_text (block , doc , height , page_no , width )
302+
303+ # This condition is to add output from actual tables API which adds detailed table
304+ elif block ["BlockType" ] == "TABLE" :
248305 page_no = int (block .get ("Page" , 1 ))
249306 table_prov , table_data = self .process_table (block , blocks_map , page_no )
250307 doc .add_table (prov = table_prov , data = table_data , caption = None )
251308
252- return doc
309+ return doc , segmented_pages
310+
def _add_text(self, block, doc, height, page_no, width) -> None:
    """Add an AWS layout text block to ``doc`` as a Docling TEXT item.

    Args:
        block: AWS Textract block dict; its "Text" and "Geometry" entries are read.
        doc: Document that receives the item via ``doc.add_text``.
        height: Page height used to scale the normalized vertical coordinates.
        page_no: Page number recorded in the provenance.
        width: Page width used to scale the normalized horizontal coordinates.
    """
    text_content = block.get("Text", "")
    prov = self._build_prov(block, height, page_no, width, text_content)
    doc.add_text(label=DocItemLabel.TEXT, text=text_content, prov=prov)


def _add_table_layout(self, block, doc, height, page_no, width) -> None:
    """Add an AWS table layout block to ``doc`` as an empty Docling table.

    Only the table's location is recorded: the table is created with an
    empty ``TableData()``, so no cell structure is attached here.
    """
    text_content = block.get("Text", "")
    prov = self._build_prov(block, height, page_no, width, text_content)
    doc.add_table(data=TableData(), prov=prov)


def _add_key_value(self, block, doc, height, page_no, width) -> None:
    """Add an AWS Key-Value layout block to ``doc`` as a Docling TEXT item."""
    text_content = block.get("Text", "")
    prov = self._build_prov(block, height, page_no, width, text_content)
    doc.add_text(label=DocItemLabel.TEXT, text=text_content, prov=prov)


def _add_figure(self, block, doc, height, page_no, width) -> None:
    """Add an AWS figure layout block to ``doc`` as a Docling picture."""
    text_content = block.get("Text", "")
    prov = self._build_prov(block, height, page_no, width, text_content)
    doc.add_picture(prov=prov)


def _add_list(self, block, doc, height, page_no, width) -> None:
    """Add an AWS list layout block to ``doc`` as a single Docling list item."""
    text_content = block.get("Text", "")
    prov = self._build_prov(block, height, page_no, width, text_content)
    doc.add_list_item(text=text_content, prov=prov)


def _add_page_number(self, block, doc, height, page_no, width) -> None:
    """Add an AWS page-number layout block to ``doc`` as a Docling TEXT item."""
    text_content = block.get("Text", "")
    prov = self._build_prov(block, height, page_no, width, text_content)
    doc.add_text(label=DocItemLabel.TEXT, text=text_content, prov=prov)


def _add_heading(self, block, doc, height, page_no, width) -> None:
    """Add an AWS section-header layout block to ``doc`` via ``doc.add_heading``."""
    text_content = block.get("Text", "")
    prov = self._build_prov(block, height, page_no, width, text_content)
    doc.add_heading(text=text_content, prov=prov)


def _add_page_footer(self, block, doc, height, page_no, width) -> None:
    """Add an AWS page-footer layout block to ``doc`` as a PAGE_FOOTER text item."""
    text_content = block.get("Text", "")
    prov = self._build_prov(block, height, page_no, width, text_content)
    doc.add_text(label=DocItemLabel.PAGE_FOOTER, text=text_content, prov=prov)


def _add_page_header(self, block, doc, height, page_no, width) -> None:
    """Add an AWS page-header layout block to ``doc`` as a PAGE_HEADER text item."""
    text_content = block.get("Text", "")
    prov = self._build_prov(block, height, page_no, width, text_content)
    doc.add_text(label=DocItemLabel.PAGE_HEADER, text=text_content, prov=prov)


def _add_title(self, block, doc, height, page_no, text_content, width) -> None:
    """Add an AWS title layout block to ``doc`` via ``doc.add_title``.

    Unlike the sibling helpers, the text is supplied by the caller through
    ``text_content`` instead of being read from ``block`` here.
    """
    prov = self._build_prov(block, height, page_no, width, text_content)
    doc.add_title(text=text_content, prov=prov)


def _build_prov(self, block, height, page_no, width, text_content):
    """Build the ProvenanceItem shared by all ``_add_*`` helpers.

    Args:
        block: AWS Textract block dict; only its "Geometry" entry is read
            (an empty dict is passed on when it is missing).
        height: Page height multiplied into the "t" and "b" coordinates.
        page_no: Page number stored in the provenance.
        width: Page width multiplied into the "l" and "r" coordinates.
        text_content: Text of the item; its length sets the char span end.

    Returns:
        A ProvenanceItem with a top-left-origin bounding box and a char
        span of ``(0, len(text_content))``.
    """
    # NOTE(review): the callers read block.get("Text", ""); Textract layout
    # blocks may expose their text only through child LINE blocks, in which
    # case text_content is always "" -- confirm against the Block reference.
    bbox = self.extract_bbox_from_geometry(block.get("Geometry", {}))
    # Scale normalized coordinates to the page dimensions
    scaled_bbox = BoundingBox(
        l=bbox["l"] * width,
        t=bbox["t"] * height,
        r=bbox["r"] * width,
        b=bbox["b"] * height,
        coord_origin=CoordOrigin.TOPLEFT,
    )
    return ProvenanceItem(
        page_no=page_no,
        bbox=scaled_bbox,
        charspan=(0, len(text_content)),
    )
253499
254500 @property
255501 def prediction_format (self ) -> PredictionFormats :
@@ -273,15 +519,16 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
273519
274520 file_bytes = record .original .stream .read ()
275521 response = self .textract_client .analyze_document (
276- Document = {"Bytes" : file_bytes }, FeatureTypes = ["TABLES" , "FORMS" ]
522+ Document = {"Bytes" : file_bytes },
523+ FeatureTypes = ["TABLES" , "FORMS" , "LAYOUT" ],
277524 )
278525 result_orig = json .dumps (response , default = str )
279526 result_json = json .loads (result_orig )
280527 _log .info (
281528 f"Successfully processed [{ record .doc_id } ] using AWS Textract API!"
282529 )
283530
284- pred_doc = self .convert_aws_output_to_docling (
531+ pred_doc , pred_segmented_pages = self .convert_aws_output_to_docling (
285532 result_json , record , file_bytes
286533 )
287534 else :
@@ -300,6 +547,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
300547 pred_record = self .create_dataset_record_with_prediction (
301548 record , pred_doc , result_orig
302549 )
550+ pred_record .predicted_segmented_pages = pred_segmented_pages
303551 pred_record .status = status
304552 return pred_record
305553
0 commit comments