33import logging
44import os
55from io import BytesIO
6- from pathlib import Path
76from typing import Dict , Optional , Set , Tuple
87
8+ from azure .ai .documentintelligence import DocumentIntelligenceClient
9+ from azure .ai .documentintelligence .models import AnalyzeOutputOption
910from docling .datamodel .base_models import ConversionStatus
1011
1112# from docling_core.types import DoclingDocument
1920 TableCell ,
2021 TableData ,
2122)
23+ from docling_core .types .doc .page import (
24+ BoundingRectangle ,
25+ PageGeometry ,
26+ SegmentedPage ,
27+ TextCell ,
28+ )
2229from docling_core .types .io import DocumentStream
2330
2431from docling_eval .datamodels .dataset_record import (
@@ -64,13 +71,11 @@ def __init__(
6471 # TODO - Need a temp directory to save Azure outputs
6572 # Validate the required library
6673 try :
67- from azure .ai .formrecognizer import ( # type: ignore
68- AnalysisFeature ,
69- DocumentAnalysisClient ,
70- )
7174 from azure .core .credentials import AzureKeyCredential # type: ignore
7275 except ImportError :
73- raise ImportError ("azure-ai-formrecognizer library is not installed.." )
76+ raise ImportError (
77+ "azure-ai-documentintelligence library is not installed.."
78+ )
7479
7580 # Validate the required endpoints to call the API
7681 endpoint = os .getenv ("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT" )
@@ -81,7 +86,7 @@ def __init__(
8186 "AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and AZURE_DOCUMENT_INTELLIGENCE_KEY must be set in environment variables."
8287 )
8388
84- self .doc_intelligence_client = DocumentAnalysisClient (
89+ self .doc_intelligence_client = DocumentIntelligenceClient (
8590 endpoint , AzureKeyCredential (key )
8691 )
8792
@@ -108,9 +113,10 @@ def extract_bbox_from_polygon(self, polygon):
108113
109114 def convert_azure_output_to_docling (
110115 self , analyze_result , record : DatasetRecord
111- ) -> DoclingDocument :
116+ ) -> Tuple [ DoclingDocument , Dict [ int , SegmentedPage ]] :
112117 """Converts Azure Document Intelligence output to DoclingDocument format."""
113118 doc = DoclingDocument (name = record .doc_id )
119+ segmented_pages : Dict [int , SegmentedPage ] = {}
114120
115121 for page in analyze_result .get ("pages" , []):
116122 page_no = page .get ("page_number" , 1 )
@@ -134,32 +140,118 @@ def convert_azure_output_to_docling(
134140 )
135141 doc .pages [page_no ] = page_item
136142
143+ if page_no not in segmented_pages .keys ():
144+ seg_page = SegmentedPage (
145+ dimension = PageGeometry (
146+ angle = 0 ,
147+ rect = BoundingRectangle .from_bounding_box (
148+ BoundingBox (
149+ l = 0 ,
150+ t = 0 ,
151+ r = page_item .size .width ,
152+ b = page_item .size .height ,
153+ )
154+ ),
155+ )
156+ )
157+ segmented_pages [page_no ] = seg_page
158+
137159 for word in page .get ("words" , []):
138- polygon = word .get ("polygon" , [])
139- bbox = self .extract_bbox_from_polygon (polygon )
160+ polygon = word .get ("polygon" , None )
161+ text_content = word .get ("content" , None )
162+
163+ if text_content is not None and polygon is not None :
164+ bbox = self .extract_bbox_from_polygon (polygon )
165+ bbox_obj = BoundingBox (
166+ l = bbox ["l" ],
167+ t = bbox ["t" ],
168+ r = bbox ["r" ],
169+ b = bbox ["b" ],
170+ coord_origin = CoordOrigin .TOPLEFT ,
171+ )
140172
141- text_content = word .get ("content" , "" )
173+ segmented_pages [page_no ].word_cells .append (
174+ TextCell (
175+ rect = BoundingRectangle .from_bounding_box (bbox_obj ),
176+ text = text_content ,
177+ orig = text_content ,
178+ # Keeping from_ocr flag False since Azure output doesn't indicate whether the given word is programmatic or OCR
179+ from_ocr = False ,
180+ )
181+ )
142182
143- bbox_obj = BoundingBox (
144- l = bbox ["l" ],
145- t = bbox ["t" ],
146- r = bbox ["r" ],
147- b = bbox ["b" ],
148- coord_origin = CoordOrigin .TOPLEFT ,
149- )
183+ # Iterate over tables in the response and add to DoclingDocument
184+ self ._add_tables (analyze_result , doc )
150185
151- prov = ProvenanceItem (
152- page_no = page_no , bbox = bbox_obj , charspan = (0 , len (text_content ))
153- )
187+ # Iterate over paragraphs in the response and populate items such as section headings, page headers and page footers based on the "role" field
187+ self ._handle_paragraphs_based_on_roles (analyze_result , doc )
188+
189+ # Iterate over figures and add them as pictures in DoclingDocument
190+ self ._add_figures (analyze_result , doc )
154191
155- # TODO: This needs to be developed further. Azure responses contain full-page document information,
156- # with text and layout features,
157- # see https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/prebuilt/layout
158- # This code only adds the primitive text content, without
159- # layout labels or reading order, then all tables separately. This will work for plain
160- # table datasets only.
192+ return doc , segmented_pages
193+
194+ def _add_figures (self , analyze_result , doc ):
195+ for figure in analyze_result .get ("figures" , []):
196+ bounding_regions = figure ["boundingRegions" ][0 ]
197+ page_no = bounding_regions ["pageNumber" ]
198+ polygon = bounding_regions .get ("polygon" , [])
199+ bbox = self .extract_bbox_from_polygon (polygon )
200+
201+ bbox_obj = BoundingBox (
202+ l = bbox ["l" ],
203+ t = bbox ["t" ],
204+ r = bbox ["r" ],
205+ b = bbox ["b" ],
206+ coord_origin = CoordOrigin .TOPLEFT ,
207+ )
208+
209+ prov = ProvenanceItem (page_no = page_no , bbox = bbox_obj , charspan = (0 , 0 ))
210+ doc .add_picture (prov = prov )
211+
212+ def _handle_paragraphs_based_on_roles (self , analyze_result , doc ):
213+ for paragraph in analyze_result .get ("paragraphs" , []):
214+ bounding_regions = paragraph ["boundingRegions" ][0 ]
215+ page_no = bounding_regions ["pageNumber" ]
216+ polygon = bounding_regions .get ("polygon" , [])
217+ bbox = self .extract_bbox_from_polygon (polygon )
218+
219+ text_content = paragraph .get ("content" , "" )
220+
221+ bbox_obj = BoundingBox (
222+ l = bbox ["l" ],
223+ t = bbox ["t" ],
224+ r = bbox ["r" ],
225+ b = bbox ["b" ],
226+ coord_origin = CoordOrigin .TOPLEFT ,
227+ )
228+
229+ prov = ProvenanceItem (
230+ page_no = page_no , bbox = bbox_obj , charspan = (0 , len (text_content ))
231+ )
232+
233+ role = paragraph .get ("role" , None )
234+ if role :
235+ if role == "sectionHeading" :
236+ doc .add_heading (text = text_content , prov = prov )
237+ elif role == "title" :
238+ doc .add_title (text = text_content , prov = prov )
239+ elif role == "footnote" :
240+ doc .add_text (label = DocItemLabel .TEXT , text = text_content , prov = prov )
241+ elif role == "pageHeader" :
242+ doc .add_text (
243+ label = DocItemLabel .PAGE_HEADER , text = text_content , prov = prov
244+ )
245+ elif role == "pageFooter" :
246+ doc .add_text (
247+ label = DocItemLabel .PAGE_FOOTER , text = text_content , prov = prov
248+ )
249+ elif role == "pageNumber" :
250+ doc .add_text (label = DocItemLabel .TEXT , text = text_content , prov = prov )
251+ else :
161252 doc .add_text (label = DocItemLabel .TEXT , text = text_content , prov = prov )
162253
254+ def _add_tables (self , analyze_result , doc ):
163255 for table in analyze_result .get ("tables" , []):
164256 page_no = table .get ("page_range" , {}).get ("first_page_number" , 1 )
165257 row_count = table .get ("row_count" , 0 )
@@ -183,7 +275,6 @@ def convert_azure_output_to_docling(
183275 table_cells = []
184276
185277 for cell in table .get ("cells" , []):
186-
187278 cell_text = cell .get ("content" , "" ).strip ()
188279 row_index = cell .get ("row_index" , 0 )
189280 col_index = cell .get ("column_index" , 0 )
@@ -221,8 +312,6 @@ def convert_azure_output_to_docling(
221312
222313 doc .add_table (prov = table_prov , data = table_data , caption = None )
223314
224- return doc
225-
226315 @property
227316 def prediction_format (self ) -> PredictionFormats :
228317 """Get the prediction format."""
@@ -255,10 +344,13 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
255344 )
256345 # Call the Azure API by passing in the image for prediction
257346 poller = self .doc_intelligence_client .begin_analyze_document (
258- "prebuilt-layout" , record .original .stream , features = []
347+ "prebuilt-layout" ,
348+ record .original .stream ,
349+ features = [],
350+ output = [AnalyzeOutputOption .FIGURES ],
259351 )
260- result = poller .result ()
261- result_json = result . to_dict ( )
352+ result = poller .result (). as_dict ()
353+ result_json = json . dumps ( result )
262354 _log .info (
263355 f"Successfully processed [{ record .doc_id } ] using Azure API..!!"
264356 )
@@ -271,10 +363,13 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
271363 record .ground_truth_page_images [0 ].save (buf , format = "PNG" )
272364
273365 poller = self .doc_intelligence_client .begin_analyze_document (
274- "prebuilt-layout" , BytesIO (buf .getvalue ()), features = []
366+ "prebuilt-layout" ,
367+ BytesIO (buf .getvalue ()),
368+ features = [],
369+ output = [AnalyzeOutputOption .FIGURES ],
275370 )
276- result = poller .result ()
277- result_json = result . to_dict ( )
371+ result = poller .result (). as_dict ()
372+ result_json = json . dumps ( result , default = str )
278373 _log .info (
279374 f"Successfully processed [{ record .doc_id } ] using Azure API..!!"
280375 )
@@ -283,8 +378,9 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
283378 f"Unsupported mime type: { record .mime_type } . AzureDocIntelligencePredictionProvider supports 'application/pdf' and 'image/png'"
284379 )
285380 # Convert the prediction to doclingDocument
286- pred_doc = self .convert_azure_output_to_docling (result_json , record )
287- result_orig = json .dumps (result_json )
381+ pred_doc , pred_segmented_pages = self .convert_azure_output_to_docling (
382+ json .loads (result_json ), record
383+ )
288384
289385 except Exception as e :
290386 _log .error (
@@ -298,13 +394,14 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
298394 ) # Use copy of ground truth as fallback
299395
300396 pred_record = self .create_dataset_record_with_prediction (
301- record , pred_doc , result_orig
397+ record , pred_doc , result_json
302398 )
399+ pred_record .predicted_segmented_pages = pred_segmented_pages
303400 pred_record .status = status
304401 return pred_record
305402
def info(self) -> Dict:
    """Return metadata describing this prediction provider.

    Returns:
        A dict with two entries: ``"asset"`` set to
        ``PredictionProviderType.AZURE`` and ``"version"`` set to the
        installed version of the ``azure-ai-documentintelligence``
        distribution, as reported by ``importlib.metadata.version``.
    """
    # The distribution name must not carry surrounding whitespace,
    # otherwise the metadata lookup cannot find the installed package.
    return {
        "asset": PredictionProviderType.AZURE,
        "version": importlib.metadata.version("azure-ai-documentintelligence"),
    }
0 commit comments