Skip to content

Commit 88ae2da

Browse files
divekarsccau-gitSamved Divekar
authored
Add layout information for azure hyperscaler, add tests (#89)
* Add README for Docling-DPBench Signed-off-by: Christoph Auer <[email protected]> * Add layout information for azure hyperscaler, add tests Signed-off-by: Samved Divekar <[email protected]> * Update poetry file Signed-off-by: Samved Divekar <[email protected]> * Minor updates Signed-off-by: Samved Divekar <[email protected]> * Regenerate poetry lock file Signed-off-by: Samved Divekar <[email protected]> * Establish SegmentedPage support in DatasetRecord and DatasetRecordWithPrediction Signed-off-by: Christoph Auer <[email protected]> * Add SegmentedPage usage to PixParse dataset provider Signed-off-by: Christoph Auer <[email protected]> * Use segmented page for populating word level information, remove words from DoclingDocument Signed-off-by: Samved Divekar <[email protected]> * Fix precommit errors, propagate segmented pages Signed-off-by: Samved Divekar <[email protected]> * Do not add a word if text or bbox is missing Signed-off-by: Samved Divekar <[email protected]> --------- Signed-off-by: Christoph Auer <[email protected]> Signed-off-by: Samved Divekar <[email protected]> Co-authored-by: Christoph Auer <[email protected]> Co-authored-by: Samved Divekar <[email protected]>
1 parent d43af71 commit 88ae2da

File tree

6 files changed

+537
-547
lines changed

6 files changed

+537
-547
lines changed

docling_eval/prediction_providers/azure_prediction_provider.py

Lines changed: 137 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
import logging
44
import os
55
from io import BytesIO
6-
from pathlib import Path
76
from typing import Dict, Optional, Set, Tuple
87

8+
from azure.ai.documentintelligence import DocumentIntelligenceClient
9+
from azure.ai.documentintelligence.models import AnalyzeOutputOption
910
from docling.datamodel.base_models import ConversionStatus
1011

1112
# from docling_core.types import DoclingDocument
@@ -19,6 +20,12 @@
1920
TableCell,
2021
TableData,
2122
)
23+
from docling_core.types.doc.page import (
24+
BoundingRectangle,
25+
PageGeometry,
26+
SegmentedPage,
27+
TextCell,
28+
)
2229
from docling_core.types.io import DocumentStream
2330

2431
from docling_eval.datamodels.dataset_record import (
@@ -64,13 +71,11 @@ def __init__(
6471
# TODO - Need a temp directory to save Azure outputs
6572
# Validate the required library
6673
try:
67-
from azure.ai.formrecognizer import ( # type: ignore
68-
AnalysisFeature,
69-
DocumentAnalysisClient,
70-
)
7174
from azure.core.credentials import AzureKeyCredential # type: ignore
7275
except ImportError:
73-
raise ImportError("azure-ai-formrecognizer library is not installed..")
76+
raise ImportError(
77+
"azure-ai-documentintelligence library is not installed.."
78+
)
7479

7580
# Validate the required endpoints to call the API
7681
endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
@@ -81,7 +86,7 @@ def __init__(
8186
"AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and AZURE_DOCUMENT_INTELLIGENCE_KEY must be set in environment variables."
8287
)
8388

84-
self.doc_intelligence_client = DocumentAnalysisClient(
89+
self.doc_intelligence_client = DocumentIntelligenceClient(
8590
endpoint, AzureKeyCredential(key)
8691
)
8792

@@ -108,9 +113,10 @@ def extract_bbox_from_polygon(self, polygon):
108113

109114
def convert_azure_output_to_docling(
110115
self, analyze_result, record: DatasetRecord
111-
) -> DoclingDocument:
116+
) -> Tuple[DoclingDocument, Dict[int, SegmentedPage]]:
112117
"""Converts Azure Document Intelligence output to DoclingDocument format."""
113118
doc = DoclingDocument(name=record.doc_id)
119+
segmented_pages: Dict[int, SegmentedPage] = {}
114120

115121
for page in analyze_result.get("pages", []):
116122
page_no = page.get("page_number", 1)
@@ -134,32 +140,118 @@ def convert_azure_output_to_docling(
134140
)
135141
doc.pages[page_no] = page_item
136142

143+
if page_no not in segmented_pages.keys():
144+
seg_page = SegmentedPage(
145+
dimension=PageGeometry(
146+
angle=0,
147+
rect=BoundingRectangle.from_bounding_box(
148+
BoundingBox(
149+
l=0,
150+
t=0,
151+
r=page_item.size.width,
152+
b=page_item.size.height,
153+
)
154+
),
155+
)
156+
)
157+
segmented_pages[page_no] = seg_page
158+
137159
for word in page.get("words", []):
138-
polygon = word.get("polygon", [])
139-
bbox = self.extract_bbox_from_polygon(polygon)
160+
polygon = word.get("polygon", None)
161+
text_content = word.get("content", None)
162+
163+
if text_content is not None and polygon is not None:
164+
bbox = self.extract_bbox_from_polygon(polygon)
165+
bbox_obj = BoundingBox(
166+
l=bbox["l"],
167+
t=bbox["t"],
168+
r=bbox["r"],
169+
b=bbox["b"],
170+
coord_origin=CoordOrigin.TOPLEFT,
171+
)
140172

141-
text_content = word.get("content", "")
173+
segmented_pages[page_no].word_cells.append(
174+
TextCell(
175+
rect=BoundingRectangle.from_bounding_box(bbox_obj),
176+
text=text_content,
177+
orig=text_content,
178+
# Keeping from_ocr flag False since Azure output doesn't indicate whether the given word is programmatic or OCR
179+
from_ocr=False,
180+
)
181+
)
142182

143-
bbox_obj = BoundingBox(
144-
l=bbox["l"],
145-
t=bbox["t"],
146-
r=bbox["r"],
147-
b=bbox["b"],
148-
coord_origin=CoordOrigin.TOPLEFT,
149-
)
183+
# Iterate over tables in the response and add to DoclingDocument
184+
self._add_tables(analyze_result, doc)
150185

151-
prov = ProvenanceItem(
152-
page_no=page_no, bbox=bbox_obj, charspan=(0, len(text_content))
153-
)
186+
# Iterate over paragraphs in the response and populate fields such as section headings and page headers/footers based on the "role" field
187+
self._handle_paragraphs_based_on_roles(analyze_result, doc)
188+
189+
# Iterate over figures and add them as pictures in DoclingDocument
190+
self._add_figures(analyze_result, doc)
154191

155-
# TODO: This needs to be developed further. Azure responses contain full-page document information,
156-
# with text and layout features,
157-
# see https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/prebuilt/layout
158-
# This code only adds the primitive text content, without
159-
# layout labels or reading order, then all tables separately. This will work for plain
160-
# table datasets only.
192+
return doc, segmented_pages
193+
194+
def _add_figures(self, analyze_result, doc):
195+
for figure in analyze_result.get("figures", []):
196+
bounding_regions = figure["boundingRegions"][0]
197+
page_no = bounding_regions["pageNumber"]
198+
polygon = bounding_regions.get("polygon", [])
199+
bbox = self.extract_bbox_from_polygon(polygon)
200+
201+
bbox_obj = BoundingBox(
202+
l=bbox["l"],
203+
t=bbox["t"],
204+
r=bbox["r"],
205+
b=bbox["b"],
206+
coord_origin=CoordOrigin.TOPLEFT,
207+
)
208+
209+
prov = ProvenanceItem(page_no=page_no, bbox=bbox_obj, charspan=(0, 0))
210+
doc.add_picture(prov=prov)
211+
212+
def _handle_paragraphs_based_on_roles(self, analyze_result, doc):
213+
for paragraph in analyze_result.get("paragraphs", []):
214+
bounding_regions = paragraph["boundingRegions"][0]
215+
page_no = bounding_regions["pageNumber"]
216+
polygon = bounding_regions.get("polygon", [])
217+
bbox = self.extract_bbox_from_polygon(polygon)
218+
219+
text_content = paragraph.get("content", "")
220+
221+
bbox_obj = BoundingBox(
222+
l=bbox["l"],
223+
t=bbox["t"],
224+
r=bbox["r"],
225+
b=bbox["b"],
226+
coord_origin=CoordOrigin.TOPLEFT,
227+
)
228+
229+
prov = ProvenanceItem(
230+
page_no=page_no, bbox=bbox_obj, charspan=(0, len(text_content))
231+
)
232+
233+
role = paragraph.get("role", None)
234+
if role:
235+
if role == "sectionHeading":
236+
doc.add_heading(text=text_content, prov=prov)
237+
elif role == "title":
238+
doc.add_title(text=text_content, prov=prov)
239+
elif role == "footnote":
240+
doc.add_text(label=DocItemLabel.TEXT, text=text_content, prov=prov)
241+
elif role == "pageHeader":
242+
doc.add_text(
243+
label=DocItemLabel.PAGE_HEADER, text=text_content, prov=prov
244+
)
245+
elif role == "pageFooter":
246+
doc.add_text(
247+
label=DocItemLabel.PAGE_FOOTER, text=text_content, prov=prov
248+
)
249+
elif role == "pageNumber":
250+
doc.add_text(label=DocItemLabel.TEXT, text=text_content, prov=prov)
251+
else:
161252
doc.add_text(label=DocItemLabel.TEXT, text=text_content, prov=prov)
162253

254+
def _add_tables(self, analyze_result, doc):
163255
for table in analyze_result.get("tables", []):
164256
page_no = table.get("page_range", {}).get("first_page_number", 1)
165257
row_count = table.get("row_count", 0)
@@ -183,7 +275,6 @@ def convert_azure_output_to_docling(
183275
table_cells = []
184276

185277
for cell in table.get("cells", []):
186-
187278
cell_text = cell.get("content", "").strip()
188279
row_index = cell.get("row_index", 0)
189280
col_index = cell.get("column_index", 0)
@@ -221,8 +312,6 @@ def convert_azure_output_to_docling(
221312

222313
doc.add_table(prov=table_prov, data=table_data, caption=None)
223314

224-
return doc
225-
226315
@property
227316
def prediction_format(self) -> PredictionFormats:
228317
"""Get the prediction format."""
@@ -255,10 +344,13 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
255344
)
256345
# Call the Azure API by passing in the image for prediction
257346
poller = self.doc_intelligence_client.begin_analyze_document(
258-
"prebuilt-layout", record.original.stream, features=[]
347+
"prebuilt-layout",
348+
record.original.stream,
349+
features=[],
350+
output=[AnalyzeOutputOption.FIGURES],
259351
)
260-
result = poller.result()
261-
result_json = result.to_dict()
352+
result = poller.result().as_dict()
353+
result_json = json.dumps(result)
262354
_log.info(
263355
f"Successfully processed [{record.doc_id}] using Azure API..!!"
264356
)
@@ -271,10 +363,13 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
271363
record.ground_truth_page_images[0].save(buf, format="PNG")
272364

273365
poller = self.doc_intelligence_client.begin_analyze_document(
274-
"prebuilt-layout", BytesIO(buf.getvalue()), features=[]
366+
"prebuilt-layout",
367+
BytesIO(buf.getvalue()),
368+
features=[],
369+
output=[AnalyzeOutputOption.FIGURES],
275370
)
276-
result = poller.result()
277-
result_json = result.to_dict()
371+
result = poller.result().as_dict()
372+
result_json = json.dumps(result, default=str)
278373
_log.info(
279374
f"Successfully processed [{record.doc_id}] using Azure API..!!"
280375
)
@@ -283,8 +378,9 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
283378
f"Unsupported mime type: {record.mime_type}. AzureDocIntelligencePredictionProvider supports 'application/pdf' and 'image/png'"
284379
)
285380
# Convert the prediction to doclingDocument
286-
pred_doc = self.convert_azure_output_to_docling(result_json, record)
287-
result_orig = json.dumps(result_json)
381+
pred_doc, pred_segmented_pages = self.convert_azure_output_to_docling(
382+
json.loads(result_json), record
383+
)
288384

289385
except Exception as e:
290386
_log.error(
@@ -298,13 +394,14 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
298394
) # Use copy of ground truth as fallback
299395

300396
pred_record = self.create_dataset_record_with_prediction(
301-
record, pred_doc, result_orig
397+
record, pred_doc, result_json
302398
)
399+
pred_record.predicted_segmented_pages = pred_segmented_pages
303400
pred_record.status = status
304401
return pred_record
305402

306403
def info(self) -> Dict:
307404
return {
308405
"asset": PredictionProviderType.AZURE,
309-
"version": importlib.metadata.version("azure-ai-formrecognizer"),
406+
"version": importlib.metadata.version("azure-ai-documentintelligence"),
310407
}

0 commit comments

Comments
 (0)