1818 TableData ,
1919)
2020from docling_core .types .doc .labels import DocItemLabel
21+ from docling_core .types .doc .page import (
22+ BoundingRectangle ,
23+ PageGeometry ,
24+ SegmentedPage ,
25+ TextCell ,
26+ )
2127from docling_core .types .io import DocumentStream
2228from google .cloud import documentai # type: ignore
2329from google .oauth2 import service_account
@@ -144,6 +150,7 @@ def process_table_row(self, row, row_index, document, table_data, is_header=Fals
144150 def convert_google_output_to_docling (self , document , record : DatasetRecord ):
145151 """Converts Google Document AI output to DoclingDocument format."""
146152 doc = DoclingDocument (name = record .doc_id )
153+ segmented_pages : Dict [int , SegmentedPage ] = {}
147154
148155 for page in document .get ("pages" , []):
149156 page_no = page .get ("pageNumber" , 1 )
@@ -166,6 +173,23 @@ def convert_google_output_to_docling(self, document, record: DatasetRecord):
166173 )
167174 doc .pages [page_no ] = page_item
168175
176+ # Create SegmentedPage Entry if not already present for the page number
177+ if page_no not in segmented_pages .keys ():
178+ seg_page = SegmentedPage (
179+ dimension = PageGeometry (
180+ angle = 0 ,
181+ rect = BoundingRectangle .from_bounding_box (
182+ BoundingBox (
183+ l = 0 ,
184+ t = 0 ,
185+ r = page_item .size .width ,
186+ b = page_item .size .height ,
187+ )
188+ ),
189+ )
190+ )
191+ segmented_pages [page_no ] = seg_page
192+
169193 # TODO: Can we get more detail than just "Text blocks" from Google DocAI? If they provide layout labels, let's use it here.
170194 for paragraph in page .get ("paragraphs" , []):
171195 # Extract text content from text_anchor and text_segments
@@ -203,6 +227,47 @@ def convert_google_output_to_docling(self, document, record: DatasetRecord):
203227
204228 doc .add_text (label = DocItemLabel .TEXT , text = text_content , prov = prov )
205229
230+ for token in page .get ("tokens" , []):
231+ # Extract text content from text_anchor and text_segments
232+ text_content = ""
233+ if "layout" in token and "textAnchor" in token ["layout" ]:
234+ for text_segment in token ["layout" ]["textAnchor" ].get (
235+ "textSegments" , []
236+ ):
237+ if "endIndex" in text_segment :
238+ start_index = int (text_segment .get ("startIndex" , 0 ))
239+ end_index = int (text_segment .get ("endIndex" , 0 ))
240+ if document .get ("text" ) and start_index < len (
241+ document ["text" ]
242+ ):
243+ text_content += document ["text" ][start_index :end_index ]
244+
245+ # Extract token bounding box
246+ vertices = (
247+ token .get ("layout" , {}).get ("boundingPoly" , {}).get ("vertices" , [])
248+ )
249+ token_bbox = (
250+ None if not vertices else self .extract_bbox_from_vertices (vertices )
251+ )
252+
253+ if text_content and token_bbox is not None :
254+ bbox_obj = BoundingBox (
255+ l = token_bbox ["l" ],
256+ t = token_bbox ["t" ],
257+ r = token_bbox ["r" ],
258+ b = token_bbox ["b" ],
259+ coord_origin = CoordOrigin .TOPLEFT ,
260+ )
261+ segmented_pages [page_no ].word_cells .append (
262+ TextCell (
263+ rect = BoundingRectangle .from_bounding_box (bbox_obj ),
264+ text = text_content ,
265+ orig = text_content ,
266+ # Keeping from_ocr flag False since Google Document AI output doesn't indicate whether the given word is programmatic or OCR
267+ from_ocr = False ,
268+ )
269+ )
270+
206271 # TODO: Can we make sure the tables and the text is inserted in reading-order, instead of all tables at the end?
207272 for table in page .get ("tables" , []):
208273 table_bbox = self .extract_bbox_from_vertices (
@@ -252,7 +317,7 @@ def convert_google_output_to_docling(self, document, record: DatasetRecord):
252317
253318 doc .add_table (data = table_data , prov = table_prov )
254319
255- return doc
320+ return doc , segmented_pages
256321
257322 @property
258323 def prediction_format (self ) -> PredictionFormats :
@@ -316,7 +381,9 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
316381 f"Successfully processed [{ record .doc_id } ] using Google Document AI API!"
317382 )
318383
319- pred_doc = self .convert_google_output_to_docling (result_json , record )
384+ pred_doc , pred_segmented_pages = self .convert_google_output_to_docling (
385+ result_json , record
386+ )
320387 else :
321388 raise RuntimeError (
322389 f"Unsupported mime type: { record .mime_type } . GoogleDocAIPredictionProvider supports 'application/pdf' and 'image/png'"
@@ -333,6 +400,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
333400 pred_record = self .create_dataset_record_with_prediction (
334401 record , pred_doc , json .dumps (result_json )
335402 )
403+ pred_record .predicted_segmented_pages = pred_segmented_pages
336404 pred_record .status = status
337405 return pred_record
338406
0 commit comments