Skip to content

Commit 560d952

Browse files
divekarsc and Samved Divekar authored
populate SegmentedPage words from Google OCR output (#96)
Signed-off-by: Samved Divekar <[email protected]> Co-authored-by: Samved Divekar <[email protected]>
1 parent 7903b6a commit 560d952

File tree

1 file changed

+70
-2
lines changed

1 file changed

+70
-2
lines changed

docling_eval/prediction_providers/google_prediction_provider.py

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@
1818
TableData,
1919
)
2020
from docling_core.types.doc.labels import DocItemLabel
21+
from docling_core.types.doc.page import (
22+
BoundingRectangle,
23+
PageGeometry,
24+
SegmentedPage,
25+
TextCell,
26+
)
2127
from docling_core.types.io import DocumentStream
2228
from google.cloud import documentai # type: ignore
2329
from google.oauth2 import service_account
@@ -144,6 +150,7 @@ def process_table_row(self, row, row_index, document, table_data, is_header=Fals
144150
def convert_google_output_to_docling(self, document, record: DatasetRecord):
145151
"""Converts Google Document AI output to DoclingDocument format."""
146152
doc = DoclingDocument(name=record.doc_id)
153+
segmented_pages: Dict[int, SegmentedPage] = {}
147154

148155
for page in document.get("pages", []):
149156
page_no = page.get("pageNumber", 1)
@@ -166,6 +173,23 @@ def convert_google_output_to_docling(self, document, record: DatasetRecord):
166173
)
167174
doc.pages[page_no] = page_item
168175

176+
# Create SegmentedPage Entry if not already present for the page number
177+
if page_no not in segmented_pages.keys():
178+
seg_page = SegmentedPage(
179+
dimension=PageGeometry(
180+
angle=0,
181+
rect=BoundingRectangle.from_bounding_box(
182+
BoundingBox(
183+
l=0,
184+
t=0,
185+
r=page_item.size.width,
186+
b=page_item.size.height,
187+
)
188+
),
189+
)
190+
)
191+
segmented_pages[page_no] = seg_page
192+
169193
# TODO: Can we get more detail than just "Text blocks" from Google DocAI? If they provide layout labels, let's use it here.
170194
for paragraph in page.get("paragraphs", []):
171195
# Extract text content from text_anchor and text_segments
@@ -203,6 +227,47 @@ def convert_google_output_to_docling(self, document, record: DatasetRecord):
203227

204228
doc.add_text(label=DocItemLabel.TEXT, text=text_content, prov=prov)
205229

230+
for token in page.get("tokens", []):
231+
# Extract text content from text_anchor and text_segments
232+
text_content = ""
233+
if "layout" in token and "textAnchor" in token["layout"]:
234+
for text_segment in token["layout"]["textAnchor"].get(
235+
"textSegments", []
236+
):
237+
if "endIndex" in text_segment:
238+
start_index = int(text_segment.get("startIndex", 0))
239+
end_index = int(text_segment.get("endIndex", 0))
240+
if document.get("text") and start_index < len(
241+
document["text"]
242+
):
243+
text_content += document["text"][start_index:end_index]
244+
245+
# Extract token bounding box
246+
vertices = (
247+
token.get("layout", {}).get("boundingPoly", {}).get("vertices", [])
248+
)
249+
token_bbox = (
250+
None if not vertices else self.extract_bbox_from_vertices(vertices)
251+
)
252+
253+
if text_content and token_bbox is not None:
254+
bbox_obj = BoundingBox(
255+
l=token_bbox["l"],
256+
t=token_bbox["t"],
257+
r=token_bbox["r"],
258+
b=token_bbox["b"],
259+
coord_origin=CoordOrigin.TOPLEFT,
260+
)
261+
segmented_pages[page_no].word_cells.append(
262+
TextCell(
263+
rect=BoundingRectangle.from_bounding_box(bbox_obj),
264+
text=text_content,
265+
orig=text_content,
266+
# Keeping from_ocr flag False since Google Document AI output doesn't indicate whether the given word is programmatic or OCR
267+
from_ocr=False,
268+
)
269+
)
270+
206271
# TODO: Can we make sure the tables and the text is inserted in reading-order, instead of all tables at the end?
207272
for table in page.get("tables", []):
208273
table_bbox = self.extract_bbox_from_vertices(
@@ -252,7 +317,7 @@ def convert_google_output_to_docling(self, document, record: DatasetRecord):
252317

253318
doc.add_table(data=table_data, prov=table_prov)
254319

255-
return doc
320+
return doc, segmented_pages
256321

257322
@property
258323
def prediction_format(self) -> PredictionFormats:
@@ -316,7 +381,9 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
316381
f"Successfully processed [{record.doc_id}] using Google Document AI API!"
317382
)
318383

319-
pred_doc = self.convert_google_output_to_docling(result_json, record)
384+
pred_doc, pred_segmented_pages = self.convert_google_output_to_docling(
385+
result_json, record
386+
)
320387
else:
321388
raise RuntimeError(
322389
f"Unsupported mime type: {record.mime_type}. GoogleDocAIPredictionProvider supports 'application/pdf' and 'image/png'"
@@ -333,6 +400,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
333400
pred_record = self.create_dataset_record_with_prediction(
334401
record, pred_doc, json.dumps(result_json)
335402
)
403+
pred_record.predicted_segmented_pages = pred_segmented_pages
336404
pred_record.status = status
337405
return pred_record
338406

0 commit comments

Comments
 (0)