
Commit be0ff6a

cau-git, samiuc, and samiullahchattha authored
feat: Introduce SegmentedPage for OCR (#91)
* Add README for Docling-DPBench
  Signed-off-by: Christoph Auer <[email protected]>
* Establish SegmentedPage support in DatasetRecord and DatasetRecordWithPrediction
  Signed-off-by: Christoph Auer <[email protected]>
* Add SegmentedPage usage to PixParse dataset provider
  Signed-off-by: Christoph Auer <[email protected]>
* feat: Update FUNSDDatasetBuilder to support segmented pages (#93)
  * feat: update FUNSDDatasetBuilder to support segmented pages
  * address review comments
  Co-authored-by: samiullahchattha <[email protected]>
* feat: Update XFUNDDatasetBuilder to support segmented pages (#92)
  * feat: update XFUNDDatasetBuilder to support segmented pages
  * address review comments
  Co-authored-by: samiullahchattha <[email protected]>

Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: samiuc <[email protected]>
Co-authored-by: samiullahchattha <[email protected]>
1 parent 28d166d commit be0ff6a

File tree

9 files changed: +213 −40 lines


docling_eval/cli/main.py

Lines changed: 1 addition & 0 deletions
@@ -254,6 +254,7 @@ def get_prediction_provider(
         pipeline_options.images_scale = 2.0
         pipeline_options.generate_page_images = True
         pipeline_options.generate_picture_images = True
+        pipeline_options.generate_parsed_pages = True

         if artifacts_path is not None:
             pipeline_options.artifacts_path = artifacts_path
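
Note: the new flag above is the only change needed on the prediction side. A minimal sketch of the same configuration outside the CLI, assuming docling's current PdfPipelineOptions and DocumentConverter API (the converter wiring is illustrative and not part of this diff):

# Sketch: enable parsed (segmented) pages in a standalone docling conversion.
# Assumption: import paths match the installed docling version.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2.0
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True
pipeline_options.generate_parsed_pages = True  # keeps SegmentedPage objects on the conversion result

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)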

docling_eval/datamodels/dataset_record.py

Lines changed: 42 additions & 8 deletions
@@ -9,11 +9,14 @@
 from datasets import Sequence, Value
 from docling.datamodel.base_models import ConversionStatus
 from docling_core.types import DoclingDocument
+from docling_core.types.doc.page import SegmentedPage
 from docling_core.types.io import DocumentStream
-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import BaseModel, ConfigDict, Field, TypeAdapter, model_validator

 from docling_eval.datamodels.types import EvaluationModality, PredictionFormats

+seg_adapter = TypeAdapter(Dict[int, SegmentedPage])
+

 class DatasetRecord(
     BaseModel
@@ -24,6 +27,9 @@ class DatasetRecord(
     doc_hash: Optional[str] = Field(alias="document_filehash", default=None)

     ground_truth_doc: DoclingDocument = Field(alias="GroundTruthDocument")
+    ground_truth_segmented_pages: Dict[int, SegmentedPage] = Field(
+        alias="ground_truth_segmented_pages", default={}
+    )
     original: Optional[Union[DocumentStream, Path]] = Field(
         alias="BinaryDocument", default=None
     )
@@ -53,6 +59,7 @@ def features(cls):
             cls.get_field_alias("doc_path"): Value("string"),
             cls.get_field_alias("doc_hash"): Value("string"),
             cls.get_field_alias("ground_truth_doc"): Value("string"),
+            cls.get_field_alias("ground_truth_segmented_pages"): Value("string"),
             cls.get_field_alias("ground_truth_pictures"): Sequence(
                 Features_Image()
             ),
@@ -102,6 +109,9 @@ def as_record_dict(self):
                 self.ground_truth_doc.export_to_dict()
             ),
             self.get_field_alias("ground_truth_pictures"): self.ground_truth_pictures,
+            self.get_field_alias("ground_truth_segmented_pages"): seg_adapter.dump_json(
+                self.ground_truth_segmented_pages
+            ),
             self.get_field_alias(
                 "ground_truth_page_images"
             ): self.ground_truth_page_images,
@@ -143,6 +153,12 @@ def validate_record_dict(cls, data: dict):
         if gt_doc_alias in data and isinstance(data[gt_doc_alias], str):
             data[gt_doc_alias] = json.loads(data[gt_doc_alias])

+        gt_seg_pages_alias = cls.get_field_alias("ground_truth_segmented_pages")
+        if gt_seg_pages_alias in data and isinstance(data[gt_seg_pages_alias], bytes):
+            data[gt_seg_pages_alias] = seg_adapter.validate_json(
+                data[gt_seg_pages_alias]
+            )
+
         gt_page_img_alias = cls.get_field_alias("ground_truth_page_images")
         if gt_page_img_alias in data:
             for ix, item in enumerate(data[gt_page_img_alias]):
@@ -171,6 +187,11 @@ class DatasetRecordWithPrediction(DatasetRecord):
     predicted_doc: Optional[DoclingDocument] = Field(
         alias="PredictedDocument", default=None
     )
+
+    predicted_segmented_pages: Dict[int, SegmentedPage] = Field(
+        alias="predicted_segmented_pages", default={}
+    )
+
     original_prediction: Optional[str] = None
     prediction_format: PredictionFormats  # some enum type
     prediction_timings: Optional[Dict] = Field(alias="prediction_timings", default=None)
@@ -187,20 +208,22 @@ class DatasetRecordWithPrediction(DatasetRecord):
     @classmethod
     def features(cls):
         return {
-            cls.get_field_alias("predictor_info"): Value("string"),
-            cls.get_field_alias("status"): Value("string"),
             cls.get_field_alias("doc_id"): Value("string"),
             cls.get_field_alias("doc_path"): Value("string"),
             cls.get_field_alias("doc_hash"): Value("string"),
             cls.get_field_alias("ground_truth_doc"): Value("string"),
+            cls.get_field_alias("ground_truth_segmented_pages"): Value("string"),
             cls.get_field_alias("ground_truth_pictures"): Sequence(Features_Image()),
             cls.get_field_alias("ground_truth_page_images"): Sequence(Features_Image()),
-            cls.get_field_alias("predicted_doc"): Value("string"),
-            cls.get_field_alias("predicted_pictures"): Sequence(Features_Image()),
-            cls.get_field_alias("predicted_page_images"): Sequence(Features_Image()),
             cls.get_field_alias("original"): Value("string"),
             cls.get_field_alias("mime_type"): Value("string"),
             cls.get_field_alias("modalities"): Sequence(Value("string")),
+            cls.get_field_alias("predictor_info"): Value("string"),
+            cls.get_field_alias("status"): Value("string"),
+            cls.get_field_alias("predicted_doc"): Value("string"),
+            cls.get_field_alias("predicted_segmented_pages"): Value("string"),
+            cls.get_field_alias("predicted_pictures"): Sequence(Features_Image()),
+            cls.get_field_alias("predicted_page_images"): Sequence(Features_Image()),
             cls.get_field_alias("prediction_format"): Value("string"),
             cls.get_field_alias("prediction_timings"): Value("string"),
         }
@@ -211,6 +234,8 @@ def as_record_dict(self):
            {
                 self.get_field_alias("prediction_format"): self.prediction_format.value,
                 self.get_field_alias("prediction_timings"): self.prediction_timings,
+                self.get_field_alias("predictor_info"): self.predictor_info,
+                self.get_field_alias("status"): (self.status),
             }
         )

@@ -220,15 +245,16 @@ def as_record_dict(self):
                 self.get_field_alias("predicted_doc"): json.dumps(
                     self.predicted_doc.export_to_dict()
                 ),
+                self.get_field_alias(
+                    "predicted_segmented_pages"
+                ): seg_adapter.dump_json(self.predicted_segmented_pages),
                 self.get_field_alias("predicted_pictures"): self.predicted_pictures,
                 self.get_field_alias(
                     "predicted_page_images"
                 ): self.predicted_page_images,
                 self.get_field_alias("original_prediction"): (
                     self.original_prediction
                 ),
-                self.get_field_alias("status"): (self.status),
-                self.get_field_alias("predictor_info"): self.predictor_info,
             }
         )

@@ -262,6 +288,14 @@ def validate_prediction_record_dict(cls, data: dict):
         if pred_doc_alias in data and isinstance(data[pred_doc_alias], str):
             data[pred_doc_alias] = json.loads(data[pred_doc_alias])

+        pred_seg_pages_alias = cls.get_field_alias("predicted_segmented_pages")
+        if pred_seg_pages_alias in data and isinstance(
+            data[pred_seg_pages_alias], bytes
+        ):
+            data[pred_seg_pages_alias] = seg_adapter.validate_json(
+                data[pred_seg_pages_alias]
+            )
+
         pred_page_img_alias = cls.get_field_alias("predicted_page_images")
         if pred_page_img_alias in data:
             for ix, item in enumerate(data[pred_page_img_alias]):
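
The storage pattern introduced here is small enough to show end to end. A minimal sketch of the round trip DatasetRecord performs with the pydantic TypeAdapter, assuming only docling-core and pydantic (the page size is invented for illustration):

from typing import Dict

from docling_core.types.doc import BoundingBox
from docling_core.types.doc.page import BoundingRectangle, PageGeometry, SegmentedPage
from pydantic import TypeAdapter

seg_adapter = TypeAdapter(Dict[int, SegmentedPage])

# One page keyed by page number; the geometry mirrors what the builders construct.
pages = {
    1: SegmentedPage(
        dimension=PageGeometry(
            angle=0,
            rect=BoundingRectangle.from_bounding_box(BoundingBox(l=0, t=0, r=612, b=792)),
        )
    )
}

payload = seg_adapter.dump_json(pages)         # bytes, what as_record_dict() stores
restored = seg_adapter.validate_json(payload)  # Dict[int, SegmentedPage], what the validators rebuild
assert 1 in restored and isinstance(restored[1], SegmentedPage)

This is also why the validators above check isinstance(..., bytes) before decoding: dump_json produces bytes for the dataset column, while a record that was built in memory still carries a plain dict and is left untouched.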

docling_eval/dataset_builders/funsd_builder.py

Lines changed: 50 additions & 8 deletions
@@ -3,13 +3,20 @@
 import logging
 import shutil
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Tuple

 from datasets import DownloadManager
 from docling_core.types import DoclingDocument
 from docling_core.types.doc import BoundingBox, ImageRef, PageItem, ProvenanceItem, Size
 from docling_core.types.doc.document import GraphCell, GraphData, GraphLink
 from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PageGeometry,
+    SegmentedPage,
+    TextCell,
+)
+from docling_core.types.io import DocumentStream
 from PIL import Image
 from tqdm import tqdm

@@ -193,9 +200,9 @@ def get_overall_bbox(
         bbox_instance = BoundingBox.enclosing_bbox(all_bboxes)
         return bbox_instance

-    def populate_key_value_item(
+    def _create_ground_truth_doc(
         self, doc: DoclingDocument, funsd_data: dict
-    ) -> DoclingDocument:
+    ) -> Tuple[DoclingDocument, Dict[int, SegmentedPage]]:
         """
         Populate the key-value item from the FUNSD data.

@@ -210,6 +217,19 @@ def populate_key_value_item(
             raise ValueError("Invalid FUNSD data: missing 'form' key.")

         form_items = funsd_data["form"]
+        segmented_pages: Dict[int, SegmentedPage] = {}
+
+        page_item: PageItem = doc.pages[1]
+        seg_page = SegmentedPage(
+            dimension=PageGeometry(
+                angle=0,
+                rect=BoundingRectangle.from_bounding_box(
+                    BoundingBox(
+                        l=0, t=0, r=page_item.size.width, b=page_item.size.height
+                    )
+                ),
+            )
+        )

         cell_by_id = {}
         for item in form_items:
@@ -242,6 +262,23 @@ def populate_key_value_item(
             )
             cell_by_id[cell_id] = cell

+            for word in item.get("words", []):
+                text = word.get("text", None)
+                bbox = word.get("box", None)
+                if bbox is None or text is None:
+                    continue
+                bbox_obj = self.convert_bbox(bbox)
+                seg_page.word_cells.append(
+                    TextCell(
+                        from_ocr=True,
+                        rect=BoundingRectangle.from_bounding_box(bbox_obj),
+                        text=text,
+                        orig=text,
+                    )
+                )
+
+        segmented_pages[doc.pages[1].page_no] = seg_page
+
         # unique linking pairs
         linking_set = set()
         for item in form_items:
@@ -283,7 +320,7 @@ def populate_key_value_item(

         sort_cell_ids(doc)

-        return doc
+        return doc, segmented_pages

     def iterate(self) -> Iterable[DatasetRecord]:
         """
@@ -358,23 +395,28 @@ def iterate(self) -> Iterable[DatasetRecord]:
                 true_doc.pages[1] = page_item

                 # Populate document with key-value data
-                true_doc = self.populate_key_value_item(true_doc, funsd_data)
+                true_doc, seg_pages = self._create_ground_truth_doc(
+                    true_doc, funsd_data
+                )

                 # Extract images
                 true_doc, true_pictures, true_page_images = extract_images(
                     document=true_doc,
                     pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
                     page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
                 )
-
+                image_stream = DocumentStream(
+                    name=img_path.stem, stream=io.BytesIO(img_bytes)
+                )
                 # Create dataset record
                 record = DatasetRecord(
                     doc_id=img_path.stem,
                     doc_hash=get_binhash(img_bytes),
                     ground_truth_doc=true_doc,
-                    original=None,
+                    ground_truth_segmented_pages=seg_pages,
+                    original=image_stream,
                     mime_type="image/png",
-                    modalities=[EvaluationModality.KEY_VALUE],
+                    modalities=[EvaluationModality.KEY_VALUE, EvaluationModality.OCR],
                     ground_truth_pictures=true_pictures,
                     ground_truth_page_images=true_page_images,
                 )

docling_eval/dataset_builders/pixparse_builder.py

Lines changed: 34 additions & 8 deletions
@@ -2,7 +2,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, Iterable, Optional
+from typing import Dict, Iterable, List, Optional, Tuple

 from docling_core.types import DoclingDocument
 from docling_core.types.doc import (
@@ -14,6 +14,12 @@
     ProvenanceItem,
     Size,
 )
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PageGeometry,
+    SegmentedPage,
+    TextCell,
+)
 from docling_core.types.io import DocumentStream
 from PIL import Image
 from tqdm import tqdm
@@ -54,7 +60,7 @@ def __init__(

     def _create_ground_truth_doc(
         self, doc_id: str, gt_data: Dict, image: Image.Image
-    ) -> DoclingDocument:
+    ) -> Tuple[DoclingDocument, Dict[int, SegmentedPage]]:
         """Create a DoclingDocument from ground truth data and image file."""
         true_doc = DoclingDocument(name=doc_id)

@@ -72,8 +78,19 @@ def _create_ground_truth_doc(
         )
         true_doc.pages[1] = page_item

+        segmented_pages: Dict[int, SegmentedPage] = {}
+
         for page_idx, page in enumerate(gt_data["pages"], 1):
-            for text, bbox, _ in zip(page["text"], page["bbox"], page["score"]):
+            seg_page = SegmentedPage(
+                dimension=PageGeometry(
+                    angle=0,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(l=0, t=0, r=image.width, b=image.height)
+                    ),
+                )
+            )
+
+            for text, bbox, score in zip(page["text"], page["bbox"], page["score"]):
                 bbox_obj = BoundingBox.from_tuple(
                     (
                         float(bbox[0]),
@@ -83,12 +100,18 @@ def _create_ground_truth_doc(
                     ),
                     CoordOrigin.TOPLEFT,
                 )
-                prov = ProvenanceItem(
-                    page_no=page_idx, bbox=bbox_obj, charspan=(0, len(text))
+                seg_page.textline_cells.append(
+                    TextCell(
+                        from_ocr=True,
+                        rect=BoundingRectangle.from_bounding_box(bbox_obj),
+                        text=text,
+                        orig=text,
+                        confidence=score,
+                    )
                 )
-                true_doc.add_text(label=DocItemLabel.TEXT, text=text, prov=prov)
+            segmented_pages[page_idx] = seg_page

-        return true_doc
+        return true_doc, segmented_pages

     def iterate(self) -> Iterable[DatasetRecord]:
         if not self.retrieved and self.must_retrieve:
@@ -135,7 +158,9 @@ def iterate(self) -> Iterable[DatasetRecord]:
             ):
                 image = image.convert("RGB")

-                true_doc = self._create_ground_truth_doc(doc_id, gt_data, image)
+                true_doc, seg_pages = self._create_ground_truth_doc(
+                    doc_id, gt_data, image
+                )

                 # Extract images from the ground truth document
                 true_doc, true_pictures, true_page_images = extract_images(
@@ -158,6 +183,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
                     doc_id=doc_id,
                     doc_hash=get_binhash(img_bytes),
                     ground_truth_doc=true_doc,
+                    ground_truth_segmented_pages=seg_pages,
                     original=image_stream,
                     mime_type="image/png",
                     modalities=[
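
On the consuming side, an OCR evaluation would read these pages back out of the record. A hedged sketch of that direction; collect_ocr_lines is an invented helper name, and the same iteration applies to ground_truth_segmented_pages and predicted_segmented_pages alike:

from typing import Dict, List, Tuple

from docling_core.types.doc.page import SegmentedPage


def collect_ocr_lines(pages: Dict[int, SegmentedPage]) -> List[Tuple[int, str, float]]:
    """Flatten line-level cells into (page_no, text, confidence) tuples."""
    lines: List[Tuple[int, str, float]] = []
    for page_no, seg_page in sorted(pages.items()):
        for cell in seg_page.textline_cells:
            lines.append((page_no, cell.text, cell.confidence))
    return lines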

0 commit comments