Skip to content

Commit b79dd19

Browse files
authored
fix: dataset feature spec fixes, cvat improvements (#97)
* Add README for Docling-DPBench
  Signed-off-by: Christoph Auer <[email protected]>
* Add CVAT annotation features, fix DatasetRecord.features usage
  Signed-off-by: Christoph Auer <[email protected]>
---------
Signed-off-by: Christoph Auer <[email protected]>
1 parent 88ae2da commit b79dd19

File tree

5 files changed

+73
-50
lines changed

5 files changed

+73
-50
lines changed

docling_eval/datamodels/dataset_record.py

Lines changed: 29 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@ def features(cls):
6666
cls.get_field_alias("ground_truth_page_images"): Sequence(
6767
Features_Image()
6868
),
69-
cls.get_field_alias("original"): Value("string"),
69+
cls.get_field_alias("original"): Value("binary"),
7070
cls.get_field_alias("mime_type"): Value("string"),
7171
cls.get_field_alias("modalities"): Sequence(Value("string")),
7272
}
@@ -207,26 +207,34 @@ class DatasetRecordWithPrediction(DatasetRecord):
207207

208208
@classmethod
209209
def features(cls):
210-
return {
211-
cls.get_field_alias("doc_id"): Value("string"),
212-
cls.get_field_alias("doc_path"): Value("string"),
213-
cls.get_field_alias("doc_hash"): Value("string"),
214-
cls.get_field_alias("ground_truth_doc"): Value("string"),
215-
cls.get_field_alias("ground_truth_segmented_pages"): Value("string"),
216-
cls.get_field_alias("ground_truth_pictures"): Sequence(Features_Image()),
217-
cls.get_field_alias("ground_truth_page_images"): Sequence(Features_Image()),
218-
cls.get_field_alias("original"): Value("string"),
219-
cls.get_field_alias("mime_type"): Value("string"),
220-
cls.get_field_alias("modalities"): Sequence(Value("string")),
221-
cls.get_field_alias("predictor_info"): Value("string"),
222-
cls.get_field_alias("status"): Value("string"),
223-
cls.get_field_alias("predicted_doc"): Value("string"),
224-
cls.get_field_alias("predicted_segmented_pages"): Value("string"),
225-
cls.get_field_alias("predicted_pictures"): Sequence(Features_Image()),
226-
cls.get_field_alias("predicted_page_images"): Sequence(Features_Image()),
227-
cls.get_field_alias("prediction_format"): Value("string"),
228-
cls.get_field_alias("prediction_timings"): Value("string"),
229-
}
210+
return Features(
211+
{
212+
cls.get_field_alias("doc_id"): Value("string"),
213+
cls.get_field_alias("doc_path"): Value("string"),
214+
cls.get_field_alias("doc_hash"): Value("string"),
215+
cls.get_field_alias("ground_truth_doc"): Value("string"),
216+
cls.get_field_alias("ground_truth_segmented_pages"): Value("string"),
217+
cls.get_field_alias("ground_truth_pictures"): Sequence(
218+
Features_Image()
219+
),
220+
cls.get_field_alias("ground_truth_page_images"): Sequence(
221+
Features_Image()
222+
),
223+
cls.get_field_alias("original"): Value("binary"),
224+
cls.get_field_alias("mime_type"): Value("string"),
225+
cls.get_field_alias("modalities"): Sequence(Value("string")),
226+
cls.get_field_alias("predictor_info"): Value("string"),
227+
cls.get_field_alias("status"): Value("string"),
228+
cls.get_field_alias("predicted_doc"): Value("string"),
229+
cls.get_field_alias("predicted_segmented_pages"): Value("string"),
230+
cls.get_field_alias("predicted_pictures"): Sequence(Features_Image()),
231+
cls.get_field_alias("predicted_page_images"): Sequence(
232+
Features_Image()
233+
),
234+
cls.get_field_alias("prediction_format"): Value("string"),
235+
cls.get_field_alias("prediction_timings"): Value("string"),
236+
}
237+
)
230238

231239
def as_record_dict(self):
232240
record = super().as_record_dict()

docling_eval/dataset_builders/cvat_preannotation_builder.py

Lines changed: 37 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -199,6 +199,16 @@ def _create_project_properties(self) -> None:
199199
"""
200200
results = []
201201

202+
default_attributes = [
203+
{
204+
"name": "content_layer",
205+
"input_type": "select",
206+
"mutable": True,
207+
"values": ["BODY", "FURNITURE", "BACKGROUND"],
208+
"default_value": "BODY",
209+
}
210+
]
211+
202212
# Add DocItemLabel properties
203213
for item in DocItemLabel:
204214
r, g, b = DocItemLabel.get_color(item)
@@ -208,7 +218,7 @@ def _create_project_properties(self) -> None:
208218
"name": item.value,
209219
"color": rgb_to_hex(r, g, b),
210220
"type": "rectangle",
211-
"attributes": [],
221+
"attributes": default_attributes.copy(),
212222
}
213223
)
214224

@@ -247,14 +257,32 @@ def _create_project_properties(self) -> None:
247257
)
248258

249259
if item == DocItemLabel.PICTURE:
250-
results[-1]["attributes"].append(
251-
{
252-
"name": "json",
253-
"mutable": True,
254-
"input_type": "text",
255-
"values": [""],
256-
"default_value": "",
257-
}
260+
results[-1]["attributes"].extend(
261+
[
262+
{
263+
"name": "json",
264+
"mutable": True,
265+
"input_type": "text",
266+
"values": [""],
267+
"default_value": "",
268+
},
269+
{
270+
"name": "type",
271+
"input_type": "select",
272+
"mutable": True,
273+
"values": [
274+
"CHART",
275+
"INFOGRAPHIC",
276+
"SCREENSHOT",
277+
"UI_ELEMENT",
278+
"BARCODE",
279+
"LOGO",
280+
"PICTOGRAM",
281+
"OTHER",
282+
],
283+
"default_value": "main",
284+
},
285+
]
258286
)
259287

260288
# Add TableComponentLabel properties

docling_eval/dataset_builders/dataset_builder.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -345,4 +345,5 @@ def save_to_disk(
345345
output_dir=self.target,
346346
num_train_rows=0,
347347
num_test_rows=count,
348+
features=DatasetRecord.features(),
348349
)

docling_eval/prediction_providers/base_prediction_provider.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -412,6 +412,7 @@ def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]:
412412
output_dir=target_dataset_dir,
413413
num_train_rows=0,
414414
num_test_rows=count,
415+
features=DatasetRecordWithPrediction.features(),
415416
)
416417

417418
_log.info(f"Saved {count} records in {chunk_count} chunks to {test_dir}")

docling_eval/utils/utils.py

Lines changed: 5 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -53,27 +53,12 @@ def get_binhash(binary_data: bytes) -> str:
5353

5454

5555
def write_datasets_info(
56-
name: str, output_dir: Path, num_train_rows: int, num_test_rows: int
56+
name: str,
57+
output_dir: Path,
58+
num_train_rows: int,
59+
num_test_rows: int,
60+
features: Features,
5761
):
58-
features = Features(
59-
{
60-
BenchMarkColumns.CONVERTER_VERSION: Value("string"),
61-
BenchMarkColumns.STATUS: Value("string"),
62-
BenchMarkColumns.DOC_ID: Value("string"),
63-
BenchMarkColumns.DOC_PATH: Value("string"),
64-
BenchMarkColumns.DOC_HASH: Value("string"),
65-
BenchMarkColumns.GROUNDTRUTH: Value("string"),
66-
BenchMarkColumns.GROUNDTRUTH_PICTURES: Sequence(Features_Image()),
67-
BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: Sequence(Features_Image()),
68-
BenchMarkColumns.PREDICTION: Value("string"),
69-
BenchMarkColumns.PREDICTION_PICTURES: Sequence(Features_Image()),
70-
BenchMarkColumns.PREDICTION_PAGE_IMAGES: Sequence(Features_Image()),
71-
BenchMarkColumns.ORIGINAL: Value("string"),
72-
BenchMarkColumns.MIMETYPE: Value("string"),
73-
BenchMarkColumns.MODALITIES: Sequence(Value("string")),
74-
}
75-
)
76-
7762
schema = features.to_dict()
7863
# print(json.dumps(schema, indent=2))
7964

0 commit comments

Comments
 (0)