fix: dataset feature spec fixes, cvat improvements (#97)

cau-git · web-flow · commit b79dd1988cb3 · 2025-05-12T20:19:15.000+02:00
* Add README for Docling-DPBench

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add CVAT annotation features, fix DatasetRecord.features usage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
diff --git a/docling_eval/datamodels/dataset_record.py b/docling_eval/datamodels/dataset_record.py
@@ -66,7 +66,7 @@ def features(cls):
                 cls.get_field_alias("ground_truth_page_images"): Sequence(
                     Features_Image()
                 ),
-                cls.get_field_alias("original"): Value("string"),
+                cls.get_field_alias("original"): Value("binary"),
                 cls.get_field_alias("mime_type"): Value("string"),
                 cls.get_field_alias("modalities"): Sequence(Value("string")),
             }
@@ -207,26 +207,34 @@ class DatasetRecordWithPrediction(DatasetRecord):
 
     @classmethod
     def features(cls):
-        return {
-            cls.get_field_alias("doc_id"): Value("string"),
-            cls.get_field_alias("doc_path"): Value("string"),
-            cls.get_field_alias("doc_hash"): Value("string"),
-            cls.get_field_alias("ground_truth_doc"): Value("string"),
-            cls.get_field_alias("ground_truth_segmented_pages"): Value("string"),
-            cls.get_field_alias("ground_truth_pictures"): Sequence(Features_Image()),
-            cls.get_field_alias("ground_truth_page_images"): Sequence(Features_Image()),
-            cls.get_field_alias("original"): Value("string"),
-            cls.get_field_alias("mime_type"): Value("string"),
-            cls.get_field_alias("modalities"): Sequence(Value("string")),
-            cls.get_field_alias("predictor_info"): Value("string"),
-            cls.get_field_alias("status"): Value("string"),
-            cls.get_field_alias("predicted_doc"): Value("string"),
-            cls.get_field_alias("predicted_segmented_pages"): Value("string"),
-            cls.get_field_alias("predicted_pictures"): Sequence(Features_Image()),
-            cls.get_field_alias("predicted_page_images"): Sequence(Features_Image()),
-            cls.get_field_alias("prediction_format"): Value("string"),
-            cls.get_field_alias("prediction_timings"): Value("string"),
-        }
+        return Features(
+            {
+                cls.get_field_alias("doc_id"): Value("string"),
+                cls.get_field_alias("doc_path"): Value("string"),
+                cls.get_field_alias("doc_hash"): Value("string"),
+                cls.get_field_alias("ground_truth_doc"): Value("string"),
+                cls.get_field_alias("ground_truth_segmented_pages"): Value("string"),
+                cls.get_field_alias("ground_truth_pictures"): Sequence(
+                    Features_Image()
+                ),
+                cls.get_field_alias("ground_truth_page_images"): Sequence(
+                    Features_Image()
+                ),
+                cls.get_field_alias("original"): Value("binary"),
+                cls.get_field_alias("mime_type"): Value("string"),
+                cls.get_field_alias("modalities"): Sequence(Value("string")),
+                cls.get_field_alias("predictor_info"): Value("string"),
+                cls.get_field_alias("status"): Value("string"),
+                cls.get_field_alias("predicted_doc"): Value("string"),
+                cls.get_field_alias("predicted_segmented_pages"): Value("string"),
+                cls.get_field_alias("predicted_pictures"): Sequence(Features_Image()),
+                cls.get_field_alias("predicted_page_images"): Sequence(
+                    Features_Image()
+                ),
+                cls.get_field_alias("prediction_format"): Value("string"),
+                cls.get_field_alias("prediction_timings"): Value("string"),
+            }
+        )
 
     def as_record_dict(self):
         record = super().as_record_dict()
diff --git a/docling_eval/dataset_builders/cvat_preannotation_builder.py b/docling_eval/dataset_builders/cvat_preannotation_builder.py
@@ -199,6 +199,16 @@ def _create_project_properties(self) -> None:
         """
         results = []
 
+        default_attributes = [
+            {
+                "name": "content_layer",
+                "input_type": "select",
+                "mutable": True,
+                "values": ["BODY", "FURNITURE", "BACKGROUND"],
+                "default_value": "BODY",
+            }
+        ]
+
         # Add DocItemLabel properties
         for item in DocItemLabel:
             r, g, b = DocItemLabel.get_color(item)
@@ -208,7 +218,7 @@ def _create_project_properties(self) -> None:
                     "name": item.value,
                     "color": rgb_to_hex(r, g, b),
                     "type": "rectangle",
-                    "attributes": [],
+                    "attributes": default_attributes.copy(),
                 }
             )
 
@@ -247,14 +257,32 @@ def _create_project_properties(self) -> None:
                 )
 
             if item == DocItemLabel.PICTURE:
-                results[-1]["attributes"].append(
-                    {
-                        "name": "json",
-                        "mutable": True,
-                        "input_type": "text",
-                        "values": [""],
-                        "default_value": "",
-                    }
+                results[-1]["attributes"].extend(
+                    [
+                        {
+                            "name": "json",
+                            "mutable": True,
+                            "input_type": "text",
+                            "values": [""],
+                            "default_value": "",
+                        },
+                        {
+                            "name": "type",
+                            "input_type": "select",
+                            "mutable": True,
+                            "values": [
+                                "CHART",
+                                "INFOGRAPHIC",
+                                "SCREENSHOT",
+                                "UI_ELEMENT",
+                                "BARCODE",
+                                "LOGO",
+                                "PICTOGRAM",
+                                "OTHER",
+                            ],
+                            "default_value": "main",
+                        },
+                    ]
                 )
 
         # Add TableComponentLabel properties
diff --git a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py
@@ -345,4 +345,5 @@ def save_to_disk(
             output_dir=self.target,
             num_train_rows=0,
             num_test_rows=count,
+            features=DatasetRecord.features(),
         )
diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py
@@ -412,6 +412,7 @@ def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]:
                 output_dir=target_dataset_dir,
                 num_train_rows=0,
                 num_test_rows=count,
+                features=DatasetRecordWithPrediction.features(),
             )
 
         _log.info(f"Saved {count} records in {chunk_count} chunks to {test_dir}")
diff --git a/docling_eval/utils/utils.py b/docling_eval/utils/utils.py
@@ -53,27 +53,12 @@ def get_binhash(binary_data: bytes) -> str:
 
 
 def write_datasets_info(
-    name: str, output_dir: Path, num_train_rows: int, num_test_rows: int
+    name: str,
+    output_dir: Path,
+    num_train_rows: int,
+    num_test_rows: int,
+    features: Features,
 ):
-    features = Features(
-        {
-            BenchMarkColumns.CONVERTER_VERSION: Value("string"),
-            BenchMarkColumns.STATUS: Value("string"),
-            BenchMarkColumns.DOC_ID: Value("string"),
-            BenchMarkColumns.DOC_PATH: Value("string"),
-            BenchMarkColumns.DOC_HASH: Value("string"),
-            BenchMarkColumns.GROUNDTRUTH: Value("string"),
-            BenchMarkColumns.GROUNDTRUTH_PICTURES: Sequence(Features_Image()),
-            BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: Sequence(Features_Image()),
-            BenchMarkColumns.PREDICTION: Value("string"),
-            BenchMarkColumns.PREDICTION_PICTURES: Sequence(Features_Image()),
-            BenchMarkColumns.PREDICTION_PAGE_IMAGES: Sequence(Features_Image()),
-            BenchMarkColumns.ORIGINAL: Value("string"),
-            BenchMarkColumns.MIMETYPE: Value("string"),
-            BenchMarkColumns.MODALITIES: Sequence(Value("string")),
-        }
-    )
-
     schema = features.to_dict()
     # print(json.dumps(schema, indent=2))
 

Original file line number	Diff line number	Diff line change
`@@ -345,4 +345,5 @@ def save_to_disk(`
`345`	`345`	`output_dir=self.target,`
`346`	`346`	`num_train_rows=0,`
`347`	`347`	`num_test_rows=count,`
	`348`	`+ features=DatasetRecord.features(),`
`348`	`349`	`)`
Original file line number	Diff line number	Diff line change
`@@ -412,6 +412,7 @@ def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]:`
`412`	`412`	`output_dir=target_dataset_dir,`
`413`	`413`	`num_train_rows=0,`
`414`	`414`	`num_test_rows=count,`
	`415`	`+ features=DatasetRecordWithPrediction.features(),`
`415`	`416`	`)`
`416`	`417`
`417`	`418`	`_log.info(f"Saved {count} records in {chunk_count} chunks to {test_dir}")`