feat: Add extra args for docling-provider and default annotations for CVAT (#98)

cau-git · web-flow · commit 7903b6a1d9f3 · 2025-05-14T17:47:15.000+02:00
* Add README for Docling-DPBench

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add CVAT annotation features, fix DatasetRecord.features usage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* dev: Updates for CVAT and docling provider args

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* documentation for SmolDocling, fix artifacts_path

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lock

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py
@@ -227,10 +227,13 @@ def get_dataset_builder(
 
 def get_prediction_provider(
     provider_type: PredictionProviderType,
+    *,
     file_source_path: Optional[Path] = None,
     file_prediction_format: Optional[PredictionFormats] = None,
     do_visualization: bool = True,
+    do_table_structure: bool = True,
     artifacts_path: Optional[Path] = None,
+    image_scale_factor: Optional[float] = None,
 ):
     pipeline_options: PaginatedPipelineOptions
     """Get the appropriate prediction provider with default settings."""
@@ -248,10 +251,10 @@ def get_prediction_provider(
         pipeline_options = PdfPipelineOptions(
             do_ocr=True,
             ocr_options=ocr_options,
-            do_table_structure=True,
+            do_table_structure=do_table_structure,
         )
 
-        pipeline_options.images_scale = 2.0
+        pipeline_options.images_scale = image_scale_factor or 2.0
         pipeline_options.generate_page_images = True
         pipeline_options.generate_picture_images = True
         pipeline_options.generate_parsed_pages = True
@@ -278,10 +281,10 @@ def get_prediction_provider(
         pipeline_options = PdfPipelineOptions(
             do_ocr=True,
             ocr_options=ocr_options,
-            do_table_structure=True,
+            do_table_structure=do_table_structure,
         )
 
-        pipeline_options.images_scale = 2.0
+        pipeline_options.images_scale = image_scale_factor or 2.0
         pipeline_options.generate_page_images = True
         pipeline_options.generate_picture_images = True
 
@@ -308,20 +311,20 @@ def get_prediction_provider(
         pdf_pipeline_options = PdfPipelineOptions(
             do_ocr=False,
             ocr_options=ocr_options,  # we need to provide OCR options in order to not break the parquet serialization
-            do_table_structure=True,
+            do_table_structure=do_table_structure,
         )
 
-        pdf_pipeline_options.images_scale = 2.0
+        pdf_pipeline_options.images_scale = image_scale_factor or 2.0
         pdf_pipeline_options.generate_page_images = True
         pdf_pipeline_options.generate_picture_images = True
 
         ocr_pipeline_options = PdfPipelineOptions(
             do_ocr=True,
             ocr_options=ocr_options,  # we need to provide OCR options in order to not break the parquet serialization
-            do_table_structure=True,
+            do_table_structure=do_table_structure,
         )
 
-        ocr_pipeline_options.images_scale = 2.0
+        ocr_pipeline_options.images_scale = image_scale_factor or 2.0
         ocr_pipeline_options.generate_page_images = True
         ocr_pipeline_options.generate_picture_images = True
 
@@ -343,20 +346,20 @@ def get_prediction_provider(
     elif provider_type == PredictionProviderType.SMOLDOCLING:
         pipeline_options = VlmPipelineOptions()
 
-        pipeline_options.images_scale = 2.0
+        pipeline_options.images_scale = image_scale_factor or 2.0
         pipeline_options.generate_page_images = True
         pipeline_options.generate_picture_images = True
 
         pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+        if artifacts_path is not None:
+            pipeline_options.artifacts_path = artifacts_path
+
         if sys.platform == "darwin":
             try:
                 import mlx_vlm  # type: ignore
 
                 pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 
-                if artifacts_path is not None:
-                    pipeline_options.artifacts_path = artifacts_path
-
             except ImportError:
                 _log.warning(
                     "To run SmolDocling faster, please install mlx-vlm:\n"
@@ -918,6 +921,13 @@ def create_eval(
     do_visualization: Annotated[
         bool, typer.Option(help="visualize the predictions")
     ] = True,
+    image_scale_factor: Annotated[
+        float,
+        typer.Option(help="Scale of page images used in prediction (only Docling)"),
+    ] = 2.0,
+    do_table_structure: Annotated[
+        bool, typer.Option(help="Include table structure predictions (only Docling)")
+    ] = True,
 ):
     """Create evaluation dataset from existing ground truth."""
     gt_dir = gt_dir or output_dir / "gt_dataset"
@@ -946,6 +956,8 @@ def create_eval(
             file_prediction_format=file_format,
             artifacts_path=artifacts_path,
             do_visualization=do_visualization,
+            image_scale_factor=image_scale_factor,
+            do_table_structure=do_table_structure,
         )
 
         # Get the dataset name from the benchmark
@@ -993,6 +1005,13 @@ def create(
     do_visualization: Annotated[
         bool, typer.Option(help="visualize the predictions")
     ] = True,
+    image_scale_factor: Annotated[
+        float,
+        typer.Option(help="Scale of page images used in prediction (only Docling)"),
+    ] = 2.0,
+    do_table_structure: Annotated[
+        bool, typer.Option(help="Include table structure predictions (only Docling)")
+    ] = True,
 ):
     """Create both ground truth and evaluation datasets in one step."""
     # First create ground truth
@@ -1020,6 +1039,8 @@ def create(
             file_prediction_format=file_prediction_format,
             file_source_path=file_source_path,
             do_visualization=do_visualization,
+            image_scale_factor=image_scale_factor,
+            do_table_structure=do_table_structure,
         )
     else:
         _log.info(
diff --git a/docling_eval/datamodels/cvat_types.py b/docling_eval/datamodels/cvat_types.py
@@ -62,7 +62,7 @@ def get_color(label: "TableComponentLabel") -> Tuple[int, int, int]:
             TableComponentLabel.TABLE_COL: (0, 255, 0),
             TableComponentLabel.TABLE_GROUP: (0, 0, 255),
         }
-        return color_map[label]
+        return color_map.get(label, (0, 0, 0))
 
 
 class BenchMarkDirs(BaseModel):
diff --git a/docling_eval/dataset_builders/cvat_preannotation_builder.py b/docling_eval/dataset_builders/cvat_preannotation_builder.py
@@ -9,6 +9,7 @@
 from docling_core.types.doc import DocItemLabel
 from docling_core.types.doc.base import BoundingBox, CoordOrigin
 from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
+from docling_core.types.doc.labels import GraphCellLabel, TableCellLabel
 from docling_core.types.io import DocumentStream
 from PIL import Image
 from pydantic import ValidationError
@@ -211,6 +212,17 @@ def _create_project_properties(self) -> None:
 
         # Add DocItemLabel properties
         for item in DocItemLabel:
+            if item in [
+                DocItemLabel.KEY_VALUE_REGION,
+                DocItemLabel.PARAGRAPH,
+                DocItemLabel.PAGE_HEADER,
+                DocItemLabel.PAGE_FOOTER,
+                DocItemLabel.TITLE,
+                DocItemLabel.CHART,
+                DocItemLabel.REFERENCE,
+            ]:
+                continue
+
             r, g, b = DocItemLabel.get_color(item)
 
             results.append(
@@ -286,15 +298,42 @@ def _create_project_properties(self) -> None:
                 )
 
         # Add TableComponentLabel properties
-        for table_item in TableComponentLabel:
-            r, g, b = TableComponentLabel.get_color(table_item)
+        for table_component_label in TableComponentLabel:
+            r, g, b = TableComponentLabel.get_color(table_component_label)
 
             results.append(
                 {
-                    "name": table_item.value,
+                    "name": table_component_label.value,
                     "color": rgb_to_hex(r, g, b),
                     "type": "rectangle",
-                    "attributes": [],
+                    "attributes": default_attributes.copy(),
+                }
+            )
+
+        # Add TableCellLabel properties
+        for table_cell_label in TableCellLabel:
+            r, g, b = TableCellLabel.get_color(table_cell_label)
+
+            results.append(
+                {
+                    "name": table_cell_label.value,
+                    "color": rgb_to_hex(r, g, b),
+                    "type": "rectangle",
+                    "attributes": default_attributes.copy(),
+                }
+            )
+
+        for graph_item in GraphCellLabel:
+            if graph_item in [GraphCellLabel.UNSPECIFIED, GraphCellLabel.CHECKBOX]:
+                continue
+            r, g, b = GraphCellLabel.get_color(graph_item)
+
+            results.append(
+                {
+                    "name": graph_item.value,
+                    "color": rgb_to_hex(r, g, b),
+                    "type": "rectangle",
+                    "attributes": default_attributes.copy(),
                 }
             )
 
diff --git a/docs/SmolDocling-custom-eval.md b/docs/SmolDocling-custom-eval.md
@@ -0,0 +1,68 @@
+# Evaluate SmolDocling with docling-eval
+
+Below are instructions to evaluate custom weights for SmolDocling with docling-eval.
+
+## Prepare SmolDocling weights for docling
+
+Docling can run SmolDocling out of the box. By default, it will download the model weights from Hugging Face and keep them in the user's `~/.cache` directory.
+If you want to inject custom weights and config, you need to prepare a directory like this:
+
+```shell
+models/ # the dir you will point docling-eval to (see below)
+├─ ds4sd--SmolDocling-256M-preview/ # the dir you place custom weights in. The name _must_ match the SmolDocling HF repo id, with / replaced by --.
+```
+
+## Run docling-eval
+
+You can now run `docling-eval` as shown below. Example given for the Docling-DocLayNetV1 dataset:
+
+```shell
+# Create GT dataset for DocLayNet v1 test set (only once)
+mkdir benchmarks
+
+huggingface-cli login --token your_hf_token_123 # a token of type "read" is sufficient; create one here: https://huggingface.co/settings/tokens
+huggingface-cli download --repo-type dataset --local-dir ./benchmarks/DLN_GT/gt_dataset ds4sd/Docling-DocLayNetV1
+# alternatively, create the GT dataset yourself: docling_eval create-gt --benchmark DocLayNetV1 --output-dir ./benchmarks/DLN_GT/ 
+
+## --- Do benchmarks ---
+export HF_HUB_OFFLINE=1 # no communication with huggingface from now!
+
+# Make predictions for smoldocling
+docling_eval create-eval \
+  --benchmark DocLayNetV1 \
+  --gt-dir ./benchmarks/DLN_GT/gt_dataset/ \
+  --output-dir ./benchmarks/DLN_smoldocling_experiment1/ \
+  --prediction-provider SmolDocling \
+  --artifacts-path /path/to/your/models/ # see above. Must include the ds4sd--SmolDocling-256M-preview dir.
+
+# Layout metrics eval
+docling_eval evaluate \
+  --modality layout \
+  --benchmark DocLayNetV1 \
+  --output-dir ./benchmarks/DLN_smoldocling_experiment1/ 
+
+docling_eval visualize \
+  --modality layout \
+  --benchmark DocLayNetV1 \
+  --output-dir ./benchmarks/DLN_smoldocling_experiment1/ 
+
+# Text metrics eval
+docling_eval evaluate \
+  --modality markdown_text \
+  --benchmark DocLayNetV1 \
+  --output-dir ./benchmarks/DLN_smoldocling_experiment1/ 
+
+# Text metrics visualization
+docling_eval visualize \
+  --modality markdown_text \
+  --benchmark DocLayNetV1 \
+  --output-dir ./benchmarks/DLN_smoldocling_experiment1/ 
+  
+```
+To repeat this with another set of weights, please replace the content of your `models/ds4sd--SmolDocling-256M-preview` directory, and adjust the
+experiment name used in your `--output-dir` arguments.
+
+**Note**: macOS users should use weights converted with mlx-vlm.
+Install `mlx-vlm`, convert the weights, and place them in a `ds4sd--SmolDocling-256M-preview-mlx-bf16` subdirectory instead.
+
+
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml

Original file line number	Diff line number	Diff line change
`@@ -62,7 +62,7 @@ def get_color(label: "TableComponentLabel") -> Tuple[int, int, int]:`
`62`	`62`	`TableComponentLabel.TABLE_COL: (0, 255, 0),`
`63`	`63`	`TableComponentLabel.TABLE_GROUP: (0, 0, 255),`
`64`	`64`	`}`
`65`		`- return color_map[label]`
	`65`	`+ return color_map.get(label, (0, 0, 0))`
`66`	`66`
`67`	`67`
`68`	`68`	`class BenchMarkDirs(BaseModel):`