feat: Extend the CLI for create-eval to receive the vlm-options and max_new_tokens parameters when the provider is GraniteDocling (#164)

nikos-livathinos · web-flow · commit 8be2e8399b9f · 2025-10-22T14:47:35.000+02:00
* chore: Remove pinning of docling, docling-core and advance the docling version to 2.56

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Extend the CLI to receive the GraniteDocling-specific options `--granite-docling-vlm-options`
and `--max-new-tokens`, and propagate them to `get_prediction_provider()`

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* chore: Pin the docling-core version to the correct branch

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Set the default max_new_tokens CLI parameter to None

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Refactor the DoclingEvalCOCOExporter to check for page/image size mismatches within a tolerance

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Fix the CLI for coco_exporter.py

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Improve the logic in main to set the vlm_options when using GraniteDocling.
User-defined CLI parameters take priority; otherwise the vlm_options are chosen based on
the engines available in the system.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

---------

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py
@@ -7,7 +7,7 @@
 from pathlib import Path
 
 # --- DoclingLayoutOptionsManager definition moved here ---
-from typing import Annotated, Dict, List, Optional, Tuple
+from typing import Annotated, Dict, List, Optional, Tuple, Union
 
 import typer
 from docling.datamodel.accelerator_options import AcceleratorOptions
@@ -27,9 +27,11 @@
     PdfPipelineOptions,
     VlmPipelineOptions,
 )
+from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
 from docling.datamodel.vlm_model_specs import (
     GRANITEDOCLING_MLX,
     GRANITEDOCLING_TRANSFORMERS,
+    GRANITEDOCLING_VLLM,
 )
 from docling.datamodel.vlm_model_specs import (
     SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
@@ -143,6 +145,34 @@ def get_layout_model_config_names() -> List[str]:
         return list(DoclingLayoutOptionsManager.layout_model_configs.keys())
 
 
+class GraniteDoclingVlmOptionsManager:
+    """Registry of the GraniteDocling VLM option presets, keyed by config name."""
+
+    # Config name -> GraniteDocling VLM spec imported from docling's vlm_model_specs
+    vlm_options_configs = {
+        "granitedocling_mlx": GRANITEDOCLING_MLX,
+        "granitedocling_transformers": GRANITEDOCLING_TRANSFORMERS,
+        "granitedocling_vllm": GRANITEDOCLING_VLLM,
+    }
+
+    @staticmethod
+    def get_granitedocling_vlm_config(vlm_spec: str) -> InlineVlmOptions:
+        """Return the VLM options registered under `vlm_spec`.
+
+        Raises KeyError if `vlm_spec` is not a key of `vlm_options_configs`.
+        """
+        return GraniteDoclingVlmOptionsManager.vlm_options_configs[vlm_spec]
+
+    @staticmethod
+    def get_granitedocling_vlm_config_names() -> List[str]:
+        """Return the registered config names as a list."""
+        return list(GraniteDoclingVlmOptionsManager.vlm_options_configs.keys())
+
+    @staticmethod
+    def get_granitedocling_vlm_config_name(
+        vlm_options: InlineVlmOptions,
+    ) -> Optional[str]:
+        """Reverse lookup: return the config name whose options equal `vlm_options`.
+
+        The comparison uses `==`; the first matching name is returned, or None
+        when no registered options compare equal.
+        """
+        for (
+            config_name,
+            vlm_opt,
+        ) in GraniteDoclingVlmOptionsManager.vlm_options_configs.items():
+            if vlm_options == vlm_opt:
+                return config_name
+        return None
+
+
 # Configure logging
 logging_level = logging.WARNING
 # logging_level = logging.DEBUG
@@ -331,6 +361,8 @@ def get_prediction_provider(
     # Controls orphan text cells only for the programmatic Docling pipeline (PDF_DOCLING)
     docling_programmatic_add_orphan_text_cells: Optional[bool] = None,
     docling_force_full_page_ocr: Optional[bool] = None,
+    granite_docling_vlm_options: Optional[InlineVlmOptions] = None,
+    max_new_tokens: Optional[int] = None,
 ):
     pipeline_options: PaginatedPipelineOptions
     """Get the appropriate prediction provider with default settings."""
@@ -508,12 +540,24 @@ def get_prediction_provider(
         pipeline_options.images_scale = image_scale_factor or 2.0
         pipeline_options.generate_page_images = True
         pipeline_options.generate_picture_images = True
-
         pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
+
+        if max_new_tokens:
+            pipeline_options.vlm_options.max_new_tokens = max_new_tokens
+
         if artifacts_path is not None:
             pipeline_options.artifacts_path = artifacts_path
 
-        if sys.platform == "darwin":
+        if granite_docling_vlm_options:
+            pipeline_options.vlm_options = granite_docling_vlm_options
+            vlm_option_name = (
+                GraniteDoclingVlmOptionsManager.get_granitedocling_vlm_config_name(
+                    granite_docling_vlm_options
+                )
+            )
+            if vlm_option_name:
+                _log.info("running GraniteDocling on %s", granite_docling_vlm_options)
+        elif sys.platform == "darwin":
             try:
                 import mlx_vlm  # type: ignore
 
@@ -1206,6 +1250,17 @@ def create_eval(
         bool,
         typer.Option(help="Force OCR on entire page (only Docling OCR providers)"),
     ] = False,
+    granite_docling_vlm_options: Annotated[
+        Optional[str],
+        typer.Option(
+            help="Vlm options for GraniteDocling. Supported values: {}".format(
+                GraniteDoclingVlmOptionsManager.get_granitedocling_vlm_config_names()
+            )
+        ),
+    ] = "granitedocling_transformers",
+    max_new_tokens: Annotated[
+        Optional[int], typer.Option(help="Override the default value of max_new_tokens")
+    ] = None,
 ):
     """Create evaluation dataset from existing ground truth."""
     gt_dir = gt_dir or output_dir / "gt_dataset"
@@ -1236,6 +1291,14 @@ def create_eval(
             else None
         )
 
+        granitedocling_vlm_options_obj = (
+            GraniteDoclingVlmOptionsManager.get_granitedocling_vlm_config(
+                granite_docling_vlm_options
+            )
+            if granite_docling_vlm_options
+            else None
+        )
+
         provider = get_prediction_provider(
             provider_type=prediction_provider,
             file_source_path=file_source_path,
@@ -1251,6 +1314,8 @@ def create_eval(
             docling_layout_keep_empty_clusters=docling_layout_keep_empty_clusters,
             docling_programmatic_add_orphan_text_cells=programmatic_add_orphan_text_cells,
             docling_force_full_page_ocr=docling_force_full_page_ocr,
+            granite_docling_vlm_options=granitedocling_vlm_options_obj,
+            max_new_tokens=max_new_tokens,
         )
 
         # Get the dataset name from the benchmark
diff --git a/docling_eval/utils/coco_exporter.py b/docling_eval/utils/coco_exporter.py
@@ -7,7 +7,12 @@
 
 from datasets import Dataset, load_dataset
 from docling_core.types.doc.base import BoundingBox, Size
-from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
+from docling_core.types.doc.document import (
+    ContentLayer,
+    DocItem,
+    DoclingDocument,
+    PageItem,
+)
 from docling_core.types.doc.labels import DocItemLabel
 from PIL import Image
 from pycocotools.coco import COCO
@@ -103,6 +108,9 @@ def __init__(self, docling_eval_ds_path: Path):
         r""" """
         self._docling_eval_ds_path = docling_eval_ds_path
 
+        # Tolerance in size diff between page size and page image size measured in pixels
+        self._page_image_pixels_tolerance = 2
+
     def export_COCO_and_predictions(
         self,
         split: str,
@@ -126,6 +134,8 @@ def export_COCO(
         source_doc_column: str = "GT",
     ):
         r"""
+        Export COCO dataset
+
         Parameters
         ----------
         save_dir: Location to save the exported COCO dataset
@@ -221,6 +231,8 @@ def _extract_layout_coco_annotations(
         annotation_id_offset: int,
     ) -> Tuple[List[Dict], List[Dict], int, int]:
         r"""
+        Extract layout information from DoclingDocument into coco-tools format
+
         Returns
         -------
         images: List of dict in COCO format with the images in the document
@@ -264,9 +276,10 @@ def _extract_layout_coco_annotations(
             if page.image is not None and page_no > len(doc_images):
                 img: Image.Image = page.image.pil_image  # type: ignore
                 if img:
-                    assert (
-                        img.width == page_size.width and img.height == page_size.height
-                    )
+                    # Check the tolerance for the page/image size mismatch
+                    page_size = self._check_page_image_size(page)
+                    if not page_size:
+                        continue
 
                     image_filename = (
                         f"{doc_id}.png"
@@ -528,6 +541,31 @@ def _extract_layout_predictions(
                 category_ids.append(category_id)
         return category_ids, scores, bboxes
 
+    def _check_page_image_size(self, page: PageItem) -> Optional[Size]:
+        r"""
+        Check if the page size and page image size are within the allowed tolerance
+
+        The width and height differences are each compared against
+        `self._page_image_pixels_tolerance`. If either one exceeds it, an error is logged.
+
+        Parameters
+        ----------
+        page: Page whose `size` and `image.pil_image` are compared. `page.image` is
+              dereferenced without a None check.
+
+        Returns
+        -------
+        Size with the per-dimension minimum of the page size and the image size if the
+        tolerance is respected, otherwise None
+        """
+        page_size = page.size
+        img: Image.Image = page.image.pil_image  # type: ignore
+        # Reject the page when either dimension differs by more than the tolerance
+        if (
+            abs(img.width - page_size.width) > self._page_image_pixels_tolerance
+            or abs(img.height - page_size.height) > self._page_image_pixels_tolerance
+        ):
+            _log.error(
+                "Page/image size diff exceeds tolerance (%f): (%d, %d) vs (%d, %d)",
+                self._page_image_pixels_tolerance,
+                page_size.width,
+                page_size.height,
+                img.width,
+                img.height,
+            )
+            return None
+        # Within tolerance: take the smaller value of each dimension independently
+        return Size(
+            width=min(page_size.width, img.width),
+            height=min(page_size.height, img.height),
+        )
+
 
 def main():
     r""" """
@@ -592,7 +630,7 @@ def main():
             args.save_dir,
             doc_label_to_valid_label_mapping,
         )
-    elif args.operation.upper() == "predictions":
+    elif args.operation.upper() == "PREDICTIONS":
         exporter.export_predictions_wrt_original_COCO(
             "test",
             args.save_dir,
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,7 +25,7 @@ classifiers = [
 ]
 requires-python = '>=3.10,<4.0'
 dependencies = [
-    'docling[vlm] (>=2.42.0,<3.0.0)',
+    'docling[vlm] (>=2.56.1,<3.0.0)',
     "docling-core>=2.48.0,<3.0.0",
     'pydantic (>=2.0.0,<3.0.0)',
     'lxml (>=5.3.0,<6.0.0)',
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@ classifiers = [`
`25`	`25`	`]`
`26`	`26`	`requires-python = '>=3.10,<4.0'`
`27`	`27`	`dependencies = [`
`28`		`- 'docling[vlm] (>=2.42.0,<3.0.0)',`
	`28`	`+ 'docling[vlm] (>=2.56.1,<3.0.0)',`
`29`	`29`	`"docling-core>=2.48.0,<3.0.0",`
`30`	`30`	`'pydantic (>=2.0.0,<3.0.0)',`
`31`	`31`	`'lxml (>=5.3.0,<6.0.0)',`