Skip to content

Commit 8be2e83

Browse files
feat: Extend the CLI for create-eval to receive the vlm-options and max_new_tokens parameters when the provider is GraniteDocling (#164)
* chore: Remove pinning of docling, docling-core and advance the docling version to 2.56 Signed-off-by: Nikos Livathinos <[email protected]> * feat: Extend the CLI to receive the GraniteDocling-specific options `--granite-docling-vlm-options`, `--max-new-tokens` and propagate them in the `get_prediction_provider()` Signed-off-by: Nikos Livathinos <[email protected]> * chore: Pin the docling-core version to the correct branch Signed-off-by: Nikos Livathinos <[email protected]> * fix: Set the default max_new_tokens CLI parameter to None Signed-off-by: Nikos Livathinos <[email protected]> * fix: Refactor the DoclingEvalCOCOExporter to check for page/image size mismatches within a tolerance Signed-off-by: Nikos Livathinos <[email protected]> * fix: Fix the CLI for coco_exporter.py Signed-off-by: Nikos Livathinos <[email protected]> * fix: Improve the logic in main to set the vlm_options when using GraniteDocling. First priority is given to user-defined CLI parameters, then it is decided by the engines available in the system. Signed-off-by: Nikos Livathinos <[email protected]> --------- Signed-off-by: Nikos Livathinos <[email protected]>
1 parent 740157d commit 8be2e83

File tree

4 files changed

+188
-239
lines changed

4 files changed

+188
-239
lines changed

docling_eval/cli/main.py

Lines changed: 68 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pathlib import Path
88

99
# --- DoclingLayoutOptionsManager definition moved here ---
10-
from typing import Annotated, Dict, List, Optional, Tuple
10+
from typing import Annotated, Dict, List, Optional, Tuple, Union
1111

1212
import typer
1313
from docling.datamodel.accelerator_options import AcceleratorOptions
@@ -27,9 +27,11 @@
2727
PdfPipelineOptions,
2828
VlmPipelineOptions,
2929
)
30+
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
3031
from docling.datamodel.vlm_model_specs import (
3132
GRANITEDOCLING_MLX,
3233
GRANITEDOCLING_TRANSFORMERS,
34+
GRANITEDOCLING_VLLM,
3335
)
3436
from docling.datamodel.vlm_model_specs import (
3537
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
@@ -143,6 +145,34 @@ def get_layout_model_config_names() -> List[str]:
143145
return list(DoclingLayoutOptionsManager.layout_model_configs.keys())
144146

145147

148+
class GraniteDoclingVlmOptionsManager:
    """Registry that maps CLI-facing names to GraniteDocling VLM option presets.

    The presets are the ``GRANITEDOCLING_*`` objects imported from
    ``docling.datamodel.vlm_model_specs``. Lookups return the registered
    object itself, not a copy, so mutating a returned preset is visible to
    every later lookup of the same name.
    """

    # CLI name -> preset object.
    vlm_options_configs: Dict[str, InlineVlmOptions] = {
        "granitedocling_mlx": GRANITEDOCLING_MLX,
        "granitedocling_transformers": GRANITEDOCLING_TRANSFORMERS,
        "granitedocling_vllm": GRANITEDOCLING_VLLM,
    }

    @staticmethod
    def get_granitedocling_vlm_config(vlm_spec: str) -> InlineVlmOptions:
        """Return the preset registered under ``vlm_spec``.

        Raises
        ------
        KeyError: If ``vlm_spec`` is not a registered name. The message lists
            the supported names so a CLI user can correct the value.
        """
        configs = GraniteDoclingVlmOptionsManager.vlm_options_configs
        try:
            return configs[vlm_spec]
        except KeyError:
            # Same exception type as the plain dict lookup, but with an
            # actionable message instead of just the offending key.
            raise KeyError(
                f"Unknown GraniteDocling vlm options {vlm_spec!r}. "
                f"Supported values: {list(configs)}"
            ) from None

    @staticmethod
    def get_granitedocling_vlm_config_names() -> List[str]:
        """Return the registered preset names in registration order."""
        return list(GraniteDoclingVlmOptionsManager.vlm_options_configs)

    @staticmethod
    def get_granitedocling_vlm_config_name(
        vlm_options: InlineVlmOptions,
    ) -> Optional[str]:
        """Return the name of the first preset equal to ``vlm_options``.

        Comparison uses ``==``. Returns ``None`` when no registered preset
        compares equal.
        """
        configs = GraniteDoclingVlmOptionsManager.vlm_options_configs
        for config_name, preset in configs.items():
            if vlm_options == preset:
                return config_name
        return None
174+
175+
146176
# Configure logging
147177
logging_level = logging.WARNING
148178
# logging_level = logging.DEBUG
@@ -331,6 +361,8 @@ def get_prediction_provider(
331361
# Controls orphan text cells only for the programmatic Docling pipeline (PDF_DOCLING)
332362
docling_programmatic_add_orphan_text_cells: Optional[bool] = None,
333363
docling_force_full_page_ocr: Optional[bool] = None,
364+
granite_docling_vlm_options: Optional[InlineVlmOptions] = None,
365+
max_new_tokens: Optional[int] = None,
334366
):
335367
pipeline_options: PaginatedPipelineOptions
336368
"""Get the appropriate prediction provider with default settings."""
@@ -508,12 +540,24 @@ def get_prediction_provider(
508540
pipeline_options.images_scale = image_scale_factor or 2.0
509541
pipeline_options.generate_page_images = True
510542
pipeline_options.generate_picture_images = True
511-
512543
pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
544+
545+
if max_new_tokens:
546+
pipeline_options.vlm_options.max_new_tokens = max_new_tokens
547+
513548
if artifacts_path is not None:
514549
pipeline_options.artifacts_path = artifacts_path
515550

516-
if sys.platform == "darwin":
551+
if granite_docling_vlm_options:
552+
pipeline_options.vlm_options = granite_docling_vlm_options
553+
vlm_option_name = (
554+
GraniteDoclingVlmOptionsManager.get_granitedocling_vlm_config_name(
555+
granite_docling_vlm_options
556+
)
557+
)
558+
if vlm_option_name:
559+
_log.info("running GraniteDocling on %s", granite_docling_vlm_options)
560+
elif sys.platform == "darwin":
517561
try:
518562
import mlx_vlm # type: ignore
519563

@@ -1206,6 +1250,17 @@ def create_eval(
12061250
bool,
12071251
typer.Option(help="Force OCR on entire page (only Docling OCR providers)"),
12081252
] = False,
1253+
granite_docling_vlm_options: Annotated[
1254+
Optional[str],
1255+
typer.Option(
1256+
help="Vlm options for GraniteDocling. Supported values: {}".format(
1257+
GraniteDoclingVlmOptionsManager.get_granitedocling_vlm_config_names()
1258+
)
1259+
),
1260+
] = "granitedocling_transformers",
1261+
max_new_tokens: Annotated[
1262+
Optional[int], typer.Option(help="Override the default value of max_new_tokens")
1263+
] = None,
12091264
):
12101265
"""Create evaluation dataset from existing ground truth."""
12111266
gt_dir = gt_dir or output_dir / "gt_dataset"
@@ -1236,6 +1291,14 @@ def create_eval(
12361291
else None
12371292
)
12381293

1294+
granitedocling_vlm_options_obj = (
1295+
GraniteDoclingVlmOptionsManager.get_granitedocling_vlm_config(
1296+
granite_docling_vlm_options
1297+
)
1298+
if granite_docling_vlm_options
1299+
else None
1300+
)
1301+
12391302
provider = get_prediction_provider(
12401303
provider_type=prediction_provider,
12411304
file_source_path=file_source_path,
@@ -1251,6 +1314,8 @@ def create_eval(
12511314
docling_layout_keep_empty_clusters=docling_layout_keep_empty_clusters,
12521315
docling_programmatic_add_orphan_text_cells=programmatic_add_orphan_text_cells,
12531316
docling_force_full_page_ocr=docling_force_full_page_ocr,
1317+
granite_docling_vlm_options=granitedocling_vlm_options_obj,
1318+
max_new_tokens=max_new_tokens,
12541319
)
12551320

12561321
# Get the dataset name from the benchmark

docling_eval/utils/coco_exporter.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,12 @@
77

88
from datasets import Dataset, load_dataset
99
from docling_core.types.doc.base import BoundingBox, Size
10-
from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
10+
from docling_core.types.doc.document import (
11+
ContentLayer,
12+
DocItem,
13+
DoclingDocument,
14+
PageItem,
15+
)
1116
from docling_core.types.doc.labels import DocItemLabel
1217
from PIL import Image
1318
from pycocotools.coco import COCO
@@ -103,6 +108,9 @@ def __init__(self, docling_eval_ds_path: Path):
103108
r""" """
104109
self._docling_eval_ds_path = docling_eval_ds_path
105110

111+
# Tolerance in size diff between page size and page image size measured in pixels
112+
self._page_image_pixels_tolerance = 2
113+
106114
def export_COCO_and_predictions(
107115
self,
108116
split: str,
@@ -126,6 +134,8 @@ def export_COCO(
126134
source_doc_column: str = "GT",
127135
):
128136
r"""
137+
Export COCO dataset
138+
129139
Parameters
130140
----------
131141
save_dir: Location to save the exported COCO dataset
@@ -221,6 +231,8 @@ def _extract_layout_coco_annotations(
221231
annotation_id_offset: int,
222232
) -> Tuple[List[Dict], List[Dict], int, int]:
223233
r"""
234+
Extract layout information from DoclingDocument into coco-tools format
235+
224236
Returns
225237
-------
226238
images: List of dict in COCO format with the images in the document
@@ -264,9 +276,10 @@ def _extract_layout_coco_annotations(
264276
if page.image is not None and page_no > len(doc_images):
265277
img: Image.Image = page.image.pil_image # type: ignore
266278
if img:
267-
assert (
268-
img.width == page_size.width and img.height == page_size.height
269-
)
279+
# Check the tolerance for the page/image size mismatch
280+
page_size = self._check_page_image_size(page)
281+
if not page_size:
282+
continue
270283

271284
image_filename = (
272285
f"{doc_id}.png"
@@ -528,6 +541,31 @@ def _extract_layout_predictions(
528541
category_ids.append(category_id)
529542
return category_ids, scores, bboxes
530543

544+
def _check_page_image_size(self, page: PageItem) -> Optional[Size]:
    r"""
    Check if the page size and page image size are within the allowed tolerance

    The tolerance is read from `self._page_image_pixels_tolerance` and is applied
    independently to the width and to the height.

    Parameters
    ----------
    page: Page whose `size` is compared against the size of `page.image.pil_image`

    Returns
    -------
    The element-wise smaller of the two sizes if the tolerance is respected.
    None if the page has no image or if either dimension differs by more than
    the tolerance.
    """
    page_size = page.size

    # Guard against a missing image instead of failing with AttributeError
    img: Optional[Image.Image] = (
        page.image.pil_image if page.image is not None else None  # type: ignore
    )
    if img is None:
        _log.warning("Page has no image; cannot check the page/image size")
        return None

    tolerance = self._page_image_pixels_tolerance
    width_diff = abs(img.width - page_size.width)
    height_diff = abs(img.height - page_size.height)
    if width_diff > tolerance or height_diff > tolerance:
        # %s keeps the values as they are: %d would truncate a fractional page
        # size and %f would render an integer tolerance as e.g. 2.000000
        _log.error(
            "Page/image size diff exceeds tolerance (%s): (%s, %s) vs (%s, %s)",
            tolerance,
            page_size.width,
            page_size.height,
            img.width,
            img.height,
        )
        return None

    return Size(
        width=min(page_size.width, img.width),
        height=min(page_size.height, img.height),
    )
568+
531569

532570
def main():
533571
r""" """
@@ -592,7 +630,7 @@ def main():
592630
args.save_dir,
593631
doc_label_to_valid_label_mapping,
594632
)
595-
elif args.operation.upper() == "predictions":
633+
elif args.operation.upper() == "PREDICTIONS":
596634
exporter.export_predictions_wrt_original_COCO(
597635
"test",
598636
args.save_dir,

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ classifiers = [
2525
]
2626
requires-python = '>=3.10,<4.0'
2727
dependencies = [
28-
'docling[vlm] (>=2.42.0,<3.0.0)',
28+
'docling[vlm] (>=2.56.1,<3.0.0)',
2929
"docling-core>=2.48.0,<3.0.0",
3030
'pydantic (>=2.0.0,<3.0.0)',
3131
'lxml (>=5.3.0,<6.0.0)',

0 commit comments

Comments
 (0)