@@ -227,10 +227,13 @@ def get_dataset_builder(
227227
228228def get_prediction_provider (
229229 provider_type : PredictionProviderType ,
230+ * ,
230231 file_source_path : Optional [Path ] = None ,
231232 file_prediction_format : Optional [PredictionFormats ] = None ,
232233 do_visualization : bool = True ,
234+ do_table_structure : bool = True ,
233235 artifacts_path : Optional [Path ] = None ,
236+ image_scale_factor : Optional [float ] = None ,
234237):
235238 pipeline_options : PaginatedPipelineOptions
236239 """Get the appropriate prediction provider with default settings."""
@@ -248,10 +251,10 @@ def get_prediction_provider(
248251 pipeline_options = PdfPipelineOptions (
249252 do_ocr = True ,
250253 ocr_options = ocr_options ,
251- do_table_structure = True ,
254+ do_table_structure = do_table_structure ,
252255 )
253256
254- pipeline_options .images_scale = 2.0
257+ pipeline_options .images_scale = image_scale_factor or 2.0
255258 pipeline_options .generate_page_images = True
256259 pipeline_options .generate_picture_images = True
257260 pipeline_options .generate_parsed_pages = True
@@ -278,10 +281,10 @@ def get_prediction_provider(
278281 pipeline_options = PdfPipelineOptions (
279282 do_ocr = True ,
280283 ocr_options = ocr_options ,
281- do_table_structure = True ,
284+ do_table_structure = do_table_structure ,
282285 )
283286
284- pipeline_options .images_scale = 2.0
287+ pipeline_options .images_scale = image_scale_factor or 2.0
285288 pipeline_options .generate_page_images = True
286289 pipeline_options .generate_picture_images = True
287290
@@ -308,20 +311,20 @@ def get_prediction_provider(
308311 pdf_pipeline_options = PdfPipelineOptions (
309312 do_ocr = False ,
310313 ocr_options = ocr_options , # we need to provide OCR options in order to not break the parquet serialization
311- do_table_structure = True ,
314+ do_table_structure = do_table_structure ,
312315 )
313316
314- pdf_pipeline_options .images_scale = 2.0
317+ pdf_pipeline_options .images_scale = image_scale_factor or 2.0
315318 pdf_pipeline_options .generate_page_images = True
316319 pdf_pipeline_options .generate_picture_images = True
317320
318321 ocr_pipeline_options = PdfPipelineOptions (
319322 do_ocr = True ,
320323 ocr_options = ocr_options , # we need to provide OCR options in order to not break the parquet serialization
321- do_table_structure = True ,
324+ do_table_structure = do_table_structure ,
322325 )
323326
324- ocr_pipeline_options .images_scale = 2.0
327+ ocr_pipeline_options .images_scale = image_scale_factor or 2.0
325328 ocr_pipeline_options .generate_page_images = True
326329 ocr_pipeline_options .generate_picture_images = True
327330
@@ -343,20 +346,20 @@ def get_prediction_provider(
343346 elif provider_type == PredictionProviderType .SMOLDOCLING :
344347 pipeline_options = VlmPipelineOptions ()
345348
346- pipeline_options .images_scale = 2.0
349+ pipeline_options .images_scale = image_scale_factor or 2.0
347350 pipeline_options .generate_page_images = True
348351 pipeline_options .generate_picture_images = True
349352
350353 pipeline_options .vlm_options = smoldocling_vlm_conversion_options
354+ if artifacts_path is not None :
355+ pipeline_options .artifacts_path = artifacts_path
356+
351357 if sys .platform == "darwin" :
352358 try :
353359 import mlx_vlm # type: ignore
354360
355361 pipeline_options .vlm_options = smoldocling_vlm_mlx_conversion_options
356362
357- if artifacts_path is not None :
358- pipeline_options .artifacts_path = artifacts_path
359-
360363 except ImportError :
361364 _log .warning (
362365 "To run SmolDocling faster, please install mlx-vlm:\n "
@@ -918,6 +921,13 @@ def create_eval(
918921 do_visualization : Annotated [
919922 bool , typer .Option (help = "visualize the predictions" )
920923 ] = True ,
924+ image_scale_factor : Annotated [
925+ float ,
926+ typer .Option (help = "Scale of page images used in prediction (only Docling)" ),
927+ ] = 2.0 ,
928+ do_table_structure : Annotated [
929+ bool , typer .Option (help = "Include table structure predictions (only Docling)" )
930+ ] = True ,
921931):
922932 """Create evaluation dataset from existing ground truth."""
923933 gt_dir = gt_dir or output_dir / "gt_dataset"
@@ -946,6 +956,8 @@ def create_eval(
946956 file_prediction_format = file_format ,
947957 artifacts_path = artifacts_path ,
948958 do_visualization = do_visualization ,
959+ image_scale_factor = image_scale_factor ,
960+ do_table_structure = do_table_structure ,
949961 )
950962
951963 # Get the dataset name from the benchmark
@@ -993,6 +1005,13 @@ def create(
9931005 do_visualization : Annotated [
9941006 bool , typer .Option (help = "visualize the predictions" )
9951007 ] = True ,
1008+ image_scale_factor : Annotated [
1009+ float ,
1010+ typer .Option (help = "Scale of page images used in prediction (only Docling)" ),
1011+ ] = 2.0 ,
1012+ do_table_structure : Annotated [
1013+ bool , typer .Option (help = "Include table structure predictions (only Docling)" )
1014+ ] = True ,
9961015):
9971016 """Create both ground truth and evaluation datasets in one step."""
9981017 # First create ground truth
@@ -1020,6 +1039,8 @@ def create(
10201039 file_prediction_format = file_prediction_format ,
10211040 file_source_path = file_source_path ,
10221041 do_visualization = do_visualization ,
1042+ image_scale_factor = image_scale_factor ,
1043+ do_table_structure = do_table_structure ,
10231044 )
10241045 else :
10251046 _log .info (
0 commit comments