@@ -327,6 +327,7 @@ def get_prediction_provider(
327327 docling_layout_model_spec : Optional [LayoutModelConfig ] = None ,
328328 docling_layout_create_orphan_clusters : Optional [bool ] = None ,
329329 docling_layout_keep_empty_clusters : Optional [bool ] = None ,
330+ docling_force_full_page_ocr : Optional [bool ] = None ,
330331):
331332 pipeline_options : PaginatedPipelineOptions
332333 """Get the appropriate prediction provider with default settings."""
@@ -339,6 +340,7 @@ def get_prediction_provider(
339340
340341 ocr_options : OcrOptions = ocr_factory .create_options ( # type: ignore
341342 kind = "easyocr" ,
343+ force_full_page_ocr = docling_force_full_page_ocr ,
342344 )
343345 # Use all CPU cores
344346 accelerator_options = AcceleratorOptions (
@@ -386,6 +388,7 @@ def get_prediction_provider(
386388
387389 ocr_options : OcrOptions = ocr_factory .create_options ( # type: ignore
388390 kind = "ocrmac" ,
391+ force_full_page_ocr = docling_force_full_page_ocr ,
389392 )
390393
391394 pipeline_options = PdfPipelineOptions (
@@ -415,6 +418,7 @@ def get_prediction_provider(
415418
416419 ocr_options : OcrOptions = ocr_factory .create_options ( # type: ignore
417420 kind = "easyocr" ,
421+ force_full_page_ocr = docling_force_full_page_ocr ,
418422 )
419423
420424 pdf_pipeline_options = PdfPipelineOptions (
@@ -468,7 +472,7 @@ def get_prediction_provider(
468472 import mlx_vlm # type: ignore
469473
470474 pipeline_options .vlm_options = smoldocling_vlm_mlx_conversion_options
471-
475+ _log . info ( "running SmolDocling on MLX!" )
472476 except ImportError :
473477 _log .warning (
474478 "To run SmolDocling faster, please install mlx-vlm:\n "
@@ -503,7 +507,7 @@ def get_prediction_provider(
503507 import mlx_vlm # type: ignore
504508
505509 pipeline_options .vlm_options = GRANITEDOCLING_MLX
506-
510+ _log . info ( "running GraniteDocling on MLX!" )
507511 except ImportError :
508512 _log .warning (
509513 "To run SmolDocling faster, please install mlx-vlm:\n "
@@ -1175,6 +1179,10 @@ def create_eval(
11751179 do_table_structure : Annotated [
11761180 bool , typer .Option (help = "Include table structure predictions (only Docling)" )
11771181 ] = True ,
1182+ docling_force_full_page_ocr : Annotated [
1183+ bool ,
1184+ typer .Option (help = "Force OCR on entire page (only Docling OCR providers)" ),
1185+ ] = False ,
11781186):
11791187 """Create evaluation dataset from existing ground truth."""
11801188 gt_dir = gt_dir or output_dir / "gt_dataset"
@@ -1218,6 +1226,7 @@ def create_eval(
12181226 docling_layout_model_spec = docling_layout_model_spec_obj ,
12191227 docling_layout_create_orphan_clusters = docling_layout_create_orphan_clusters ,
12201228 docling_layout_keep_empty_clusters = docling_layout_keep_empty_clusters ,
1229+ docling_force_full_page_ocr = docling_force_full_page_ocr ,
12211230 )
12221231
12231232 # Get the dataset name from the benchmark
@@ -1272,6 +1281,10 @@ def create(
12721281 do_table_structure : Annotated [
12731282 bool , typer .Option (help = "Include table structure predictions (only Docling)" )
12741283 ] = True ,
1284+ docling_force_full_page_ocr : Annotated [
1285+ bool ,
1286+ typer .Option (help = "Force OCR on entire page (only Docling OCR providers)" ),
1287+ ] = False ,
12751288):
12761289 """Create both ground truth and evaluation datasets in one step."""
12771290 # First create ground truth
@@ -1301,6 +1314,7 @@ def create(
13011314 do_visualization = do_visualization ,
13021315 image_scale_factor = image_scale_factor ,
13031316 do_table_structure = do_table_structure ,
1317+ docling_force_full_page_ocr = docling_force_full_page_ocr ,
13041318 )
13051319 else :
13061320 _log .info (
0 commit comments