Skip to content

Commit ac21644

Browse files
feat: exposed forced-ocr-option (#157)
* exposed forced-ocr-option
  Signed-off-by: Peter Staar <[email protected]>
* refactored the code
  Signed-off-by: Peter Staar <[email protected]>
* moved warning to info
  Signed-off-by: Peter Staar <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
1 parent 208cd14 commit ac21644

File tree

1 file changed

+16
-2
lines changed

1 file changed

+16
-2
lines changed

docling_eval/cli/main.py

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -327,6 +327,7 @@ def get_prediction_provider(
327327
docling_layout_model_spec: Optional[LayoutModelConfig] = None,
328328
docling_layout_create_orphan_clusters: Optional[bool] = None,
329329
docling_layout_keep_empty_clusters: Optional[bool] = None,
330+
docling_force_full_page_ocr: Optional[bool] = None,
330331
):
331332
pipeline_options: PaginatedPipelineOptions
332333
"""Get the appropriate prediction provider with default settings."""
@@ -339,6 +340,7 @@ def get_prediction_provider(
339340

340341
ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
341342
kind="easyocr",
343+
force_full_page_ocr=docling_force_full_page_ocr,
342344
)
343345
# Use all CPU cores
344346
accelerator_options = AcceleratorOptions(
@@ -386,6 +388,7 @@ def get_prediction_provider(
386388

387389
ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
388390
kind="ocrmac",
391+
force_full_page_ocr=docling_force_full_page_ocr,
389392
)
390393

391394
pipeline_options = PdfPipelineOptions(
@@ -415,6 +418,7 @@ def get_prediction_provider(
415418

416419
ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
417420
kind="easyocr",
421+
force_full_page_ocr=docling_force_full_page_ocr,
418422
)
419423

420424
pdf_pipeline_options = PdfPipelineOptions(
@@ -468,7 +472,7 @@ def get_prediction_provider(
468472
import mlx_vlm # type: ignore
469473

470474
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
471-
475+
_log.info("running SmolDocling on MLX!")
472476
except ImportError:
473477
_log.warning(
474478
"To run SmolDocling faster, please install mlx-vlm:\n"
@@ -503,7 +507,7 @@ def get_prediction_provider(
503507
import mlx_vlm # type: ignore
504508

505509
pipeline_options.vlm_options = GRANITEDOCLING_MLX
506-
510+
_log.info("running GraniteDocling on MLX!")
507511
except ImportError:
508512
_log.warning(
509513
"To run SmolDocling faster, please install mlx-vlm:\n"
@@ -1175,6 +1179,10 @@ def create_eval(
11751179
do_table_structure: Annotated[
11761180
bool, typer.Option(help="Include table structure predictions (only Docling)")
11771181
] = True,
1182+
docling_force_full_page_ocr: Annotated[
1183+
bool,
1184+
typer.Option(help="Force OCR on entire page (only Docling OCR providers)"),
1185+
] = False,
11781186
):
11791187
"""Create evaluation dataset from existing ground truth."""
11801188
gt_dir = gt_dir or output_dir / "gt_dataset"
@@ -1218,6 +1226,7 @@ def create_eval(
12181226
docling_layout_model_spec=docling_layout_model_spec_obj,
12191227
docling_layout_create_orphan_clusters=docling_layout_create_orphan_clusters,
12201228
docling_layout_keep_empty_clusters=docling_layout_keep_empty_clusters,
1229+
docling_force_full_page_ocr=docling_force_full_page_ocr,
12211230
)
12221231

12231232
# Get the dataset name from the benchmark
@@ -1272,6 +1281,10 @@ def create(
12721281
do_table_structure: Annotated[
12731282
bool, typer.Option(help="Include table structure predictions (only Docling)")
12741283
] = True,
1284+
docling_force_full_page_ocr: Annotated[
1285+
bool,
1286+
typer.Option(help="Force OCR on entire page (only Docling OCR providers)"),
1287+
] = False,
12751288
):
12761289
"""Create both ground truth and evaluation datasets in one step."""
12771290
# First create ground truth
@@ -1301,6 +1314,7 @@ def create(
13011314
do_visualization=do_visualization,
13021315
image_scale_factor=image_scale_factor,
13031316
do_table_structure=do_table_structure,
1317+
docling_force_full_page_ocr=docling_force_full_page_ocr,
13041318
)
13051319
else:
13061320
_log.info(

0 commit comments

Comments
 (0)