7373
# Configure logging.
# Raise the threshold of chatty third-party loggers to WARNING so that their
# records below that level are dropped; this module's own logger (`_log`) is
# left at its default level.
_QUIET_LOGGERS = (
    "docling",
    "PIL",
    "transformers",
    "datasets",
    "filelock",
    "urllib3",
    "docling_ibm_models",
)
for _logger_name in _QUIET_LOGGERS:
    logging.getLogger(_logger_name).setLevel(logging.WARNING)

_log = logging.getLogger(__name__)
7784
7885app = typer .Typer (
@@ -188,14 +195,17 @@ def get_dataset_builder(
188195 name = "CVAT" , dataset_source = dataset_source , target = target , split = split
189196 )
190197 elif benchmark == BenchMarkNames .PLAIN_FILES :
191- assert dataset_source is not None
198+ if dataset_source is None :
199+ raise ValueError ("dataset_source is required for PLAIN_FILES" )
200+
192201 return FileDatasetBuilder (
193202 name = dataset_source .name ,
194203 dataset_source = dataset_source ,
195204 target = target ,
196205 split = split ,
206+ begin_index = begin_index ,
207+ end_index = end_index ,
197208 )
198-
199209 else :
200210 raise ValueError (f"Unsupported benchmark: { benchmark } " )
201211
@@ -209,7 +219,11 @@ def get_prediction_provider(
209219):
210220 pipeline_options : PaginatedPipelineOptions
211221 """Get the appropriate prediction provider with default settings."""
212- if provider_type == PredictionProviderType .DOCLING :
222+ if (
223+ provider_type == PredictionProviderType .DOCLING
224+ or provider_type == PredictionProviderType .OCR_DOCLING
225+ or provider_type == PredictionProviderType .EasyOCR_DOCLING
226+ ):
213227 ocr_factory = get_ocr_factory ()
214228
215229 ocr_options : OcrOptions = ocr_factory .create_options ( # type: ignore
@@ -238,6 +252,78 @@ def get_prediction_provider(
238252 ignore_missing_predictions = True ,
239253 )
240254
255+ elif provider_type == PredictionProviderType .MacOCR_DOCLING :
256+ ocr_factory = get_ocr_factory ()
257+
258+ ocr_options : OcrOptions = ocr_factory .create_options ( # type: ignore
259+ kind = "ocrmac" ,
260+ )
261+
262+ pipeline_options = PdfPipelineOptions (
263+ do_ocr = True ,
264+ ocr_options = ocr_options ,
265+ do_table_structure = True ,
266+ )
267+
268+ pipeline_options .images_scale = 2.0
269+ pipeline_options .generate_page_images = True
270+ pipeline_options .generate_picture_images = True
271+
272+ if artifacts_path is not None :
273+ pipeline_options .artifacts_path = artifacts_path
274+
275+ return DoclingPredictionProvider (
276+ format_options = {
277+ InputFormat .PDF : PdfFormatOption (pipeline_options = pipeline_options ),
278+ InputFormat .IMAGE : PdfFormatOption (pipeline_options = pipeline_options ),
279+ },
280+ do_visualization = do_visualization ,
281+ ignore_missing_predictions = True ,
282+ )
283+
284+ elif provider_type == PredictionProviderType .PDF_DOCLING :
285+
286+ ocr_factory = get_ocr_factory ()
287+
288+ ocr_options : OcrOptions = ocr_factory .create_options ( # type: ignore
289+ kind = "easyocr" ,
290+ )
291+
292+ pdf_pipeline_options = PdfPipelineOptions (
293+ do_ocr = False ,
294+ ocr_options = ocr_options , # we need to provide OCR options in order to not break the parquet serialization
295+ do_table_structure = True ,
296+ )
297+
298+ pdf_pipeline_options .images_scale = 2.0
299+ pdf_pipeline_options .generate_page_images = True
300+ pdf_pipeline_options .generate_picture_images = True
301+
302+ ocr_pipeline_options = PdfPipelineOptions (
303+ do_ocr = True ,
304+ ocr_options = ocr_options , # we need to provide OCR options in order to not break the parquet serialization
305+ do_table_structure = True ,
306+ )
307+
308+ ocr_pipeline_options .images_scale = 2.0
309+ ocr_pipeline_options .generate_page_images = True
310+ ocr_pipeline_options .generate_picture_images = True
311+
312+ if artifacts_path is not None :
313+ pdf_pipeline_options .artifacts_path = artifacts_path
314+ ocr_pipeline_options .artifacts_path = artifacts_path
315+
316+ return DoclingPredictionProvider (
317+ format_options = {
318+ InputFormat .PDF : PdfFormatOption (pipeline_options = pdf_pipeline_options ),
319+ InputFormat .IMAGE : PdfFormatOption (
320+ pipeline_options = ocr_pipeline_options
321+ ),
322+ },
323+ do_visualization = do_visualization ,
324+ ignore_missing_predictions = True ,
325+ )
326+
241327 elif provider_type == PredictionProviderType .SMOLDOCLING :
242328 pipeline_options = VlmPipelineOptions ()
243329
@@ -614,9 +700,14 @@ def create_cvat(
614700 output_dir : Annotated [Path , typer .Option (help = "Output directory" )],
615701 gt_dir : Annotated [Path , typer .Option (help = "Dataset source path" )],
616702 bucket_size : Annotated [int , typer .Option (help = "Size of CVAT tasks" )] = 20 ,
703+ use_predictions : Annotated [bool , typer .Option (help = "use predictions" )] = False ,
617704):
705+ """Create dataset ready to upload to CVAT starting from (ground-truth) dataset."""
618706 builder = CvatPreannotationBuilder (
619- dataset_source = gt_dir , target = output_dir , bucket_size = bucket_size
707+ dataset_source = gt_dir ,
708+ target = output_dir ,
709+ bucket_size = bucket_size ,
710+ use_predictions = use_predictions ,
620711 )
621712 builder .prepare_for_annotation ()
622713
0 commit comments