11"""
2- CLI utility to batch-create datasets for annotation workflows from directores containing plain files.
2+ CLI utility to batch-create datasets for annotation workflows from directories containing plain files.
33
44Given a root input directory containing subdirectories (each with plain files: PDF, image, etc.),
55this tool creates, for each subdirectory:
66 - gt_dataset: ground truth dataset
77 - eval_dataset: weak annotation dataset using Docling predictions
88 - cvat_dataset_preannotated: CVAT-ready input structure for annotation
99
10- This is useful for preparing large-scale annotation tasks for CVAT or similar tools.
10+ If a subdirectory contains more than max_files_per_chunk files (default: 1000), it will be
11+ automatically split into multiple chunks with separate output directories named:
12+ - {subdir_name}_chunk_001, {subdir_name}_chunk_002, etc.
13+
14+ This is useful for preparing large-scale annotation tasks for CVAT or similar tools while
15+ avoiding "too many files open" errors.
1116
1217Usage:
13- uv run python docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py batch-prepare --input-directory <input_dir> --output-directory <output_dir> [--sliding-window <int>] [--use-predictions/--no-use-predictions]
18+ uv run python docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py batch-prepare --input-directory <input_dir> --output-directory <output_dir> [--sliding-window <int>] [--use-predictions/--no-use-predictions] [--max-files-per-chunk <int>]
1419
1520Arguments:
1621 input_directory: Root directory containing subdirectories with files to process
17- output_directory: Where to store the generated datasets (one subdir per input subdir)
22+ output_directory: Where to store the generated datasets (one subdir per input subdir, with chunk suffixes if needed )
1823 sliding_window: Number of pages per CVAT task (default: 1)
1924 use_predictions: Whether to create prediction dataset and use predictions in CVAT (default: True)
25+ max_files_per_chunk: Maximum number of files to process per chunk (default: 1000)
2026"""
2127
2228from pathlib import Path
@@ -35,16 +41,19 @@ def process_subdirectories(
     output_directory: Path,
     sliding_window: int = 1,
     use_predictions: bool = True,
+    max_files_per_chunk: int = 1000,
 ) -> None:
     """
     For each subdirectory in input_directory, create gt_dataset, eval_dataset, and cvat_dataset_preannotated
-    in the corresponding output_directory.
+    in the corresponding output_directory. If a subdirectory contains more than max_files_per_chunk files,
+    it will be automatically split into multiple chunks with separate output directories.

     Args:
         input_directory: Root directory with subdirectories to process
         output_directory: Where to store generated datasets
         sliding_window: Number of pages per CVAT task (default: 1)
         use_predictions: Whether to create prediction dataset and use predictions in CVAT
+        max_files_per_chunk: Maximum number of files to process per chunk (default: 1000)
     """
     input_directory = input_directory.expanduser().resolve()
     output_directory = output_directory.expanduser().resolve()
@@ -57,62 +66,111 @@ def process_subdirectories(

     for subdir in subdirs:
         subdir_name = subdir.name
-        odir = output_directory / subdir_name
-        odir.mkdir(parents=True, exist_ok=True)
+
+        # Count total files in subdirectory
+        total_files = 0
+        for ext in ["pdf", "tif", "tiff", "jpg", "jpeg", "png", "bmp", "gif", "json"]:
+            total_files += len(list(subdir.glob(f"*.{ext}")))
+            total_files += len(list(subdir.glob(f"*.{ext.upper()}")))
+
         typer.echo(f"\nProcessing: {subdir_name}")
+        typer.echo(f"  Total files found: {total_files}")
+
+        # Calculate number of chunks needed
+        num_chunks = (total_files + max_files_per_chunk - 1) // max_files_per_chunk
+        typer.echo(
+            f"  Will create {num_chunks} chunk(s) of max {max_files_per_chunk} files each"
+        )
+
+        # Process each chunk
+        for chunk_idx in range(num_chunks):
+            begin_index = chunk_idx * max_files_per_chunk
+            end_index = min((chunk_idx + 1) * max_files_per_chunk, total_files)
+            files_in_chunk = end_index - begin_index
+
+            # Create chunk-specific output directory
+            if num_chunks == 1:
+                # Single chunk, use original directory name
+                chunk_output_dir = output_directory / subdir_name
+            else:
+                # Multiple chunks, append chunk number
+                chunk_output_dir = (
+                    output_directory / f"{subdir_name}_chunk_{chunk_idx + 1:03d}"
+                )

-        gt_dir = odir / "gt_dataset"
-        eval_dir = odir / "eval_dataset"
-        cvat_dir = odir / "cvat_dataset_preannotated"
-
-        if not gt_dir.exists():
-            typer.echo(f"  Creating GT dataset...")
-            create_gt(
-                benchmark=BenchMarkNames.PLAIN_FILES,
-                dataset_source=subdir,
-                output_dir=odir,
-                do_visualization=False,
+            chunk_output_dir.mkdir(parents=True, exist_ok=True)
+
+            typer.echo(
+                f"  Processing chunk {chunk_idx + 1}/{num_chunks}: files {begin_index} to {end_index - 1} ({files_in_chunk} files)"
             )
-        else:
-            typer.echo(f"  GT dataset already exists, skipping.")
+            typer.echo(f"  Output directory: {chunk_output_dir}")

-        if use_predictions:
-            if not eval_dir.exists():
-                typer.echo(f"  Creating prediction dataset (Docling)...")
-                create_eval(
+            gt_dir = chunk_output_dir / "gt_dataset"
+            eval_dir = chunk_output_dir / "eval_dataset"
+            cvat_dir = chunk_output_dir / "cvat_dataset_preannotated"
+
+            if not gt_dir.exists():
+                typer.echo(f"  Creating GT dataset...")
+                create_gt(
                     benchmark=BenchMarkNames.PLAIN_FILES,
-                    output_dir=odir,
-                    prediction_provider=PredictionProviderType.DOCLING,
-                    do_visualization=True,
-                    image_scale_factor=2.0,
-                    do_table_structure=False,
+                    dataset_source=subdir,
+                    output_dir=chunk_output_dir,
+                    do_visualization=False,
+                    begin_index=begin_index,
+                    end_index=end_index,
                 )
             else:
-                typer.echo(f"  Prediction dataset already exists, skipping.")
-        else:
-            typer.echo(
-                f"  Skipping prediction dataset creation (use_predictions=False)."
-            )
+                typer.echo(f"  GT dataset already exists, skipping.")
+
+            if use_predictions:
+                if not eval_dir.exists():
+                    typer.echo(f"  Creating prediction dataset (Docling)...")
+                    create_eval(
+                        benchmark=BenchMarkNames.PLAIN_FILES,
+                        output_dir=chunk_output_dir,
+                        prediction_provider=PredictionProviderType.DOCLING,
+                        do_visualization=True,
+                        image_scale_factor=2.0,
+                        do_table_structure=False,
+                        begin_index=begin_index,
+                        end_index=end_index,
+                    )
+                else:
+                    typer.echo(f"  Prediction dataset already exists, skipping.")
+            else:
+                typer.echo(
+                    f"  Skipping prediction dataset creation (use_predictions=False)."
+                )

-        if not cvat_dir.exists():
-            typer.echo(f"  Creating CVAT pre-annotated dataset...")
-            # Use gt_dir when no predictions, eval_dir when using predictions
-            source_dir = (eval_dir / "test") if use_predictions else (gt_dir / "test")
-            create_cvat(
-                gt_dir=source_dir,
-                output_dir=cvat_dir,
-                bucket_size=100,
-                use_predictions=use_predictions,
-                sliding_window=sliding_window,
+            if not cvat_dir.exists():
+                typer.echo(f"  Creating CVAT pre-annotated dataset...")
+                # Use gt_dir when no predictions, eval_dir when using predictions
+                source_dir = (
+                    (eval_dir / "test") if use_predictions else (gt_dir / "test")
+                )
+                create_cvat(
+                    gt_dir=source_dir,
+                    output_dir=cvat_dir,
+                    bucket_size=100,
+                    use_predictions=use_predictions,
+                    sliding_window=sliding_window,
+                )
+            else:
+                typer.echo(f"  CVAT dataset already exists, skipping.")
+
+            assert (
+                gt_dir.exists()
+            ), f"gt_dataset not created for {subdir_name} chunk {chunk_idx + 1}"
+            assert (
+                cvat_dir.exists()
+            ), f"cvat_dataset_preannotated not created for {subdir_name} chunk {chunk_idx + 1}"
+            typer.echo(
+                f"  Successfully created all datasets for chunk {chunk_idx + 1}"
             )
-        else:
-            typer.echo(f"  CVAT dataset already exists, skipping.")

-        assert gt_dir.exists(), f"gt_dataset not created for {subdir_name}"
-        assert (
-            cvat_dir.exists()
-        ), f"cvat_dataset_preannotated not created for {subdir_name}"
-        typer.echo(f"  Successfully created all datasets for {subdir_name}")
+        typer.echo(
+            f"  Successfully processed all {num_chunks} chunk(s) for {subdir_name}"
+        )


 @app.command()
@@ -129,12 +187,19 @@ def batch_prepare(
     use_predictions: bool = typer.Option(
         True, help="Whether to create prediction dataset and use predictions in CVAT"
     ),
+    max_files_per_chunk: int = typer.Option(
+        1000, help="Maximum number of files to process per chunk (default: 1000)"
+    ),
 ) -> None:
     """
     Batch-create Docling evaluation datasets for all subdirectories in input_directory.
     """
     process_subdirectories(
-        input_directory, output_directory, sliding_window, use_predictions
+        input_directory,
+        output_directory,
+        sliding_window,
+        use_predictions,
+        max_files_per_chunk,
     )
     typer.echo("\nAll benchmarks created successfully!")
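
For readers skimming the diff, here is a minimal, self-contained sketch of the chunking arithmetic this change introduces. The subdirectory name and file count below are hypothetical example values; only the ceiling division and the zero-padded "_chunk_NNN" naming mirror the code above:

# Standalone illustration of the chunk splitting above (example values only).
max_files_per_chunk = 1000
subdir_name = "invoices"   # hypothetical input subdirectory
total_files = 2500         # hypothetical file count

# Ceiling division without math.ceil: 2500 files -> 3 chunks.
num_chunks = (total_files + max_files_per_chunk - 1) // max_files_per_chunk

for chunk_idx in range(num_chunks):
    begin_index = chunk_idx * max_files_per_chunk
    end_index = min((chunk_idx + 1) * max_files_per_chunk, total_files)
    # A single chunk keeps the original name; multiple chunks get a zero-padded suffix.
    name = subdir_name if num_chunks == 1 else f"{subdir_name}_chunk_{chunk_idx + 1:03d}"
    print(f"{name}: files {begin_index}..{end_index - 1} ({end_index - begin_index} files)")

# Prints:
#   invoices_chunk_001: files 0..999 (1000 files)
#   invoices_chunk_002: files 1000..1999 (1000 files)
#   invoices_chunk_003: files 2000..2499 (500 files)

Each chunk then gets its own gt_dataset, eval_dataset, and cvat_dataset_preannotated under its output directory, keeping the number of files handled at once below the per-chunk limit.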