Skip to content

Commit 208cd14

Browse files
authored
feat: Implementation of table structure conversion from CVAT to DoclingDocument
1 parent cfed3d1 commit 208cd14

File tree

19 files changed

+4096
-1796
lines changed

19 files changed

+4096
-1796
lines changed

docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py

Lines changed: 117 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,28 @@
11
"""
2-
CLI utility to batch-create datasets for annotation workflows from directores containing plain files.
2+
CLI utility to batch-create datasets for annotation workflows from directories containing plain files.
33
44
Given a root input directory containing subdirectories (each with plain files: PDF, image, etc.),
55
this tool creates, for each subdirectory:
66
- gt_dataset: ground truth dataset
77
- eval_dataset: weak annotation dataset using Docling predictions
88
- cvat_dataset_preannotated: CVAT-ready input structure for annotation
99
10-
This is useful for preparing large-scale annotation tasks for CVAT or similar tools.
10+
If a subdirectory contains more than max_files_per_chunk files (default: 1000), it will be
11+
automatically split into multiple chunks with separate output directories named:
12+
- {subdir_name}_chunk_001, {subdir_name}_chunk_002, etc.
13+
14+
This is useful for preparing large-scale annotation tasks for CVAT or similar tools while
15+
avoiding "Too many open files" errors.
1116
1217
Usage:
13-
uv run python docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py batch-prepare --input-directory <input_dir> --output-directory <output_dir> [--sliding-window <int>] [--use-predictions/--no-use-predictions]
18+
uv run python docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py batch-prepare --input-directory <input_dir> --output-directory <output_dir> [--sliding-window <int>] [--use-predictions/--no-use-predictions] [--max-files-per-chunk <int>]
1419
1520
Arguments:
1621
input_directory: Root directory containing subdirectories with files to process
17-
output_directory: Where to store the generated datasets (one subdir per input subdir)
22+
output_directory: Where to store the generated datasets (one subdir per input subdir, with chunk suffixes if needed)
1823
sliding_window: Number of pages per CVAT task (default: 1)
1924
use_predictions: Whether to create prediction dataset and use predictions in CVAT (default: True)
25+
max_files_per_chunk: Maximum number of files to process per chunk (default: 1000)
2026
"""
2127

2228
from pathlib import Path
@@ -35,16 +41,19 @@ def process_subdirectories(
3541
output_directory: Path,
3642
sliding_window: int = 1,
3743
use_predictions: bool = True,
44+
max_files_per_chunk: int = 1000,
3845
) -> None:
3946
"""
4047
For each subdirectory in input_directory, create gt_dataset, eval_dataset, and cvat_dataset_preannotated
41-
in the corresponding output_directory.
48+
in the corresponding output_directory. If a subdirectory contains more than max_files_per_chunk files,
49+
it will be automatically split into multiple chunks with separate output directories.
4250
4351
Args:
4452
input_directory: Root directory with subdirectories to process
4553
output_directory: Where to store generated datasets
4654
sliding_window: Number of pages per CVAT task (default: 1)
4755
use_predictions: Whether to create prediction dataset and use predictions in CVAT
56+
max_files_per_chunk: Maximum number of files to process per chunk (default: 1000)
4857
"""
4958
input_directory = input_directory.expanduser().resolve()
5059
output_directory = output_directory.expanduser().resolve()
@@ -57,62 +66,111 @@ def process_subdirectories(
5766

5867
for subdir in subdirs:
5968
subdir_name = subdir.name
60-
odir = output_directory / subdir_name
61-
odir.mkdir(parents=True, exist_ok=True)
69+
70+
# Count total files in subdirectory
71+
total_files = 0
72+
for ext in ["pdf", "tif", "tiff", "jpg", "jpeg", "png", "bmp", "gif", "json"]:
73+
total_files += len(list(subdir.glob(f"*.{ext}")))
74+
total_files += len(list(subdir.glob(f"*.{ext.upper()}")))
75+
6276
typer.echo(f"\nProcessing: {subdir_name}")
77+
typer.echo(f" Total files found: {total_files}")
78+
79+
# Calculate number of chunks needed
80+
num_chunks = (total_files + max_files_per_chunk - 1) // max_files_per_chunk
81+
typer.echo(
82+
f" Will create {num_chunks} chunk(s) of max {max_files_per_chunk} files each"
83+
)
84+
85+
# Process each chunk
86+
for chunk_idx in range(num_chunks):
87+
begin_index = chunk_idx * max_files_per_chunk
88+
end_index = min((chunk_idx + 1) * max_files_per_chunk, total_files)
89+
files_in_chunk = end_index - begin_index
90+
91+
# Create chunk-specific output directory
92+
if num_chunks == 1:
93+
# Single chunk, use original directory name
94+
chunk_output_dir = output_directory / subdir_name
95+
else:
96+
# Multiple chunks, append chunk number
97+
chunk_output_dir = (
98+
output_directory / f"{subdir_name}_chunk_{chunk_idx + 1:03d}"
99+
)
63100

64-
gt_dir = odir / "gt_dataset"
65-
eval_dir = odir / "eval_dataset"
66-
cvat_dir = odir / "cvat_dataset_preannotated"
67-
68-
if not gt_dir.exists():
69-
typer.echo(f" Creating GT dataset...")
70-
create_gt(
71-
benchmark=BenchMarkNames.PLAIN_FILES,
72-
dataset_source=subdir,
73-
output_dir=odir,
74-
do_visualization=False,
101+
chunk_output_dir.mkdir(parents=True, exist_ok=True)
102+
103+
typer.echo(
104+
f" Processing chunk {chunk_idx + 1}/{num_chunks}: files {begin_index} to {end_index-1} ({files_in_chunk} files)"
75105
)
76-
else:
77-
typer.echo(f" GT dataset already exists, skipping.")
106+
typer.echo(f" Output directory: {chunk_output_dir}")
78107

79-
if use_predictions:
80-
if not eval_dir.exists():
81-
typer.echo(f" Creating prediction dataset (Docling)...")
82-
create_eval(
108+
gt_dir = chunk_output_dir / "gt_dataset"
109+
eval_dir = chunk_output_dir / "eval_dataset"
110+
cvat_dir = chunk_output_dir / "cvat_dataset_preannotated"
111+
112+
if not gt_dir.exists():
113+
typer.echo(f" Creating GT dataset...")
114+
create_gt(
83115
benchmark=BenchMarkNames.PLAIN_FILES,
84-
output_dir=odir,
85-
prediction_provider=PredictionProviderType.DOCLING,
86-
do_visualization=True,
87-
image_scale_factor=2.0,
88-
do_table_structure=False,
116+
dataset_source=subdir,
117+
output_dir=chunk_output_dir,
118+
do_visualization=False,
119+
begin_index=begin_index,
120+
end_index=end_index,
89121
)
90122
else:
91-
typer.echo(f" Prediction dataset already exists, skipping.")
92-
else:
93-
typer.echo(
94-
f" Skipping prediction dataset creation (use_predictions=False)."
95-
)
123+
typer.echo(f" GT dataset already exists, skipping.")
124+
125+
if use_predictions:
126+
if not eval_dir.exists():
127+
typer.echo(f" Creating prediction dataset (Docling)...")
128+
create_eval(
129+
benchmark=BenchMarkNames.PLAIN_FILES,
130+
output_dir=chunk_output_dir,
131+
prediction_provider=PredictionProviderType.DOCLING,
132+
do_visualization=True,
133+
image_scale_factor=2.0,
134+
do_table_structure=False,
135+
begin_index=begin_index,
136+
end_index=end_index,
137+
)
138+
else:
139+
typer.echo(f" Prediction dataset already exists, skipping.")
140+
else:
141+
typer.echo(
142+
f" Skipping prediction dataset creation (use_predictions=False)."
143+
)
96144

97-
if not cvat_dir.exists():
98-
typer.echo(f" Creating CVAT pre-annotated dataset...")
99-
# Use gt_dir when no predictions, eval_dir when using predictions
100-
source_dir = (eval_dir / "test") if use_predictions else (gt_dir / "test")
101-
create_cvat(
102-
gt_dir=source_dir,
103-
output_dir=cvat_dir,
104-
bucket_size=100,
105-
use_predictions=use_predictions,
106-
sliding_window=sliding_window,
145+
if not cvat_dir.exists():
146+
typer.echo(f" Creating CVAT pre-annotated dataset...")
147+
# Use gt_dir when no predictions, eval_dir when using predictions
148+
source_dir = (
149+
(eval_dir / "test") if use_predictions else (gt_dir / "test")
150+
)
151+
create_cvat(
152+
gt_dir=source_dir,
153+
output_dir=cvat_dir,
154+
bucket_size=100,
155+
use_predictions=use_predictions,
156+
sliding_window=sliding_window,
157+
)
158+
else:
159+
typer.echo(f" CVAT dataset already exists, skipping.")
160+
161+
assert (
162+
gt_dir.exists()
163+
), f"gt_dataset not created for {subdir_name} chunk {chunk_idx + 1}"
164+
assert (
165+
cvat_dir.exists()
166+
), f"cvat_dataset_preannotated not created for {subdir_name} chunk {chunk_idx + 1}"
167+
typer.echo(
168+
f" Successfully created all datasets for chunk {chunk_idx + 1}"
107169
)
108-
else:
109-
typer.echo(f" CVAT dataset already exists, skipping.")
110170

111-
assert gt_dir.exists(), f"gt_dataset not created for {subdir_name}"
112-
assert (
113-
cvat_dir.exists()
114-
), f"cvat_dataset_preannotated not created for {subdir_name}"
115-
typer.echo(f" Successfully created all datasets for {subdir_name}")
171+
typer.echo(
172+
f" Successfully processed all {num_chunks} chunk(s) for {subdir_name}"
173+
)
116174

117175

118176
@app.command()
@@ -129,12 +187,19 @@ def batch_prepare(
129187
use_predictions: bool = typer.Option(
130188
True, help="Whether to create prediction dataset and use predictions in CVAT"
131189
),
190+
max_files_per_chunk: int = typer.Option(
191+
1000, help="Maximum number of files to process per chunk (default: 1000)"
192+
),
132193
) -> None:
133194
"""
134195
Batch-create Docling evaluation datasets for all subdirectories in input_directory.
135196
"""
136197
process_subdirectories(
137-
input_directory, output_directory, sliding_window, use_predictions
198+
input_directory,
199+
output_directory,
200+
sliding_window,
201+
use_predictions,
202+
max_files_per_chunk,
138203
)
139204
typer.echo("\nAll benchmarks created successfully!")
140205

docling_eval/cli/cvat_to_docling_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def main():
187187
description="Convert CVAT annotations to DoclingDocuments in batch."
188188
)
189189
parser.add_argument(
190-
"input_path",
190+
"--input_path",
191191
type=str,
192192
help="Path to input directory or XML file",
193193
)

docling_eval/cli/main.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@
2727
PdfPipelineOptions,
2828
VlmPipelineOptions,
2929
)
30+
from docling.datamodel.vlm_model_specs import (
31+
GRANITEDOCLING_MLX,
32+
GRANITEDOCLING_TRANSFORMERS,
33+
)
3034
from docling.datamodel.vlm_model_specs import (
3135
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
3236
)
@@ -475,17 +479,49 @@ def get_prediction_provider(
475479
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
476480
)
477481

478-
format_options: Dict[InputFormat, FormatOption] = {
479-
InputFormat.PDF: pdf_format_option,
480-
InputFormat.IMAGE: pdf_format_option,
481-
}
482-
483482
return DoclingPredictionProvider(
484-
format_options=format_options,
483+
format_options={
484+
InputFormat.PDF: pdf_format_option,
485+
InputFormat.IMAGE: pdf_format_option,
486+
},
485487
do_visualization=do_visualization,
486488
ignore_missing_predictions=True,
487489
)
490+
elif provider_type == PredictionProviderType.GRANITEDOCLING:
491+
pipeline_options = VlmPipelineOptions()
492+
493+
pipeline_options.images_scale = image_scale_factor or 2.0
494+
pipeline_options.generate_page_images = True
495+
pipeline_options.generate_picture_images = True
496+
497+
pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
498+
if artifacts_path is not None:
499+
pipeline_options.artifacts_path = artifacts_path
500+
501+
if sys.platform == "darwin":
502+
try:
503+
import mlx_vlm # type: ignore
504+
505+
pipeline_options.vlm_options = GRANITEDOCLING_MLX
506+
507+
except ImportError:
508+
_log.warning(
509+
"To run SmolDocling faster, please install mlx-vlm:\n"
510+
"pip install mlx-vlm"
511+
)
512+
513+
pdf_format_option = PdfFormatOption(
514+
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
515+
)
488516

517+
return DoclingPredictionProvider(
518+
format_options={
519+
InputFormat.PDF: pdf_format_option,
520+
InputFormat.IMAGE: pdf_format_option,
521+
},
522+
do_visualization=do_visualization,
523+
ignore_missing_predictions=True,
524+
)
489525
elif provider_type == PredictionProviderType.TABLEFORMER:
490526
return TableFormerPredictionProvider(
491527
do_visualization=do_visualization,

0 commit comments

Comments
 (0)