Skip to content

Commit 7903b6a

Browse files
authored
feat: Add extra args for docling-provider and default annotations for CVAT (#98)
* Add README for Docling-DPBench Signed-off-by: Christoph Auer <[email protected]> * Add CVAT annotation features, fix DatasetRecord.features usage Signed-off-by: Christoph Auer <[email protected]> * dev: Updates for CVAT and docling provider args Signed-off-by: Christoph Auer <[email protected]> * documentation for SmolDocling, fix artifacts_path Signed-off-by: Christoph Auer <[email protected]> * Update lock Signed-off-by: Christoph Auer <[email protected]> --------- Signed-off-by: Christoph Auer <[email protected]>
1 parent b79dd19 commit 7903b6a

File tree

6 files changed

+527
-85
lines changed

6 files changed

+527
-85
lines changed

docling_eval/cli/main.py

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -227,10 +227,13 @@ def get_dataset_builder(
227227

228228
def get_prediction_provider(
229229
provider_type: PredictionProviderType,
230+
*,
230231
file_source_path: Optional[Path] = None,
231232
file_prediction_format: Optional[PredictionFormats] = None,
232233
do_visualization: bool = True,
234+
do_table_structure: bool = True,
233235
artifacts_path: Optional[Path] = None,
236+
image_scale_factor: Optional[float] = None,
234237
):
235238
pipeline_options: PaginatedPipelineOptions
236239
"""Get the appropriate prediction provider with default settings."""
@@ -248,10 +251,10 @@ def get_prediction_provider(
248251
pipeline_options = PdfPipelineOptions(
249252
do_ocr=True,
250253
ocr_options=ocr_options,
251-
do_table_structure=True,
254+
do_table_structure=do_table_structure,
252255
)
253256

254-
pipeline_options.images_scale = 2.0
257+
pipeline_options.images_scale = image_scale_factor or 2.0
255258
pipeline_options.generate_page_images = True
256259
pipeline_options.generate_picture_images = True
257260
pipeline_options.generate_parsed_pages = True
@@ -278,10 +281,10 @@ def get_prediction_provider(
278281
pipeline_options = PdfPipelineOptions(
279282
do_ocr=True,
280283
ocr_options=ocr_options,
281-
do_table_structure=True,
284+
do_table_structure=do_table_structure,
282285
)
283286

284-
pipeline_options.images_scale = 2.0
287+
pipeline_options.images_scale = image_scale_factor or 2.0
285288
pipeline_options.generate_page_images = True
286289
pipeline_options.generate_picture_images = True
287290

@@ -308,20 +311,20 @@ def get_prediction_provider(
308311
pdf_pipeline_options = PdfPipelineOptions(
309312
do_ocr=False,
310313
ocr_options=ocr_options, # we need to provide OCR options in order to not break the parquet serialization
311-
do_table_structure=True,
314+
do_table_structure=do_table_structure,
312315
)
313316

314-
pdf_pipeline_options.images_scale = 2.0
317+
pdf_pipeline_options.images_scale = image_scale_factor or 2.0
315318
pdf_pipeline_options.generate_page_images = True
316319
pdf_pipeline_options.generate_picture_images = True
317320

318321
ocr_pipeline_options = PdfPipelineOptions(
319322
do_ocr=True,
320323
ocr_options=ocr_options, # we need to provide OCR options in order to not break the parquet serialization
321-
do_table_structure=True,
324+
do_table_structure=do_table_structure,
322325
)
323326

324-
ocr_pipeline_options.images_scale = 2.0
327+
ocr_pipeline_options.images_scale = image_scale_factor or 2.0
325328
ocr_pipeline_options.generate_page_images = True
326329
ocr_pipeline_options.generate_picture_images = True
327330

@@ -343,20 +346,20 @@ def get_prediction_provider(
343346
elif provider_type == PredictionProviderType.SMOLDOCLING:
344347
pipeline_options = VlmPipelineOptions()
345348

346-
pipeline_options.images_scale = 2.0
349+
pipeline_options.images_scale = image_scale_factor or 2.0
347350
pipeline_options.generate_page_images = True
348351
pipeline_options.generate_picture_images = True
349352

350353
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
354+
if artifacts_path is not None:
355+
pipeline_options.artifacts_path = artifacts_path
356+
351357
if sys.platform == "darwin":
352358
try:
353359
import mlx_vlm # type: ignore
354360

355361
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
356362

357-
if artifacts_path is not None:
358-
pipeline_options.artifacts_path = artifacts_path
359-
360363
except ImportError:
361364
_log.warning(
362365
"To run SmolDocling faster, please install mlx-vlm:\n"
@@ -918,6 +921,13 @@ def create_eval(
918921
do_visualization: Annotated[
919922
bool, typer.Option(help="visualize the predictions")
920923
] = True,
924+
image_scale_factor: Annotated[
925+
float,
926+
typer.Option(help="Scale of page images used in prediction (only Docling)"),
927+
] = 2.0,
928+
do_table_structure: Annotated[
929+
bool, typer.Option(help="Include table structure predictions (only Docling)")
930+
] = True,
921931
):
922932
"""Create evaluation dataset from existing ground truth."""
923933
gt_dir = gt_dir or output_dir / "gt_dataset"
@@ -946,6 +956,8 @@ def create_eval(
946956
file_prediction_format=file_format,
947957
artifacts_path=artifacts_path,
948958
do_visualization=do_visualization,
959+
image_scale_factor=image_scale_factor,
960+
do_table_structure=do_table_structure,
949961
)
950962

951963
# Get the dataset name from the benchmark
@@ -993,6 +1005,13 @@ def create(
9931005
do_visualization: Annotated[
9941006
bool, typer.Option(help="visualize the predictions")
9951007
] = True,
1008+
image_scale_factor: Annotated[
1009+
float,
1010+
typer.Option(help="Scale of page images used in prediction (only Docling)"),
1011+
] = 2.0,
1012+
do_table_structure: Annotated[
1013+
bool, typer.Option(help="Include table structure predictions (only Docling)")
1014+
] = True,
9961015
):
9971016
"""Create both ground truth and evaluation datasets in one step."""
9981017
# First create ground truth
@@ -1020,6 +1039,8 @@ def create(
10201039
file_prediction_format=file_prediction_format,
10211040
file_source_path=file_source_path,
10221041
do_visualization=do_visualization,
1042+
image_scale_factor=image_scale_factor,
1043+
do_table_structure=do_table_structure,
10231044
)
10241045
else:
10251046
_log.info(

docling_eval/datamodels/cvat_types.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def get_color(label: "TableComponentLabel") -> Tuple[int, int, int]:
6262
TableComponentLabel.TABLE_COL: (0, 255, 0),
6363
TableComponentLabel.TABLE_GROUP: (0, 0, 255),
6464
}
65-
return color_map[label]
65+
return color_map.get(label, (0, 0, 0))
6666

6767

6868
class BenchMarkDirs(BaseModel):

docling_eval/dataset_builders/cvat_preannotation_builder.py

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from docling_core.types.doc import DocItemLabel
1010
from docling_core.types.doc.base import BoundingBox, CoordOrigin
1111
from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
12+
from docling_core.types.doc.labels import GraphCellLabel, TableCellLabel
1213
from docling_core.types.io import DocumentStream
1314
from PIL import Image
1415
from pydantic import ValidationError
@@ -211,6 +212,17 @@ def _create_project_properties(self) -> None:
211212

212213
# Add DocItemLabel properties
213214
for item in DocItemLabel:
215+
if item in [
216+
DocItemLabel.KEY_VALUE_REGION,
217+
DocItemLabel.PARAGRAPH,
218+
DocItemLabel.PAGE_HEADER,
219+
DocItemLabel.PAGE_FOOTER,
220+
DocItemLabel.TITLE,
221+
DocItemLabel.CHART,
222+
DocItemLabel.REFERENCE,
223+
]:
224+
continue
225+
214226
r, g, b = DocItemLabel.get_color(item)
215227

216228
results.append(
@@ -286,15 +298,42 @@ def _create_project_properties(self) -> None:
286298
)
287299

288300
# Add TableComponentLabel properties
289-
for table_item in TableComponentLabel:
290-
r, g, b = TableComponentLabel.get_color(table_item)
301+
for table_component_label in TableComponentLabel:
302+
r, g, b = TableComponentLabel.get_color(table_component_label)
291303

292304
results.append(
293305
{
294-
"name": table_item.value,
306+
"name": table_component_label.value,
295307
"color": rgb_to_hex(r, g, b),
296308
"type": "rectangle",
297-
"attributes": [],
309+
"attributes": default_attributes.copy(),
310+
}
311+
)
312+
313+
# Add TableCellLabel properties
314+
for table_cell_label in TableCellLabel:
315+
r, g, b = TableCellLabel.get_color(table_cell_label)
316+
317+
results.append(
318+
{
319+
"name": table_cell_label.value,
320+
"color": rgb_to_hex(r, g, b),
321+
"type": "rectangle",
322+
"attributes": default_attributes.copy(),
323+
}
324+
)
325+
326+
for graph_item in GraphCellLabel:
327+
if graph_item in [GraphCellLabel.UNSPECIFIED, GraphCellLabel.CHECKBOX]:
328+
continue
329+
r, g, b = GraphCellLabel.get_color(graph_item)
330+
331+
results.append(
332+
{
333+
"name": graph_item.value,
334+
"color": rgb_to_hex(r, g, b),
335+
"type": "rectangle",
336+
"attributes": default_attributes.copy(),
298337
}
299338
)
300339

docs/SmolDocling-custom-eval.md

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Evaluate SmolDocling with docling-eval
2+
3+
Below are instructions to evaluate custom weights for SmolDocling with docling-eval.
4+
5+
## Prepare SmolDocling weights for docling
6+
7+
Docling can run SmolDocling out of the box. By default, it downloads the model weights from Hugging Face and keeps them in the user's `~/.cache` directory.
8+
If you want to inject custom weights and config, you need to prepare a directory like this:
9+
10+
```shell
11+
models/ # the dir you will point docling-eval to (see below)
12+
├─ ds4sd--SmolDocling-256M-preview/ # the dir you place custom weights in. The name _must_ match the SmolDocling HF repo id, but using -- for /.
13+
```
14+
15+
## Run docling-eval
16+
17+
You can now run `docling-eval` as shown below. Example given for the Docling-DocLayNetV1 dataset:
18+
19+
```shell
20+
# Create GT dataset for DocLayNet v1 test set (only once)
21+
mkdir benchmarks
22+
23+
huggingface-cli login --token your_hf_token_123 # a token of type "read" is sufficient; create one here: https://huggingface.co/settings/tokens
24+
huggingface-cli download --repo-type dataset --local-dir ./benchmarks/DLN_GT/gt_dataset ds4sd/Docling-DocLayNetV1
25+
# alternatively, create the GT dataset yourself: docling_eval create-gt --benchmark DocLayNetV1 --output-dir ./benchmarks/DLN_GT/
26+
27+
## --- Do benchmarks ---
28+
export HF_HUB_OFFLINE=1 # no communication with huggingface from now!
29+
30+
# Make predictions for smoldocling
31+
docling_eval create-eval \
32+
--benchmark DocLayNetV1 \
33+
--gt-dir ./benchmarks/DLN_GT/gt_dataset/ \
34+
--output-dir ./benchmarks/DLN_smoldocling_experiment1/ \
35+
--prediction-provider SmolDocling \
36+
--artifacts-path /path/to/your/models/ # see above. Must include the ds4sd--SmolDocling-256M-preview dir.
37+
38+
# Layout metrics eval
39+
docling_eval evaluate \
40+
--modality layout \
41+
--benchmark DocLayNetV1 \
42+
--output-dir ./benchmarks/DLN_smoldocling_experiment1/
43+
44+
docling_eval visualize \
45+
--modality layout \
46+
--benchmark DocLayNetV1 \
47+
--output-dir ./benchmarks/DLN_smoldocling_experiment1/
48+
49+
# Text metrics eval
50+
docling_eval evaluate \
51+
--modality markdown_text \
52+
--benchmark DocLayNetV1 \
53+
--output-dir ./benchmarks/DLN_smoldocling_experiment1/
54+
55+
# Text metrics visualization
56+
docling_eval visualize \
57+
--modality markdown_text \
58+
--benchmark DocLayNetV1 \
59+
--output-dir ./benchmarks/DLN_smoldocling_experiment1/
60+
61+
```
62+
To repeat this with another set of weights, please replace the content of your `models/ds4sd--SmolDocling-256M-preview` directory, and adjust the
63+
experiment name used in your `--output-dir` arguments.
64+
65+
**Note**: macOS users should use weights converted with mlx-vlm.
66+
Install `mlx-vlm`, convert the weights, and place them in a `ds4sd--SmolDocling-256M-preview-mlx-bf16` subdirectory instead.
67+
68+

0 commit comments

Comments
 (0)