20 changes: 14 additions & 6 deletions docling_eval/cli/main.py
@@ -39,7 +39,7 @@
 from docling.datamodel.vlm_model_specs import (
     SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
 )
-from docling.document_converter import FormatOption, PdfFormatOption
+from docling.document_converter import FormatOption, ImageFormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.vlm_pipeline import VlmPipeline
 from PyPDF2 import PdfReader, PdfWriter
@@ -414,7 +414,7 @@ def get_prediction_provider(
         return DoclingPredictionProvider(
             format_options={
                 InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
-                InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
+                InputFormat.IMAGE: ImageFormatOption(pipeline_options=pipeline_options),
             },
             do_visualization=do_visualization,
             ignore_missing_predictions=True,
@@ -444,7 +444,7 @@ def get_prediction_provider(
         return DoclingPredictionProvider(
             format_options={
                 InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
-                InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
+                InputFormat.IMAGE: ImageFormatOption(pipeline_options=pipeline_options),
             },
             do_visualization=do_visualization,
             ignore_missing_predictions=True,
@@ -493,7 +493,7 @@ def get_prediction_provider(
         return DoclingPredictionProvider(
             format_options={
                 InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
-                InputFormat.IMAGE: PdfFormatOption(
+                InputFormat.IMAGE: ImageFormatOption(
                     pipeline_options=ocr_pipeline_options
                 ),
             },
@@ -528,10 +528,14 @@ def get_prediction_provider(
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
 
+        image_format_option = ImageFormatOption(
+            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
+        )
+
         return DoclingPredictionProvider(
             format_options={
                 InputFormat.PDF: pdf_format_option,
-                InputFormat.IMAGE: pdf_format_option,
+                InputFormat.IMAGE: image_format_option,
             },
             do_visualization=do_visualization,
             ignore_missing_predictions=True,
@@ -575,10 +579,14 @@ def get_prediction_provider(
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
 
+        image_format_option = ImageFormatOption(
+            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
+        )
+
         return DoclingPredictionProvider(
             format_options={
                 InputFormat.PDF: pdf_format_option,
-                InputFormat.IMAGE: pdf_format_option,
+                InputFormat.IMAGE: image_format_option,
             },
             do_visualization=do_visualization,
             ignore_missing_predictions=True,
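For context, a minimal sketch (not part of this PR) of why the InputFormat.IMAGE entry should map to ImageFormatOption rather than PdfFormatOption. It uses docling's DocumentConverter rather than the DoclingPredictionProvider shown above, and the pipeline option values and file name are illustrative assumptions.

# Hedged sketch: image inputs routed through ImageFormatOption, PDFs through PdfFormatOption.
# The pipeline options and the input file below are assumptions for illustration only.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption

pipeline_options = PdfPipelineOptions(do_ocr=True)  # assumed example options

converter = DocumentConverter(
    format_options={
        # PDFs keep the PDF-specific format option ...
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        # ... while standalone images get the image-specific one,
        # mirroring the InputFormat.IMAGE fix in the diff above.
        InputFormat.IMAGE: ImageFormatOption(pipeline_options=pipeline_options),
    }
)

result = converter.convert("sample_page.png")  # hypothetical input file
print(result.document.export_to_markdown())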
45 changes: 39 additions & 6 deletions docling_eval/dataset_builders/file_dataset_builder.py
@@ -205,11 +205,44 @@ def iterate(self) -> Iterable[DatasetRecord]:
                 page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
             )
 
-            # Get source as binary data
-            source_bytes = get_binary(filename)
-            source_stream = DocumentStream(
-                name=filename.name, stream=BytesIO(source_bytes)
-            )
+            # Prepare source binary: for JSON inputs prefer image streams when page images exist
+            source_bytes: bytes
+            source_stream: DocumentStream
+            effective_mime_type = mime_type
+
+            if mime_type == "application/json" and len(true_page_images) > 0:
+                images_rgb = [
+                    img.convert("RGB") if img.mode != "RGB" else img
+                    for img in true_page_images
+                ]
+
+                if len(images_rgb) == 1:
+                    buffer = BytesIO()
+                    images_rgb[0].save(buffer, format="PNG")
+                    source_bytes = buffer.getvalue()
+                    source_stream = DocumentStream(
+                        name=f"{filename.stem}.png", stream=BytesIO(source_bytes)
+                    )
+                    effective_mime_type = "image/png"
+                else:
+                    buffer = BytesIO()
+                    images_rgb[0].save(
+                        buffer,
+                        format="TIFF",
+                        save_all=True,
+                        append_images=images_rgb[1:],
+                        compression="tiff_lzw",
+                    )
+                    source_bytes = buffer.getvalue()
+                    source_stream = DocumentStream(
+                        name=f"{filename.stem}.tiff", stream=BytesIO(source_bytes)
+                    )
+                    effective_mime_type = "image/tiff"
+            else:
+                source_bytes = get_binary(filename)
+                source_stream = DocumentStream(
+                    name=filename.name, stream=BytesIO(source_bytes)
+                )
 
             # Create dataset record
             record = DatasetRecord(
@@ -219,7 +252,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
                 ground_truth_pictures=true_pictures,
                 ground_truth_page_images=true_page_images,
                 original=source_stream,
-                mime_type=mime_type,
+                mime_type=effective_mime_type,
             )
 
             yield record
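The added builder logic packages ground-truth page images in memory: a single page is encoded as PNG, multiple pages as a multi-page LZW-compressed TIFF. A standalone sketch of that idea, under the assumption that the inputs are PIL images; the helper name and signature are hypothetical, not part of this PR.

# Hedged sketch of the page-image packaging used above: one page -> PNG,
# several pages -> multi-page LZW-compressed TIFF, both kept in memory.
from io import BytesIO
from typing import List, Tuple

from PIL import Image


def pack_page_images(page_images: List[Image.Image], stem: str) -> Tuple[str, bytes]:
    """Return (file name, encoded bytes) for a list of ground-truth page images."""
    images_rgb = [img if img.mode == "RGB" else img.convert("RGB") for img in page_images]
    buffer = BytesIO()
    if len(images_rgb) == 1:
        images_rgb[0].save(buffer, format="PNG")
        return f"{stem}.png", buffer.getvalue()
    # Pillow writes a multi-page TIFF when save_all=True and append_images is given.
    images_rgb[0].save(
        buffer,
        format="TIFF",
        save_all=True,
        append_images=images_rgb[1:],
        compression="tiff_lzw",
    )
    return f"{stem}.tiff", buffer.getvalue()

A caller could then wrap the returned bytes in a DocumentStream(name=..., stream=BytesIO(data)) and set the matching image/png or image/tiff MIME type, as the builder does above.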