Skip to content

Commit 1e2040a

Browse files
fix: propagate cvat parameters (#82)
* propagated the CVAT parameters in the cli and updated the documentation
  Signed-off-by: Peter Staar <[email protected]>
* fixed the formatting
  Signed-off-by: Peter Staar <[email protected]>
* fix the export in PDF_Docling
  Signed-off-by: Peter Staar <[email protected]>
* fixed the PDF_Docling to parquet
  Signed-off-by: Peter Staar <[email protected]>
* fixed the visualisations and leveraged the new docling-core visualization capability
  Signed-off-by: Peter Staar <[email protected]>
* cleaned up the visualisation code
  Signed-off-by: Peter Staar <[email protected]>
* cleaned up the code and reformatted
  Signed-off-by: Peter Staar <[email protected]>
---------
Signed-off-by: Peter Staar <[email protected]>
1 parent be62102 commit 1e2040a

File tree

14 files changed

+285
-425
lines changed

14 files changed

+285
-425
lines changed

docling_eval/cli/main.py

Lines changed: 95 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,13 @@
7373

7474
# Configure logging
7575
logging.getLogger("docling").setLevel(logging.WARNING)
76+
logging.getLogger("PIL").setLevel(logging.WARNING)
77+
logging.getLogger("transformers").setLevel(logging.WARNING)
78+
logging.getLogger("datasets").setLevel(logging.WARNING)
79+
logging.getLogger("filelock").setLevel(logging.WARNING)
80+
logging.getLogger("urllib3").setLevel(logging.WARNING)
81+
logging.getLogger("docling_ibm_models").setLevel(logging.WARNING)
82+
7683
_log = logging.getLogger(__name__)
7784

7885
app = typer.Typer(
@@ -188,14 +195,17 @@ def get_dataset_builder(
188195
name="CVAT", dataset_source=dataset_source, target=target, split=split
189196
)
190197
elif benchmark == BenchMarkNames.PLAIN_FILES:
191-
assert dataset_source is not None
198+
if dataset_source is None:
199+
raise ValueError("dataset_source is required for PLAIN_FILES")
200+
192201
return FileDatasetBuilder(
193202
name=dataset_source.name,
194203
dataset_source=dataset_source,
195204
target=target,
196205
split=split,
206+
begin_index=begin_index,
207+
end_index=end_index,
197208
)
198-
199209
else:
200210
raise ValueError(f"Unsupported benchmark: {benchmark}")
201211

@@ -209,7 +219,11 @@ def get_prediction_provider(
209219
):
210220
pipeline_options: PaginatedPipelineOptions
211221
"""Get the appropriate prediction provider with default settings."""
212-
if provider_type == PredictionProviderType.DOCLING:
222+
if (
223+
provider_type == PredictionProviderType.DOCLING
224+
or provider_type == PredictionProviderType.OCR_DOCLING
225+
or provider_type == PredictionProviderType.EasyOCR_DOCLING
226+
):
213227
ocr_factory = get_ocr_factory()
214228

215229
ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
@@ -238,6 +252,78 @@ def get_prediction_provider(
238252
ignore_missing_predictions=True,
239253
)
240254

255+
elif provider_type == PredictionProviderType.MacOCR_DOCLING:
256+
ocr_factory = get_ocr_factory()
257+
258+
ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
259+
kind="ocrmac",
260+
)
261+
262+
pipeline_options = PdfPipelineOptions(
263+
do_ocr=True,
264+
ocr_options=ocr_options,
265+
do_table_structure=True,
266+
)
267+
268+
pipeline_options.images_scale = 2.0
269+
pipeline_options.generate_page_images = True
270+
pipeline_options.generate_picture_images = True
271+
272+
if artifacts_path is not None:
273+
pipeline_options.artifacts_path = artifacts_path
274+
275+
return DoclingPredictionProvider(
276+
format_options={
277+
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
278+
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
279+
},
280+
do_visualization=do_visualization,
281+
ignore_missing_predictions=True,
282+
)
283+
284+
elif provider_type == PredictionProviderType.PDF_DOCLING:
285+
286+
ocr_factory = get_ocr_factory()
287+
288+
ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
289+
kind="easyocr",
290+
)
291+
292+
pdf_pipeline_options = PdfPipelineOptions(
293+
do_ocr=False,
294+
ocr_options=ocr_options, # we need to provide OCR options in order to not break the parquet serialization
295+
do_table_structure=True,
296+
)
297+
298+
pdf_pipeline_options.images_scale = 2.0
299+
pdf_pipeline_options.generate_page_images = True
300+
pdf_pipeline_options.generate_picture_images = True
301+
302+
ocr_pipeline_options = PdfPipelineOptions(
303+
do_ocr=True,
304+
ocr_options=ocr_options, # we need to provide OCR options in order to not break the parquet serialization
305+
do_table_structure=True,
306+
)
307+
308+
ocr_pipeline_options.images_scale = 2.0
309+
ocr_pipeline_options.generate_page_images = True
310+
ocr_pipeline_options.generate_picture_images = True
311+
312+
if artifacts_path is not None:
313+
pdf_pipeline_options.artifacts_path = artifacts_path
314+
ocr_pipeline_options.artifacts_path = artifacts_path
315+
316+
return DoclingPredictionProvider(
317+
format_options={
318+
InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
319+
InputFormat.IMAGE: PdfFormatOption(
320+
pipeline_options=ocr_pipeline_options
321+
),
322+
},
323+
do_visualization=do_visualization,
324+
ignore_missing_predictions=True,
325+
)
326+
241327
elif provider_type == PredictionProviderType.SMOLDOCLING:
242328
pipeline_options = VlmPipelineOptions()
243329

@@ -614,9 +700,14 @@ def create_cvat(
614700
output_dir: Annotated[Path, typer.Option(help="Output directory")],
615701
gt_dir: Annotated[Path, typer.Option(help="Dataset source path")],
616702
bucket_size: Annotated[int, typer.Option(help="Size of CVAT tasks")] = 20,
703+
use_predictions: Annotated[bool, typer.Option(help="use predictions")] = False,
617704
):
705+
"""Create dataset ready to upload to CVAT starting from (ground-truth) dataset."""
618706
builder = CvatPreannotationBuilder(
619-
dataset_source=gt_dir, target=output_dir, bucket_size=bucket_size
707+
dataset_source=gt_dir,
708+
target=output_dir,
709+
bucket_size=bucket_size,
710+
use_predictions=use_predictions,
620711
)
621712
builder.prepare_for_annotation()
622713

docling_eval/datamodels/types.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ class PredictionProviderType(str, Enum):
118118
"""Types of prediction providers available."""
119119

120120
DOCLING = "Docling"
121+
PDF_DOCLING = "PDF_Docling"
122+
OCR_DOCLING = "OCR_Docling"
123+
MacOCR_DOCLING = "MacOCR_Docling"
124+
EasyOCR_DOCLING = "EasyOCR_Docling"
125+
121126
TABLEFORMER = "TableFormer"
122127
FILE = "File"
123128
SMOLDOCLING = "SmolDocling"

docling_eval/dataset_builders/dataset_builder.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import ibm_boto3 # type: ignore
99
from docling.utils.utils import chunkify
10+
from docling_core.types.doc.document import ImageRefMode
1011
from huggingface_hub import snapshot_download
1112
from pydantic import BaseModel
1213

@@ -15,7 +16,6 @@
1516
TRUE_HTML_EXPORT_LABELS,
1617
)
1718
from docling_eval.utils.utils import save_shard_to_disk, write_datasets_info
18-
from docling_eval.visualisation.visualisations import save_inspection_html
1919

2020
# Get logger
2121
_log = logging.getLogger(__name__)
@@ -276,10 +276,11 @@ def save_to_disk(
276276
record_list.append(r.as_record_dict())
277277
if do_visualization:
278278
viz_path = self.target / "visualizations" / f"{r.doc_id}.html"
279-
save_inspection_html(
279+
r.ground_truth_doc.save_as_html(
280280
filename=viz_path,
281-
doc=r.ground_truth_doc,
282281
labels=TRUE_HTML_EXPORT_LABELS,
282+
image_mode=ImageRefMode.EMBEDDED,
283+
split_page_view=True,
283284
)
284285

285286
save_shard_to_disk(

docling_eval/dataset_builders/file_dataset_builder.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,14 +100,15 @@ def iterate(self) -> Iterable[DatasetRecord]:
100100

101101
for filename in tqdm(
102102
selected_filenames,
103-
desc="Processing files for DP-Bench",
103+
desc=f"Processing files for {self.name}",
104104
ncols=128,
105105
):
106106
mime_type, _ = mimetypes.guess_type(filename)
107107

108108
# Create the ground truth Document
109109
true_doc = DoclingDocument(name=f"{filename}")
110110
if mime_type == "application/pdf":
111+
_log.info(f"add_pages_to_true_doc: {filename}")
111112
true_doc, _ = add_pages_to_true_doc(
112113
pdf_path=filename, true_doc=true_doc, image_scale=2.0
113114
)
@@ -126,6 +127,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
126127
image=image_ref,
127128
)
128129

130+
_log.info(f"add_pages_to_true_doc: {filename}")
129131
true_doc.pages[1] = page_item
130132
else:
131133
raise ValueError(
@@ -139,18 +141,20 @@ def iterate(self) -> Iterable[DatasetRecord]:
139141
page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
140142
)
141143

142-
# Get PDF as binary data
143-
pdf_bytes = get_binary(filename)
144-
pdf_stream = DocumentStream(name=filename.name, stream=BytesIO(pdf_bytes))
144+
# Get source as binary data
145+
source_bytes = get_binary(filename)
146+
source_stream = DocumentStream(
147+
name=filename.name, stream=BytesIO(source_bytes)
148+
)
145149

146150
# Create dataset record
147151
record = DatasetRecord(
148152
doc_id=str(filename.name),
149-
doc_hash=get_binhash(pdf_bytes),
153+
doc_hash=get_binhash(source_bytes),
150154
ground_truth_doc=true_doc,
151155
ground_truth_pictures=true_pictures,
152156
ground_truth_page_images=true_page_images,
153-
original=pdf_stream,
157+
original=source_stream,
154158
mime_type=mime_type,
155159
)
156160

docling_eval/evaluators/readingorder_evaluator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ def _show_items(self, true_doc: DoclingDocument):
292292
)
293293
text = item.text if isinstance(item, TextItem) else None
294294
label = item.label # type: ignore
295-
print(f"True {i}: {level} - {label}: {bbox} - {text}")
295+
# print(f"True {i}: {level} - {label}: {bbox} - {text}")
296296

297297

298298
class ReadingOrderVisualizer:

docling_eval/prediction_providers/base_prediction_provider.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,6 @@ def visualize_results(
165165
/ f"{prediction_record.doc_id}.html",
166166
true_doc=gt_doc,
167167
pred_doc=pred_doc,
168-
page_image=prediction_record.ground_truth_page_images[0],
169168
true_labels=self.true_labels,
170169
pred_labels=self.pred_labels,
171170
draw_reading_order=True,

docling_eval/prediction_providers/docling_provider.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
106106
def info(self) -> Dict:
107107
"""Get information about the prediction provider."""
108108

109-
return {
109+
result = {
110110
"asset": PredictionProviderType.DOCLING,
111111
"version": docling_version(),
112112
"package_versions": {
@@ -128,10 +128,11 @@ def info(self) -> Dict:
128128
mode="json", exclude_defaults=True
129129
)
130130
if v.pipeline_options is not None
131-
else {}
131+
else None # Parquet might not like empty dicts!
132132
),
133133
}
134134
for k, v in self.doc_converter.format_to_options.items()
135135
if k in [InputFormat.PDF, InputFormat.IMAGE]
136136
},
137137
}
138+
return result

docling_eval/visualisation/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@
309309
display: flex;
310310
flex-direction: column;
311311
width: 25%; /* Adjust the width of each item */
312-
height: 100%; /* Adjust height to fill parent container */
312+
height: 50%; /* Adjust height to fill parent container */
313313
border: 1px solid #ccc; /* Optional: Add borders */
314314
box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.1); /* Optional: Add shadow */
315315
background-color: #fff; /* Optional: Add background */

0 commit comments

Comments
 (0)