Skip to content

Commit 54d013b

Browse files
feat: add area level f1 (#86)
* added the area-level precision, recall and f1
  Signed-off-by: Peter Staar <[email protected]>
* WIP: adding timing modality
  Signed-off-by: Peter Staar <[email protected]>
* updated the code with timings
  Signed-off-by: Peter Staar <[email protected]>
* added the timings modality
  Signed-off-by: Peter Staar <[email protected]>
* reformatted the code
  Signed-off-by: Peter Staar <[email protected]>
* fixed the test
  Signed-off-by: Peter Staar <[email protected]>
* ran the test_run_dpbench_tables with success
  Signed-off-by: Peter Staar <[email protected]>
* commented out test_run_dpbench_tables
  Signed-off-by: Peter Staar <[email protected]>
* reformatted code
  Signed-off-by: Peter Staar <[email protected]>
* found potential bug in base_prediction_provider
  Signed-off-by: Peter Staar <[email protected]>
* found potential bug in base_prediction_provider (2)
  Signed-off-by: Peter Staar <[email protected]>
* fixed the timings in base-predictor
  Signed-off-by: Peter Staar <[email protected]>
* removed prints and added logging-level for matplotlib
  Signed-off-by: Peter Staar <[email protected]>
* found bug in stats
  Signed-off-by: Peter Staar <[email protected]>
* updated the logging
  Signed-off-by: Peter Staar <[email protected]>
---------
Signed-off-by: Peter Staar <[email protected]>
1 parent 518f684 commit 54d013b

File tree

15 files changed

+655
-55
lines changed

15 files changed

+655
-55
lines changed

docling_eval/cli/main.py

Lines changed: 83 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
)
3232
from docling_eval.dataset_builders.doclaynet_v1_builder import DocLayNetV1DatasetBuilder
3333
from docling_eval.dataset_builders.doclaynet_v2_builder import DocLayNetV2DatasetBuilder
34+
from docling_eval.dataset_builders.doclingdpbench_builder import (
35+
DoclingDPBenchDatasetBuilder,
36+
)
3437
from docling_eval.dataset_builders.docvqa_builder import DocVQADatasetBuilder
3538
from docling_eval.dataset_builders.dpbench_builder import DPBenchDatasetBuilder
3639
from docling_eval.dataset_builders.file_dataset_builder import FileDatasetBuilder
@@ -65,20 +68,27 @@
6568
DatasetTableEvaluation,
6669
TableEvaluator,
6770
)
71+
from docling_eval.evaluators.timings_evaluator import (
72+
DatasetTimingsEvaluation,
73+
TimingsEvaluator,
74+
)
6875
from docling_eval.prediction_providers.docling_provider import DoclingPredictionProvider
6976
from docling_eval.prediction_providers.file_provider import FilePredictionProvider
7077
from docling_eval.prediction_providers.tableformer_provider import (
7178
TableFormerPredictionProvider,
7279
)
7380

7481
# Configure logging
75-
logging.getLogger("docling").setLevel(logging.WARNING)
76-
logging.getLogger("PIL").setLevel(logging.WARNING)
77-
logging.getLogger("transformers").setLevel(logging.WARNING)
78-
logging.getLogger("datasets").setLevel(logging.WARNING)
79-
logging.getLogger("filelock").setLevel(logging.WARNING)
80-
logging.getLogger("urllib3").setLevel(logging.WARNING)
81-
logging.getLogger("docling_ibm_models").setLevel(logging.WARNING)
82+
# One shared level is applied to every third-party logger listed below.
# Set this to logging.DEBUG to see their debug output while troubleshooting.
logging_level = logging.WARNING
for _third_party_logger in (
    "docling",
    "PIL",
    "transformers",
    "datasets",
    "filelock",
    "urllib3",
    "docling_ibm_models",
    "matplotlib",
):
    logging.getLogger(_third_party_logger).setLevel(logging_level)
8292

8393
_log = logging.getLogger(__name__)
8494

@@ -156,6 +166,9 @@ def get_dataset_builder(
156166
if benchmark == BenchMarkNames.DPBENCH:
157167
return DPBenchDatasetBuilder(**common_params) # type: ignore
158168

169+
elif benchmark == BenchMarkNames.DOCLING_DPBENCH:
170+
return DoclingDPBenchDatasetBuilder(**common_params) # type: ignore
171+
159172
elif benchmark == BenchMarkNames.DOCLAYNETV1:
160173
return DocLayNetV1DatasetBuilder(**common_params) # type: ignore
161174

@@ -418,6 +431,16 @@ def evaluate(
418431
if modality == EvaluationModality.END2END:
419432
_log.error("END2END evaluation not supported. ")
420433

434+
elif modality == EvaluationModality.TIMINGS:
435+
timings_evaluator = TimingsEvaluator()
436+
evaluation = timings_evaluator( # type: ignore
437+
idir,
438+
split=split,
439+
)
440+
441+
with open(save_fn, "w") as fd:
442+
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
443+
421444
elif modality == EvaluationModality.LAYOUT:
422445
layout_evaluator = LayoutEvaluator()
423446
evaluation = layout_evaluator( # type: ignore
@@ -538,6 +561,31 @@ def visualize(
538561
if modality == EvaluationModality.END2END:
539562
_log.error("END2END visualization not supported")
540563

564+
elif modality == EvaluationModality.TIMINGS:
565+
try:
566+
with open(metrics_filename, "r") as fd:
567+
timings_evaluation = DatasetTimingsEvaluation.model_validate_json(
568+
fd.read()
569+
)
570+
571+
log_and_save_stats(
572+
odir,
573+
benchmark,
574+
modality,
575+
"time_to_solution_per_doc",
576+
timings_evaluation.timing_per_document_stats,
577+
)
578+
579+
log_and_save_stats(
580+
odir,
581+
benchmark,
582+
modality,
583+
"time_to_solution_per_page",
584+
timings_evaluation.timing_per_page_stats,
585+
)
586+
except Exception as e:
587+
_log.error(f"Error processing timings evaluation: {str(e)}")
588+
541589
elif modality == EvaluationModality.LAYOUT:
542590
try:
543591
with open(metrics_filename, "r") as fd:
@@ -554,6 +602,30 @@ def visualize(
554602
layout_evaluation.map_stats,
555603
)
556604

605+
log_and_save_stats(
606+
odir,
607+
benchmark,
608+
modality,
609+
"precision",
610+
layout_evaluation.segmentation_precision_stats,
611+
)
612+
613+
log_and_save_stats(
614+
odir,
615+
benchmark,
616+
modality,
617+
"recall",
618+
layout_evaluation.segmentation_recall_stats,
619+
)
620+
621+
log_and_save_stats(
622+
odir,
623+
benchmark,
624+
modality,
625+
"f1",
626+
layout_evaluation.segmentation_f1_stats,
627+
)
628+
557629
# Append to layout statistics, the AP per classes
558630
data, headers = layout_evaluation.to_table()
559631
content = "\n\n\nAP[0.5:0.05:0.95] per class (reported as %):\n\n"
@@ -724,6 +796,7 @@ def create_gt(
724796
end_index: Annotated[
725797
int, typer.Option(help="End index (exclusive), -1 for all")
726798
] = -1,
799+
chunk_size: Annotated[int, typer.Option(help="chunk size")] = 80,
727800
):
728801
"""Create ground truth dataset only."""
729802
gt_dir = output_dir / "gt_dataset"
@@ -741,7 +814,7 @@ def create_gt(
741814
# Retrieve and save the dataset
742815
if dataset_builder.must_retrieve:
743816
dataset_builder.retrieve_input_dataset()
744-
dataset_builder.save_to_disk(chunk_size=80)
817+
dataset_builder.save_to_disk(chunk_size=chunk_size)
745818

746819
_log.info(f"Ground truth dataset created at {gt_dir}")
747820
except ValueError as e:
@@ -841,6 +914,7 @@ def create(
841914
end_index: Annotated[
842915
int, typer.Option(help="End index (exclusive), -1 for all")
843916
] = -1,
917+
chunk_size: Annotated[int, typer.Option(help="chunk size")] = 80,
844918
prediction_provider: Annotated[
845919
Optional[PredictionProviderType],
846920
typer.Option(help="Type of prediction provider to use"),
@@ -861,6 +935,7 @@ def create(
861935
split=split,
862936
begin_index=begin_index,
863937
end_index=end_index,
938+
chunk_size=chunk_size,
864939
)
865940

866941
# Then create evaluation if provider specified

docling_eval/datamodels/dataset_record.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ class DatasetRecordWithPrediction(DatasetRecord):
173173
)
174174
original_prediction: Optional[str] = None
175175
prediction_format: PredictionFormats # some enum type
176+
prediction_timings: Optional[Dict] = Field(alias="prediction_timings", default=None)
176177

177178
predicted_page_images: List[PIL.Image.Image] = Field(
178179
alias="PredictionPageImages", default=[]
@@ -201,13 +202,15 @@ def features(cls):
201202
cls.get_field_alias("mime_type"): Value("string"),
202203
cls.get_field_alias("modalities"): Sequence(Value("string")),
203204
cls.get_field_alias("prediction_format"): Value("string"),
205+
cls.get_field_alias("prediction_timings"): Value("string"),
204206
}
205207

206208
def as_record_dict(self):
207209
record = super().as_record_dict()
208210
record.update(
209211
{
210212
self.get_field_alias("prediction_format"): self.prediction_format.value,
213+
self.get_field_alias("prediction_timings"): self.prediction_timings,
211214
}
212215
)
213216

docling_eval/datamodels/types.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,14 @@ class EvaluationModality(str, Enum):
4747
OCR = "ocr"
4848
KEY_VALUE = "key_value"
4949
QUESTION_ANSWERING = "question_answering"
50+
TIMINGS = "timings"
5051

5152

5253
class BenchMarkNames(str, Enum):
5354

5455
# End-to-End
5556
DPBENCH = "DPBench"
57+
DOCLING_DPBENCH = "DoclingDPBench"
5658
OMNIDOCBENCH = "OmniDocBench"
5759
WORDSCAPE = "WordScape"
5860

docling_eval/dataset_builders/doclingdpbench_builder.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import json
2+
import logging
3+
import os
4+
from io import BytesIO
5+
from pathlib import Path
6+
from typing import Dict, Iterable, Set
7+
8+
from datasets import load_dataset
9+
from docling_core.types import DoclingDocument
10+
from docling_core.types.io import DocumentStream
11+
from PIL import Image as PILImage
12+
13+
from docling_eval.datamodels.dataset_record import DatasetRecord
14+
from docling_eval.dataset_builders.dataset_builder import (
15+
BaseEvaluationDatasetBuilder,
16+
HFSource,
17+
)
18+
from docling_eval.utils.utils import get_binary, get_binhash
19+
20+
# Get logger
21+
_log = logging.getLogger(__name__)
22+
23+
24+
class DoclingDPBenchDatasetBuilder(BaseEvaluationDatasetBuilder):
    """
    DoclingDPBench dataset builder implementing the base dataset builder interface.

    Rows of the Hugging Face dataset ``ds4sd/docling-dpbench`` are converted
    one-to-one into ``DatasetRecord`` objects carrying the ground-truth
    document, its page images and pictures, and the original binary document.
    """

    def __init__(
        self,
        target: Path,
        split: str = "test",
        begin_index: int = 0,
        end_index: int = -1,
    ):
        """
        Initialize the DoclingDPBench dataset builder.

        Args:
            target: Path where processed dataset will be saved
            split: Dataset split to use
            begin_index: Start index for processing (inclusive)
            end_index: End index for processing (exclusive), -1 means process all
        """
        super().__init__(
            name="DoclingDPBench",
            dataset_source=HFSource(repo_id="ds4sd/docling-dpbench"),
            target=target,
            split=split,
            begin_index=begin_index,
            end_index=end_index,
        )

        # iterate() refuses to run until the source dataset has been retrieved.
        self.must_retrieve = True

    def iterate(self) -> Iterable[DatasetRecord]:
        """
        Iterate through the dataset and yield DatasetRecord objects.

        Yields:
            DatasetRecord objects, one per row of the loaded dataset.

        Raises:
            RuntimeError: If the source dataset has not been retrieved yet.
        """
        if not self.retrieved and self.must_retrieve:
            raise RuntimeError(
                "You must first retrieve the source dataset. Call retrieve_input_dataset()."
            )

        assert self.dataset_local_path is not None
        _log.info(f"dataset_local_path: {self.dataset_local_path}")

        # Login using e.g. `huggingface-cli login` to access this dataset
        # NOTE(review): the dataset is loaded by repo id rather than from
        # dataset_local_path, the "test" split is hard-coded, and begin_index /
        # end_index are not applied here -- confirm whether that is intended.
        ds = load_dataset("ds4sd/docling-dpbench")

        for idx, row in enumerate(ds["test"]):
            binary_doc = row["BinaryDocument"]
            doc_hash = str(get_binhash(binary_doc))
            true_doc = DoclingDocument.model_validate_json(row["GroundTruthDocument"])

            # Images are stored as dicts holding the encoded bytes under "bytes".
            page_images = [
                PILImage.open(BytesIO(image["bytes"]))
                for image in row["GroundTruthPageImages"]
            ]
            pictures = [
                PILImage.open(BytesIO(image["bytes"]))
                for image in row["GroundTruthPictures"]
            ]

            pdf_stream = DocumentStream(
                name=f"ds4sd/docling-dpbench/{idx}", stream=BytesIO(binary_doc)
            )

            # Create dataset record
            record = DatasetRecord(
                doc_id=str(row["document_id"]),
                doc_hash=doc_hash,
                ground_truth_doc=true_doc,
                ground_truth_pictures=pictures,
                ground_truth_page_images=page_images,
                original=pdf_stream,
                mime_type=row["mimetype"],
            )

            yield record

docling_eval/dataset_builders/file_dataset_builder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
108108
# Create the ground truth Document
109109
true_doc = DoclingDocument(name=f"{filename}")
110110
if mime_type == "application/pdf":
111-
_log.info(f"add_pages_to_true_doc: {filename}")
111+
_log.debug(f"add_pages_to_true_doc: {filename}")
112112
true_doc, _ = add_pages_to_true_doc(
113113
pdf_path=filename, true_doc=true_doc, image_scale=2.0
114114
)
@@ -127,7 +127,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
127127
image=image_ref,
128128
)
129129

130-
_log.info(f"add_pages_to_true_doc: {filename}")
130+
_log.debug(f"add_pages_to_true_doc: {filename}")
131131
true_doc.pages[1] = page_item
132132
else:
133133
raise ValueError(

0 commit comments

Comments
 (0)