Skip to content

Commit a850784

Browse files
feat: Improvements in user experience: Performance, error handling, logging (#189)
* feat: Extend evaluate_dpbench_on_external_predictions.sh to include visualisations of the evaluations Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* fix: Improve error checking in main.py:visualize() Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* fix: Improve logging Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* feat: Parallelize the computation of PixelLayoutEvaluator at the level of page Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* fix: Make DatasetPixelLayoutEvaluation a subclass of DatasetEvaluation Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* feat: Parallelize the MarkdownTextEvaluator Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* chore: Improve logging Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

---------

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
1 parent bcc5200 commit a850784

File tree

8 files changed

+348
-240
lines changed

8 files changed

+348
-240
lines changed

docling_eval/cli/main.py

Lines changed: 24 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -641,6 +641,7 @@ def evaluate(
641641
split: str = "test",
642642
cvat_overview_path: Optional[Path] = None,
643643
external_predictions_path: Optional[Path] = None,
644+
concurrency: int = 4,
644645
) -> Optional[DatasetEvaluationType]:
645646
"""Evaluate predictions against ground truth."""
646647
if not os.path.exists(idir):
@@ -673,17 +674,16 @@ def evaluate(
673674
# label_filtering_strategy=LabelFilteringStrategy.INTERSECTION,
674675
page_mapping_path=cvat_overview_path,
675676
)
676-
evaluation = layout_evaluator( # type: ignore
677+
layout_evaluation = layout_evaluator( # type: ignore
677678
idir,
678679
split=split,
679680
external_predictions_path=external_predictions_path,
680681
)
681-
682682
with open(save_fn, "w") as fd:
683-
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
683+
json.dump(layout_evaluation.model_dump(), fd, indent=2, sort_keys=True)
684684

685685
# Evaluate with the pixel-wise layout evaluation
686-
pixel_layout_evaluator = PixelLayoutEvaluator()
686+
pixel_layout_evaluator = PixelLayoutEvaluator(concurrency=concurrency)
687687
pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator(
688688
idir,
689689
split=split,
@@ -696,6 +696,9 @@ def evaluate(
696696
pixel_save_root,
697697
)
698698

699+
# TODO: Redesign evaluate() to return multiple evaluation objects
700+
evaluation = pixel_ds_evaluation # type: ignore
701+
699702
elif modality == EvaluationModality.TABLE_STRUCTURE:
700703
table_evaluator = TableEvaluator()
701704
evaluation = table_evaluator( # type: ignore
@@ -764,7 +767,7 @@ def evaluate(
764767
)
765768

766769
elif modality == EvaluationModality.MARKDOWN_TEXT:
767-
md_evaluator = MarkdownTextEvaluator()
770+
md_evaluator = MarkdownTextEvaluator(concurrency=concurrency)
768771
evaluation = md_evaluator( # type: ignore
769772
idir,
770773
split=split,
@@ -823,8 +826,8 @@ def evaluate(
823826
def visualize(
824827
modality: EvaluationModality,
825828
benchmark: BenchMarkNames,
826-
idir: Path,
827829
odir: Path,
830+
idir: Path | None = None,
828831
split: str = "test",
829832
):
830833
"""
@@ -839,10 +842,6 @@ def visualize(
839842
begin_index: Begin index
840843
end_index: End index
841844
"""
842-
if not os.path.exists(idir):
843-
_log.error(f"Input directory not found: {idir}")
844-
return
845-
846845
os.makedirs(odir, exist_ok=True)
847846
metrics_filename = odir / f"evaluation_{benchmark.value}_{modality.value}.json"
848847

@@ -989,6 +988,11 @@ def visualize(
989988

990989
elif modality == EvaluationModality.READING_ORDER:
991990
try:
991+
# idir is required here
992+
if idir is None or not idir.is_dir():
993+
_log.error(f"Input directory not found: {idir}")
994+
return
995+
992996
with open(metrics_filename, "r") as fd:
993997
ro_evaluation = DatasetReadingOrderEvaluation.model_validate_json(
994998
fd.read()
@@ -1080,6 +1084,11 @@ def visualize(
10801084

10811085
elif modality == EvaluationModality.OCR:
10821086
try:
1087+
# idir is required here
1088+
if idir is None or not idir.is_dir():
1089+
_log.error(f"Input directory not found: {idir}")
1090+
return
1091+
10831092
with open(metrics_filename, "r") as fd:
10841093
ocr_evaluation = OcrDatasetEvaluationResult.model_validate_json(
10851094
fd.read()
@@ -1511,6 +1520,9 @@ def evaluate_cmd(
15111520
help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]",
15121521
),
15131522
] = None,
1523+
concurrency: Annotated[
1524+
int, typer.Option(help="Concurrency for the computation of each metric")
1525+
] = 4,
15141526
):
15151527
"""Evaluate predictions against ground truth."""
15161528
input_dir, output_dir = derive_input_output_dirs(
@@ -1531,6 +1543,7 @@ def evaluate_cmd(
15311543
odir=eval_output_dir,
15321544
split=split,
15331545
external_predictions_path=external_predictions_path,
1546+
concurrency=concurrency,
15341547
)
15351548

15361549

@@ -1573,8 +1586,8 @@ def visualize_cmd(
15731586
visualize(
15741587
modality=modality,
15751588
benchmark=benchmark,
1576-
idir=input_dir,
15771589
odir=eval_output_dir,
1590+
idir=input_dir,
15781591
split=split,
15791592
)
15801593

docling_eval/evaluators/base_evaluator.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -75,12 +75,14 @@ def __init__(
7575
supported_prediction_formats: List[PredictionFormats] = [
7676
PredictionFormats.DOCLING_DOCUMENT
7777
],
78+
concurrency: int = 4,
7879
):
7980
r"""
8081
Parameters
8182
----------
8283
intermediate_evaluations_path: When True the evalution per example will be saved in a file
8384
"""
85+
self._concurrency = concurrency
8486
self._intermediate_evaluations_path = intermediate_evaluations_path
8587

8688
# Validate the prediction_sources

docling_eval/evaluators/layout_evaluator.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -489,7 +489,7 @@ def __call__(
489489
weighted_map_90_values.append(average_iou_90)
490490
weighted_map_95_values.append(average_iou_95)
491491

492-
_log.info(
492+
_log.debug(
493493
"doc: %s\tprecision: %.2f, recall: %.2f, f1: %.2f, map_50: %.2f, "
494494
"precision_no_pics: %.2f, recall_no_pics: %.2f, f1_no_pics: %.2f",
495495
doc_id_page,
@@ -528,7 +528,6 @@ def __call__(
528528
segmentation_precision_no_pictures=precision_no_pics,
529529
segmentation_recall_no_pictures=recall_no_pics,
530530
segmentation_f1_no_pictures=f1_no_pics,
531-
# New per-sample element count metrics
532531
true_element_count=true_element_count,
533532
pred_element_count=pred_element_count,
534533
true_table_count=true_table_count,
@@ -836,9 +835,7 @@ def _find_intersecting_labels(
836835
true_labels: Dict[str, int] = {}
837836
pred_labels: Dict[str, int] = {}
838837

839-
for i, data in tqdm(
840-
enumerate(ds), desc="Layout evaluations", ncols=120, total=len(ds)
841-
):
838+
for i, data in enumerate(ds):
842839
data_record = DatasetRecordWithPrediction.model_validate(data)
843840
true_doc = data_record.ground_truth_doc
844841
pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)

0 commit comments

Comments (0)