Skip to content
35 changes: 24 additions & 11 deletions docling_eval/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,7 @@ def evaluate(
split: str = "test",
cvat_overview_path: Optional[Path] = None,
external_predictions_path: Optional[Path] = None,
concurrency: int = 4,
) -> Optional[DatasetEvaluationType]:
"""Evaluate predictions against ground truth."""
if not os.path.exists(idir):
Expand Down Expand Up @@ -673,17 +674,16 @@ def evaluate(
# label_filtering_strategy=LabelFilteringStrategy.INTERSECTION,
page_mapping_path=cvat_overview_path,
)
evaluation = layout_evaluator( # type: ignore
layout_evaluation = layout_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
json.dump(layout_evaluation.model_dump(), fd, indent=2, sort_keys=True)

# Evaluate with the pixel-wise layout evaluation
pixel_layout_evaluator = PixelLayoutEvaluator()
pixel_layout_evaluator = PixelLayoutEvaluator(concurrency=concurrency)
pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator(
idir,
split=split,
Expand All @@ -696,6 +696,9 @@ def evaluate(
pixel_save_root,
)

# TODO: Redesign evaluate() to return multiple evaluation objects
evaluation = pixel_ds_evaluation # type: ignore

elif modality == EvaluationModality.TABLE_STRUCTURE:
table_evaluator = TableEvaluator()
evaluation = table_evaluator( # type: ignore
Expand Down Expand Up @@ -764,7 +767,7 @@ def evaluate(
)

elif modality == EvaluationModality.MARKDOWN_TEXT:
md_evaluator = MarkdownTextEvaluator()
md_evaluator = MarkdownTextEvaluator(concurrency=concurrency)
evaluation = md_evaluator( # type: ignore
idir,
split=split,
Expand Down Expand Up @@ -823,8 +826,8 @@ def evaluate(
def visualize(
modality: EvaluationModality,
benchmark: BenchMarkNames,
idir: Path,
odir: Path,
idir: Path | None = None,
split: str = "test",
):
"""
Expand All @@ -839,10 +842,6 @@ def visualize(
begin_index: Begin index
end_index: End index
"""
if not os.path.exists(idir):
_log.error(f"Input directory not found: {idir}")
return

os.makedirs(odir, exist_ok=True)
metrics_filename = odir / f"evaluation_{benchmark.value}_{modality.value}.json"

Expand Down Expand Up @@ -989,6 +988,11 @@ def visualize(

elif modality == EvaluationModality.READING_ORDER:
try:
# idir is required here
if idir is None or not idir.is_dir():
_log.error(f"Input directory not found: {idir}")
return

with open(metrics_filename, "r") as fd:
ro_evaluation = DatasetReadingOrderEvaluation.model_validate_json(
fd.read()
Expand Down Expand Up @@ -1080,6 +1084,11 @@ def visualize(

elif modality == EvaluationModality.OCR:
try:
# idir is required here
if idir is None or not idir.is_dir():
_log.error(f"Input directory not found: {idir}")
return

with open(metrics_filename, "r") as fd:
ocr_evaluation = OcrDatasetEvaluationResult.model_validate_json(
fd.read()
Expand Down Expand Up @@ -1511,6 +1520,9 @@ def evaluate_cmd(
help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]",
),
] = None,
concurrency: Annotated[
int, typer.Option(help="Concurrency for the computation of each metric")
] = 4,
):
"""Evaluate predictions against ground truth."""
input_dir, output_dir = derive_input_output_dirs(
Expand All @@ -1531,6 +1543,7 @@ def evaluate_cmd(
odir=eval_output_dir,
split=split,
external_predictions_path=external_predictions_path,
concurrency=concurrency,
)


Expand Down Expand Up @@ -1573,8 +1586,8 @@ def visualize_cmd(
visualize(
modality=modality,
benchmark=benchmark,
idir=input_dir,
odir=eval_output_dir,
idir=input_dir,
split=split,
)

Expand Down
2 changes: 2 additions & 0 deletions docling_eval/evaluators/base_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,14 @@ def __init__(
supported_prediction_formats: List[PredictionFormats] = [
PredictionFormats.DOCLING_DOCUMENT
],
concurrency: int = 4,
):
r"""
Parameters
----------
intermediate_evaluations_path: When True the evaluation per example will be saved in a file
"""
self._concurrency = concurrency
self._intermediate_evaluations_path = intermediate_evaluations_path

# Validate the prediction_sources
Expand Down
7 changes: 2 additions & 5 deletions docling_eval/evaluators/layout_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ def __call__(
weighted_map_90_values.append(average_iou_90)
weighted_map_95_values.append(average_iou_95)

_log.info(
_log.debug(
"doc: %s\tprecision: %.2f, recall: %.2f, f1: %.2f, map_50: %.2f, "
"precision_no_pics: %.2f, recall_no_pics: %.2f, f1_no_pics: %.2f",
doc_id_page,
Expand Down Expand Up @@ -528,7 +528,6 @@ def __call__(
segmentation_precision_no_pictures=precision_no_pics,
segmentation_recall_no_pictures=recall_no_pics,
segmentation_f1_no_pictures=f1_no_pics,
# New per-sample element count metrics
true_element_count=true_element_count,
pred_element_count=pred_element_count,
true_table_count=true_table_count,
Expand Down Expand Up @@ -836,9 +835,7 @@ def _find_intersecting_labels(
true_labels: Dict[str, int] = {}
pred_labels: Dict[str, int] = {}

for i, data in tqdm(
enumerate(ds), desc="Layout evaluations", ncols=120, total=len(ds)
):
for i, data in enumerate(ds):
data_record = DatasetRecordWithPrediction.model_validate(data)
true_doc = data_record.ground_truth_doc
pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
Expand Down
Loading
Loading