-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathbase_evaluator.py
More file actions
140 lines (114 loc) · 4.42 KB
/
base_evaluator.py
File metadata and controls
140 lines (114 loc) · 4.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import json
import logging
from enum import Enum
from pathlib import Path
from typing import Any, Dict, Generic, List, Optional, TypeVar
from docling.datamodel.base_models import ConversionStatus
from docling_core.types.doc.document import (
DoclingDocument,
DocTagsDocument,
DocTagsPage,
)
from pydantic import BaseModel
from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction
from docling_eval.datamodels.types import PredictionFormats
_log = logging.getLogger(__name__)
class UnitEvaluation(BaseModel):
    """Marker base class for per-sample (unit) evaluation results."""

    # Intentionally empty: concrete evaluators subclass this with their
    # own metric fields (see the UnitEvaluationType TypeVar below).
    pass
class EvaluationRejectionType(str, Enum):
    """Reasons why a dataset sample may be excluded from an evaluation."""

    INVALID_CONVERSION_STATUS = "invalid_conversion_status"
    MISSING_PREDICTION = "missing_prediction"
    # NOTE: the misspelled member name is kept for backward compatibility;
    # existing callers reference it by name.
    MISMATHCED_DOCUMENT = "mismatched_document"
    # Correctly-spelled alias: a duplicate value in an Enum creates an alias
    # of the first member with that value, so both names resolve to the same
    # member and serialized values are unchanged.
    MISMATCHED_DOCUMENT = "mismatched_document"
    BROKEN_PREDICTION = "broken_prediction"
    EVALUATION_ERROR = "evaluation_error"
class DatasetEvaluation(BaseModel):
    """Base class for aggregated evaluation results over a dataset split."""

    # -1 acts as a "not yet evaluated" sentinel.
    evaluated_samples: int = -1
    # Count of rejected samples per rejection reason. The mutable {} default
    # is safe here: pydantic copies field defaults per instance (unlike plain
    # Python default arguments).
    rejected_samples: Dict[EvaluationRejectionType, int] = {}
# Type variables that let BaseEvaluator subclasses declare the concrete
# per-sample and per-dataset result models they produce.
UnitEvaluationType = TypeVar("UnitEvaluationType", bound=UnitEvaluation)
DatasetEvaluationType = TypeVar("DatasetEvaluationType", bound=DatasetEvaluation)
def docling_document_from_doctags(
    data_record: DatasetRecordWithPrediction,
) -> DoclingDocument:
    r"""Reconstruct a DoclingDocument from the DocTags stored in a record.

    Parameters
    ----------
    data_record: Record whose ``original`` field holds the raw DocTags
        string of the prediction.

    Returns
    -------
    DoclingDocument built from a single DocTags page, paired with the first
    ground-truth page image when one is available.

    Raises
    ------
    RuntimeError: If ``data_record.original`` is not a string.
    """
    doc_id = data_record.doc_id
    doctags = data_record.original
    if not isinstance(doctags, str):
        raise RuntimeError("Invalid format of original prediction")

    # Single-page assumption: only the first ground-truth image (if any) is
    # attached to the DocTags page.
    page_image = (
        data_record.ground_truth_page_images[0]
        if data_record.ground_truth_page_images
        else None
    )
    doctags_page = DocTagsPage(tokens=doctags, image=page_image)
    doctags_doc = DocTagsDocument(pages=[doctags_page])
    pred_doc = DoclingDocument.load_from_doctags(doctags_doc, document_name=doc_id)
    return pred_doc
class BaseEvaluator(Generic[UnitEvaluationType, DatasetEvaluationType]):
    r"""
    Base class for all evaluators.

    Subclasses implement ``__call__`` to evaluate a dataset split and return
    their concrete ``DatasetEvaluationType``.
    """

    def __init__(
        self,
        intermediate_evaluations_path: Optional[Path] = None,
        prediction_sources: Optional[List[PredictionFormats]] = None,
        supported_prediction_formats: Optional[List[PredictionFormats]] = None,
        concurrency: int = 4,
    ):
        r"""
        Parameters
        ----------
        intermediate_evaluations_path: When set, the evaluation per example
            will be saved as a JSON file inside this directory.
        prediction_sources: Prediction formats to read from the dataset.
            Defaults to ``[PredictionFormats.DOCLING_DOCUMENT]``.
        supported_prediction_formats: Formats this evaluator accepts.
            Defaults to ``[PredictionFormats.DOCLING_DOCUMENT]``.
        concurrency: Worker count available to implementations.

        Raises
        ------
        RuntimeError: If ``prediction_sources`` contains a format not listed
            in ``supported_prediction_formats``.
        """
        # Replace the original mutable default arguments (shared lists) with
        # per-call construction; the effective defaults are unchanged.
        if prediction_sources is None:
            prediction_sources = [PredictionFormats.DOCLING_DOCUMENT]
        if supported_prediction_formats is None:
            supported_prediction_formats = [PredictionFormats.DOCLING_DOCUMENT]

        self._concurrency = concurrency
        self._intermediate_evaluations_path = intermediate_evaluations_path

        # Validate the prediction_sources against the supported formats.
        if set(prediction_sources) - set(supported_prediction_formats):
            msg = "Unsupported prediction_sources. "
            msg += f"It should be something out of {supported_prediction_formats}"
            raise RuntimeError(msg)
        self._prediction_sources = prediction_sources
        self._supported_prediction_sources = supported_prediction_formats

        # Conversion statuses for which a sample is accepted for evaluation.
        self._accepted_status: List[ConversionStatus] = [
            ConversionStatus.SUCCESS,
            ConversionStatus.PARTIAL_SUCCESS,
        ]

    def __call__(
        self,
        ds_path: Path,
        split: str = "test",
        external_predictions_path: Optional[Path] = None,
    ) -> DatasetEvaluationType:
        r"""
        Perform the evaluation. Stub: subclasses must override this.
        """
        return None  # type: ignore

    def supported_prediction_formats(self) -> List[PredictionFormats]:
        r"""
        Return the supported formats for predictions.
        """
        return self._supported_prediction_sources

    def save_intermediate_evaluations(
        self,
        evaluation_name: str,
        enunumerate_id: int,  # NOTE: misspelled but kept for keyword-callers
        doc_id: str,
        evaluations: List[UnitEvaluationType],
    ) -> Optional[Path]:
        r"""
        Utility method to save intermediate evaluation results.

        Returns immediately with None if ``intermediate_evaluations_path`` is
        not set; otherwise returns the Path of the written JSON file.
        """
        # Bug fix: the original guard was inverted (`if path: return None`),
        # so results were never written when a path WAS configured, and the
        # code fell through to a crash (`None / filename`) when it was not.
        out_dir = self._intermediate_evaluations_path
        if out_dir is None:
            return None

        evals = [ev.model_dump() for ev in evaluations]
        evaluation_filename = f"{evaluation_name}_{enunumerate_id:05d}_{doc_id}.json"
        evaluation_fn = out_dir / evaluation_filename
        _log.info("Saving intermediate evaluations: %s", evaluation_fn)
        with open(evaluation_fn, "w") as fd:
            json.dump(evals, fd)
        return evaluation_fn