Commit be62102

Authored by cau-git, samiuc, and samiullahchattha
feat: OCR evaluator (#63)
* Add README for Docling-DPBench
* Pick ocr_evaluator, create basic test unit
* Add basic OCR evaluator test unit
* fix: update ocr evaluator
* fix ocr tests
* address review comments
* remove unused imports from OCR test files
* Remove unnecessary hasattr checks
* Add back test dependency for OCR

---------

Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: samiuc <[email protected]>
Co-authored-by: samiuc <[email protected]>
Co-authored-by: samiullahchattha <[email protected]>
1 parent dee40e8 commit be62102

13 files changed: +504, -23 lines

docling_eval/cli/main.py

Lines changed: 11 additions & 0 deletions
@@ -54,6 +54,7 @@
     DatasetMarkdownEvaluation,
     MarkdownTextEvaluator,
 )
+from docling_eval.evaluators.ocr_evaluator import OCREvaluator
 from docling_eval.evaluators.readingorder_evaluator import (
     DatasetReadingOrderEvaluation,
     ReadingOrderEvaluator,

@@ -351,6 +352,16 @@ def evaluate(
         with open(save_fn, "w") as fd:
             json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
 
+    elif modality == EvaluationModality.OCR:
+        ocr_evaluator = OCREvaluator()
+        ocr_evaluation = ocr_evaluator(
+            idir,
+            split=split,
+        )
+
+        with open(save_fn, "w") as fd:
+            json.dump(ocr_evaluation.model_dump(), fd, indent=2, sort_keys=True)
+
     elif modality == EvaluationModality.READING_ORDER:
         readingorder_evaluator = ReadingOrderEvaluator()
         evaluation = readingorder_evaluator(  # type: ignore
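
The new OCR branch follows the same pattern as the other modalities: construct the evaluator, run it over the dataset directory, and serialize the resulting Pydantic model to JSON. A minimal standalone sketch of that flow; the paths below are placeholders, not values taken from the CLI:

```python
import json
from pathlib import Path

from docling_eval.evaluators.ocr_evaluator import OCREvaluator

# Placeholder paths; the CLI resolves these from its own input/output options.
idir = Path("benchmarks/my-ocr-dataset")
save_fn = Path("evaluation_OCR.json")

ocr_evaluator = OCREvaluator()
ocr_evaluation = ocr_evaluator(idir, split="test")

# Persist the DatasetOcrEvaluation model, mirroring the CLI branch above.
with open(save_fn, "w") as fd:
    json.dump(ocr_evaluation.model_dump(), fd, indent=2, sort_keys=True)
```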

docling_eval/evaluators/base_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ def supported_prediction_formats(self) -> List[PredictionFormats]:
         """
         return self._supported_prediction_sources
 
-    def save_intermediate_evalutions(
+    def save_intermediate_evaluations(
         self,
         evaluation_name: str,
         enunumerate_id: int,

docling_eval/evaluators/layout_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -306,7 +306,7 @@ def __call__(
             )
             evaluations_per_image.append(image_evaluation)
             if self._intermediate_evaluations_path:
-                self.save_intermediate_evalutions(
+                self.save_intermediate_evaluations(
                     "Layout_image", i, doc_id, evaluations_per_image
                 )
 

docling_eval/evaluators/markdown_text_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -185,7 +185,7 @@ def __call__(
             evaluations.append(md_evaluation)
 
             if self._intermediate_evaluations_path:
-                self.save_intermediate_evalutions("MD", i, doc_id, evaluations)
+                self.save_intermediate_evaluations("MD", i, doc_id, evaluations)
 
         ds_md_evalutions = DatasetMarkdownEvaluation(
             evaluated_samples=len(evaluations),

docling_eval/evaluators/ocr_evaluator.py

Lines changed: 151 additions & 0 deletions (new file)

import glob
import json
import logging
import os
import statistics
from pathlib import Path
from typing import Any, Dict, Generic, List, Optional, TypeVar

import evaluate
import pandas as pd
from datasets import Dataset, load_dataset
from docling_core.types.doc.document import DoclingDocument
from pydantic import BaseModel
from tqdm import tqdm

from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction
from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats
from docling_eval.evaluators.base_evaluator import BaseEvaluator

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

_log = logging.getLogger(__name__)


class PageOcrEvaluation(BaseModel):
    doc_id: str
    true_text: str
    pred_text: str
    cer: float
    char_accuracy: float


class DatasetOcrEvaluation(BaseModel):
    evaluations: List[PageOcrEvaluation]
    mean_character_accuracy: float


class OCREvaluator(BaseEvaluator):
    """Evaluator for OCR tasks that computes Character Accuracy"""

    def __init__(
        self,
        intermediate_evaluations_path: Optional[Path] = None,
        prediction_sources: List[PredictionFormats] = [
            PredictionFormats.DOCLING_DOCUMENT
        ],
    ):
        """Initialize the OCR evaluator"""
        super().__init__(
            intermediate_evaluations_path=intermediate_evaluations_path,
            prediction_sources=prediction_sources,
            supported_prediction_formats=[PredictionFormats.DOCLING_DOCUMENT],
        )
        # Load the CER evaluation metric
        # https://huggingface.co/spaces/evaluate-metric/cer
        self._cer_eval = evaluate.load("cer")

    def __call__(
        self,
        ds_path: Path,
        split: str = "test",
    ) -> DatasetOcrEvaluation:

        _log.info("Loading the split '%s' from: '%s'", split, ds_path)
        split_path = str(ds_path / split / "*.parquet")
        split_files = glob.glob(split_path)
        _log.info("Files: %s", split_files)
        ds = load_dataset("parquet", data_files={split: split_files})
        _log.info("Overview of dataset: %s", ds)

        # Select the split
        ds_selection: Dataset = ds[split]

        text_evaluations_list = []
        char_accuracy_list = []

        for i, data in tqdm(
            enumerate(ds_selection),
            desc="Evaluating OCR",
            ncols=120,
            total=len(ds_selection),
        ):
            data_record = DatasetRecordWithPrediction.model_validate(data)
            doc_id = data_record.doc_id
            if data_record.status not in self._accepted_status:
                _log.error(
                    "Skipping record without successfull conversion status: %s", doc_id
                )
                continue

            true_doc = data_record.ground_truth_doc
            pred_doc = data_record.predicted_doc

            if not pred_doc:
                _log.error("There is no prediction for doc_id=%s", doc_id)
                continue

            true_text = self._extract_text(true_doc)
            pred_text = self._extract_text(pred_doc)

            if true_text and pred_text:
                cer = self._compute_cer_score(true_text, pred_text)
                char_accuracy = 1.0 - cer
            else:
                cer = 1.0  # max error when text is missing
                char_accuracy = 0.0  # zero accuracy when text is missing

            char_accuracy_list.append(char_accuracy)

            page_evaluation = PageOcrEvaluation(
                doc_id=doc_id,
                true_text=true_text,
                pred_text=pred_text,
                cer=cer,
                char_accuracy=char_accuracy,
            )

            text_evaluations_list.append(page_evaluation)
            if self._intermediate_evaluations_path:
                self.save_intermediate_evaluations(
                    evaluation_name="ocr_eval",
                    enunumerate_id=i,
                    doc_id=doc_id,
                    evaluations=[page_evaluation],
                )

        mean_character_accuracy = (
            statistics.mean(char_accuracy_list) if char_accuracy_list else 0.0
        )

        _log.info(f"Mean Character Accuracy: {mean_character_accuracy:.4f}")

        return DatasetOcrEvaluation(
            evaluations=text_evaluations_list,
            mean_character_accuracy=mean_character_accuracy,
        )

    def _compute_cer_score(self, true_txt: str, pred_txt: str) -> float:
        """Compute Character Error Rate"""
        result = self._cer_eval.compute(predictions=[pred_txt], references=[true_txt])
        return result

    def _extract_text(self, doc: DoclingDocument) -> str:
        """Extract text from document JSON structure"""
        extracted_text = ""
        for text_item in doc.texts:
            extracted_text += text_item.text + " "
        return extracted_text.strip()
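
The per-page score is simply char_accuracy = 1 - CER, where CER is the number of character-level edits (insertions, deletions, substitutions) divided by the reference length. A small illustrative sketch of that relationship using the same Hugging Face `cer` metric the evaluator loads; the strings are made-up examples, not dataset content:

```python
import evaluate

# Same metric the evaluator loads in __init__.
cer_metric = evaluate.load("cer")

reference = "Docling converts documents."   # hypothetical ground-truth text
prediction = "Docling converts docunents."  # hypothetical OCR output with one substitution

cer = cer_metric.compute(predictions=[prediction], references=[reference])
char_accuracy = 1.0 - cer

print(f"CER: {cer:.4f}, character accuracy: {char_accuracy:.4f}")
```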

docling_eval/evaluators/readingorder_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -130,7 +130,7 @@ def __call__(
             evaluations.append(page_evaluation)
 
             if self._intermediate_evaluations_path:
-                self.save_intermediate_evalutions(
+                self.save_intermediate_evaluations(
                     "reading_order", i, doc_id, evaluations
                 )
 

docling_eval/evaluators/table_evaluator.py

Lines changed: 4 additions & 2 deletions
@@ -182,7 +182,7 @@ def __call__(
                table_evaluations.extend(results)
 
                if self._intermediate_evaluations_path:
-                    self.save_intermediate_evalutions(
+                    self.save_intermediate_evaluations(
                        "TEDs_struct_content", i, doc_id, results
                    )
 

@@ -194,7 +194,9 @@ def __call__(
                    )
                    table_struct_evaluations.extend(results)
                    if self._intermediate_evaluations_path:
-                        self.save_intermediate_evalutions("TEDs_struct", i, doc_id, results)
+                        self.save_intermediate_evaluations(
+                            "TEDs_struct", i, doc_id, results
+                        )
 
            except Exception as ex:
                rejected_samples[EvaluationRejectionType.EVALUATION_ERROR] += 1
