
Commit a34f264

Authored by: samiuc, samiullahchattha, cau-git
fix: update hyperscalers to support multiple image file types (#118)
* fix: update docling prediction provider to include word cells
* fix: missing parsed_page in set_word_cells method
* Update docling_eval/prediction_providers/docling_provider.py (co-authored by Christoph Auer)
* Update docling_eval/prediction_providers/docling_provider.py (co-authored by Christoph Auer)
* fix: conditionally populate word_cells in _set_word_cells method
* feat: Implement smart weighted character distribution for line text processing (sketched below)
* fix: remove redundant field validators
* refactor: replace BoundingBoxDict with BoundingBox
* refactor: update BoundingBox usage in prediction providers
* refactor: remove unused code
* refactor: move validate_evaluation_results to test_utils

Signed-off-by: samiullahchattha <[email protected]>
Signed-off-by: samiuc <[email protected]>
Co-authored-by: samiullahchattha <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
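The weighted character distribution in the feat commit is only named here, not shown. The sketch below illustrates one plausible reading of the idea: splitting a recognized line's text across its word cells in proportion to each cell's width. The function name, the bounding-box representation, and the weighting rule are all assumptions for illustration, not code from this commit.

# Hypothetical sketch of width-weighted character distribution across word cells.
# distribute_line_text, the bounding-box tuples, and the weighting rule are assumptions.
from typing import List, Tuple

BBox = Tuple[float, float, float, float]  # (left, top, right, bottom)

def distribute_line_text(line_text: str, cell_boxes: List[BBox]) -> List[str]:
    """Assign each cell a slice of line_text proportional to the cell's width."""
    if not cell_boxes:
        return []
    widths = [max(r - l, 0.0) for l, _, r, _ in cell_boxes]
    total = sum(widths) or float(len(cell_boxes))
    out, start = [], 0
    for i, w in enumerate(widths):
        if i == len(widths) - 1:
            end = len(line_text)  # last cell absorbs rounding remainders
        else:
            end = min(len(line_text), start + round(w / total * len(line_text)))
        out.append(line_text[start:end].strip())
        start = end
    return out

# Example: distribute_line_text("HELLO WORLD", [(0, 0, 50, 10), (60, 0, 110, 10)])
# yields ["HELLO", "WORLD"] for two equally wide cells.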
1 parent 629a451 commit a34f264

File tree

7 files changed: +68, -15 lines

docling_eval/evaluators/ocr_evaluator.py (8 additions, 8 deletions)

@@ -3,9 +3,8 @@
 import json
 import logging
 import traceback
-from io import BytesIO
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Union

 from datasets import Dataset, load_dataset
 from docling_core.types.doc import CoordOrigin
@@ -17,10 +16,7 @@
 from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats
 from docling_eval.evaluators.base_evaluator import BaseEvaluator
 from docling_eval.evaluators.ocr.benchmark_runner import _OcrBenchmark
-from docling_eval.evaluators.ocr.evaluation_models import (
-    DocumentEvaluationEntry,
-    OcrDatasetEvaluationResult,
-)
+from docling_eval.evaluators.ocr.evaluation_models import OcrDatasetEvaluationResult
 from docling_eval.evaluators.ocr.processing_utils import parse_segmented_pages

 logging.basicConfig(
@@ -56,7 +52,7 @@ def __call__(
         use_space_for_gt_merge = True

         benchmark_tool = _OcrBenchmark(
-            model_identifier="ocr_model_under_test",
+            model_identifier="ocr_model",
             ignore_zone_filter_type=ignore_zone_filter_config,
             add_space_for_merged_prediction_words=use_space_for_prediction_merge,
             add_space_for_merged_gt_words=use_space_for_gt_merge,
@@ -287,7 +283,11 @@ def __call__(
             if parsed_pred_pages:
                 prediction_segmented_pages = parsed_pred_pages

-            base_image: Image.Image = page_images_data[0]
+            image_item: Union[dict, Image.Image] = page_images_data[0]
+            if isinstance(image_item, dict):
+                base_image: Image.Image = image_item["image"]
+            else:
+                base_image = image_item
             if base_image.mode != "RGB":
                 base_image = base_image.convert("RGB")
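The last hunk exists because, depending on how the dataset stores its page-image column, page_images_data[0] may arrive either as a PIL.Image.Image or as a dict wrapping one. A minimal standalone sketch of that normalization, assuming the same dict-with-"image"-key shape as in the diff (the helper name to_rgb_image is illustrative, not repository code):

# Sketch of the image normalization from the hunk above; to_rgb_image is an assumed name.
from typing import Union

from PIL import Image

def to_rgb_image(image_item: Union[dict, Image.Image]) -> Image.Image:
    """Unwrap a dict-wrapped page image if needed and force RGB mode."""
    if isinstance(image_item, dict):
        base_image = image_item["image"]  # dict form, as handled in the diff
    else:
        base_image = image_item
    if not isinstance(base_image, Image.Image):
        raise TypeError(f"Unexpected page image type: {type(base_image)!r}")
    if base_image.mode != "RGB":
        base_image = base_image.convert("RGB")
    return base_image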

docling_eval/prediction_providers/aws_prediction_provider.py (7 additions, 2 deletions)

@@ -521,7 +521,12 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
         )

         try:
-            if record.mime_type in ["application/pdf", "image/png"]:
+            if record.mime_type in [
+                "application/pdf",
+                "image/png",
+                "image/jpg",
+                "image/jpeg",
+            ]:
                 # Call the AWS Textract API by passing in the image for prediction

                 file_bytes = record.original.stream.read()
@@ -540,7 +545,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
                 )
             else:
                 raise RuntimeError(
-                    f"Unsupported mime type: {record.mime_type}. AzureDocIntelligencePredictionProvider supports 'application/pdf' and 'image/png'"
+                    f"Unsupported mime type: {record.mime_type}. AWSTextractPredictionProvider supports 'application/pdf' 'image/png', 'image/jpeg', and 'image/jpg'"
                 )
         except Exception as e:
             _log.error(f"Error in AWS Textract prediction: {str(e)}")

docling_eval/prediction_providers/azure_prediction_provider.py (4 additions, 3 deletions)

@@ -405,12 +405,13 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
                 f"Successfully processed [{record.doc_id}] using Azure API..!!"
             )

-        elif record.mime_type == "image/png":
+        elif record.mime_type in ["image/png", "image/jpeg", "image/jpg"]:
             # Call the Azure API by passing in the image for prediction
             buf = BytesIO()

             # TODO do this in a loop for all page images in the doc, not just the first.
-            record.ground_truth_page_images[0].save(buf, format="PNG")
+            save_format = "PNG" if record.mime_type == "image/png" else "JPEG"
+            record.ground_truth_page_images[0].save(buf, format=save_format)

             poller = self.doc_intelligence_client.begin_analyze_document(
                 "prebuilt-layout",
@@ -425,7 +426,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
             )
         else:
             raise RuntimeError(
-                f"Unsupported mime type: {record.mime_type}. AzureDocIntelligencePredictionProvider supports 'application/pdf' and 'image/png'"
+                f"Unsupported mime type: {record.mime_type}. AzureDocIntelligencePredictionProvider supports 'application/pdf', 'image/png', 'image/jpeg', and 'image/jpg'"
             )
         # Convert the prediction to doclingDocument
         pred_doc, pred_segmented_pages = self.convert_azure_output_to_docling(
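The Azure change keeps PNG inputs encoded as PNG but serializes JPEG inputs as JPEG before upload. A self-contained sketch of that format selection (encode_page_image is an assumed name; the mode conversion is an extra guard, since JPEG cannot store an alpha channel):

# Standalone sketch of the mime-type -> Pillow save-format choice from the Azure diff.
# encode_page_image is an assumed helper name, not repository code.
from io import BytesIO

from PIL import Image

def encode_page_image(image: Image.Image, mime_type: str) -> bytes:
    """Serialize a page image as PNG or JPEG to match the record's mime type."""
    save_format = "PNG" if mime_type == "image/png" else "JPEG"
    if save_format == "JPEG" and image.mode not in ("RGB", "L"):
        image = image.convert("RGB")  # JPEG has no alpha channel
    buf = BytesIO()
    image.save(buf, format=save_format)
    return buf.getvalue()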

tests/__init__.py (whitespace-only changes)

tests/test_ocr_aws_funsd.py (7 additions, 1 deletion)

@@ -10,6 +10,7 @@
 from docling_eval.prediction_providers.aws_prediction_provider import (
     AWSTextractPredictionProvider,
 )
+from tests.test_utils import validate_evaluation_results

 IS_CI = os.getenv("RUN_IN_CI") == "1"

@@ -33,7 +34,7 @@ def test_run_funsd_builder():
     dataset = FUNSDDatasetBuilder(
         dataset_source=dataset_source,
         target=target_path / "gt_dataset",
-        end_index=4,
+        end_index=1,
     )
     dataset.retrieve_input_dataset()
     dataset.save_to_disk()
@@ -52,6 +53,11 @@ def test_run_funsd_builder():
         odir=target_path / "evaluations" / EvaluationModality.OCR.value,
     )

+    validate_evaluation_results(
+        target_path=target_path,
+        benchmark=BenchMarkNames.FUNSD.value,
+        modality=EvaluationModality.OCR.value,
+    )
     visualize(
         modality=EvaluationModality.OCR,
         benchmark=BenchMarkNames.FUNSD,

tests/test_ocr_google.py (6 additions, 1 deletion)

@@ -11,6 +11,7 @@
 from docling_eval.prediction_providers.google_prediction_provider import (
     GoogleDocAIPredictionProvider,
 )
+from tests.test_utils import validate_evaluation_results

 IS_CI = bool(os.getenv("CI"))

@@ -90,7 +91,11 @@ def test_run_xfund_builder():
         odir=target_path / "evaluations" / EvaluationModality.OCR.value,
         split="val",
     )
-
+    validate_evaluation_results(
+        target_path=target_path,
+        benchmark=BenchMarkNames.XFUND.value,
+        modality=EvaluationModality.OCR.value,
+    )
     visualize(
         modality=EvaluationModality.OCR,
         benchmark=BenchMarkNames.XFUND,

tests/test_utils.py (36 additions, 0 deletions)

@@ -0,0 +1,36 @@
+import json
+from pathlib import Path
+
+from docling_eval.evaluators.ocr.evaluation_models import OcrDatasetEvaluationResult
+
+
+def validate_evaluation_results(
+    target_path: Path,
+    benchmark: str,
+    modality: str,
+    evaluation_type: str = "ocr",
+) -> OcrDatasetEvaluationResult:
+    eval_json_filename = f"evaluation_{benchmark}_{modality}.json"
+    eval_json_path = target_path / "evaluations" / evaluation_type / eval_json_filename
+
+    assert eval_json_path.exists(), f"Evaluation JSON file not found: {eval_json_path}"
+
+    with open(eval_json_path, "r") as f:
+        result = json.load(f)
+
+    assert result is not None, "Evaluation JSON file is empty or invalid."
+    assert result, "Overall metrics not found in evaluation results."
+
+    metrics = OcrDatasetEvaluationResult(**result)
+
+    assert (
+        metrics.f1_score > 0
+    ), f"F1 score ({metrics.f1_score}) must be greater than 0."
+    assert (
+        metrics.precision > 0
+    ), f"Precision score ({metrics.precision}) must be greater than 0."
+    assert (
+        metrics.recall > 0
+    ), f"Recall score ({metrics.recall}) must be greater than 0."
+
+    return metrics
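For reference, the new helper can also be called outside the updated AWS and Google tests. A minimal usage sketch, assuming a target directory produced by an earlier evaluation run and that the benchmark and modality enum values render as the plain strings shown (both are assumptions):

# Minimal usage sketch; the path and the literal "FUNSD"/"ocr" strings are assumptions
# standing in for BenchMarkNames.FUNSD.value and EvaluationModality.OCR.value.
from pathlib import Path

from tests.test_utils import validate_evaluation_results

metrics = validate_evaluation_results(
    target_path=Path("scratch/funsd_run"),  # hypothetical output directory of a prior run
    benchmark="FUNSD",
    modality="ocr",
)
print(metrics.f1_score, metrics.precision, metrics.recall)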
