
Commit a34f264

Authored by: samiuc, samiullahchattha, cau-git
fix: update hyperscalers to support multiple image file types (#118)
* fix: update docling prediction provider to include word cells
* fix: missing parsed_page in set_word_cells method
* Update docling_eval/prediction_providers/docling_provider.py (co-authored by Christoph Auer)
* Update docling_eval/prediction_providers/docling_provider.py (co-authored by Christoph Auer)
* fix: conditionally populate word_cells in _set_word_cells method
* feat: Implement smart weighted character distribution for line text processing (sketched below)
* fix: remove redundant field validators
* refactor: replace BoundingBoxDict with BoundingBox
* refactor: update BoundingBox usage in prediction providers
* refactor: remove unused code
* refactor: move validate_evaluation_results to test_utils

Signed-off-by: samiullahchattha <[email protected]>
Signed-off-by: samiuc <[email protected]>
Co-authored-by: samiullahchattha <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
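The weighted character distribution in the feat commit is only named here, not shown. The sketch below illustrates one plausible reading of the idea: splitting a recognized line's text across its word cells in proportion to each cell's width. The function name, the bounding-box representation, and the weighting rule are all assumptions for illustration, not code from this commit.

# Hypothetical sketch of width-weighted character distribution across word cells.
# distribute_line_text, the bounding-box tuples, and the weighting rule are assumptions.
from typing import List, Tuple

BBox = Tuple[float, float, float, float]  # (left, top, right, bottom)

def distribute_line_text(line_text: str, cell_boxes: List[BBox]) -> List[str]:
    """Assign each cell a slice of line_text proportional to the cell's width."""
    if not cell_boxes:
        return []
    widths = [max(r - l, 0.0) for l, _, r, _ in cell_boxes]
    total = sum(widths) or float(len(cell_boxes))
    out, start = [], 0
    for i, w in enumerate(widths):
        if i == len(widths) - 1:
            end = len(line_text)  # last cell absorbs rounding remainders
        else:
            end = min(len(line_text), start + round(w / total * len(line_text)))
        out.append(line_text[start:end].strip())
        start = end
    return out

# Example: distribute_line_text("HELLO WORLD", [(0, 0, 50, 10), (60, 0, 110, 10)])
# yields ["HELLO", "WORLD"] for two equally wide cells.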
1 parent 629a451 commit a34f264

File tree

7 files changed: +68, -15 lines

docling_eval/evaluators/ocr_evaluator.py (8 additions, 8 deletions)

@@ -3,9 +3,8 @@
 import json
 import logging
 import traceback
-from io import BytesIO
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Union

 from datasets import Dataset, load_dataset
 from docling_core.types.doc import CoordOrigin
@@ -17,10 +16,7 @@
 from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats
 from docling_eval.evaluators.base_evaluator import BaseEvaluator
 from docling_eval.evaluators.ocr.benchmark_runner import _OcrBenchmark
-from docling_eval.evaluators.ocr.evaluation_models import (
-    DocumentEvaluationEntry,
-    OcrDatasetEvaluationResult,
-)
+from docling_eval.evaluators.ocr.evaluation_models import OcrDatasetEvaluationResult
 from docling_eval.evaluators.ocr.processing_utils import parse_segmented_pages

 logging.basicConfig(
@@ -56,7 +52,7 @@ def __call__(
         use_space_for_gt_merge = True

         benchmark_tool = _OcrBenchmark(
-            model_identifier="ocr_model_under_test",
+            model_identifier="ocr_model",
             ignore_zone_filter_type=ignore_zone_filter_config,
             add_space_for_merged_prediction_words=use_space_for_prediction_merge,
             add_space_for_merged_gt_words=use_space_for_gt_merge,
@@ -287,7 +283,11 @@ def __call__(
             if parsed_pred_pages:
                 prediction_segmented_pages = parsed_pred_pages

-            base_image: Image.Image = page_images_data[0]
+            image_item: Union[dict, Image.Image] = page_images_data[0]
+            if isinstance(image_item, dict):
+                base_image: Image.Image = image_item["image"]
+            else:
+                base_image = image_item
             if base_image.mode != "RGB":
                 base_image = base_image.convert("RGB")
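The last hunk exists because, depending on how the dataset stores its page-image column, page_images_data[0] may arrive either as a PIL.Image.Image or as a dict wrapping one. A minimal standalone sketch of that normalization, assuming the same dict-with-"image"-key shape as in the diff (the helper name to_rgb_image is illustrative, not repository code):

# Sketch of the image normalization from the hunk above; to_rgb_image is an assumed name.
from typing import Union

from PIL import Image

def to_rgb_image(image_item: Union[dict, Image.Image]) -> Image.Image:
    """Unwrap a dict-wrapped page image if needed and force RGB mode."""
    if isinstance(image_item, dict):
        base_image = image_item["image"]  # dict form, as handled in the diff
    else:
        base_image = image_item
    if not isinstance(base_image, Image.Image):
        raise TypeError(f"Unexpected page image type: {type(base_image)!r}")
    if base_image.mode != "RGB":
        base_image = base_image.convert("RGB")
    return base_image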

docling_eval/prediction_providers/aws_prediction_provider.py (7 additions, 2 deletions)

@@ -521,7 +521,12 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
         )

         try:
-            if record.mime_type in ["application/pdf", "image/png"]:
+            if record.mime_type in [
+                "application/pdf",
+                "image/png",
+                "image/jpg",
+                "image/jpeg",
+            ]:
                 # Call the AWS Textract API by passing in the image for prediction

                 file_bytes = record.original.stream.read()
@@ -540,7 +545,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
                 )
             else:
                 raise RuntimeError(
-                    f"Unsupported mime type: {record.mime_type}. AzureDocIntelligencePredictionProvider supports 'application/pdf' and 'image/png'"
+                    f"Unsupported mime type: {record.mime_type}. AWSTextractPredictionProvider supports 'application/pdf' 'image/png', 'image/jpeg', and 'image/jpg'"
                 )
         except Exception as e:
             _log.error(f"Error in AWS Textract prediction: {str(e)}")

docling_eval/prediction_providers/azure_prediction_provider.py (4 additions, 3 deletions)

@@ -405,12 +405,13 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
                 f"Successfully processed [{record.doc_id}] using Azure API..!!"
             )

-        elif record.mime_type == "image/png":
+        elif record.mime_type in ["image/png", "image/jpeg", "image/jpg"]:
             # Call the Azure API by passing in the image for prediction
             buf = BytesIO()

             # TODO do this in a loop for all page images in the doc, not just the first.
-            record.ground_truth_page_images[0].save(buf, format="PNG")
+            save_format = "PNG" if record.mime_type == "image/png" else "JPEG"
+            record.ground_truth_page_images[0].save(buf, format=save_format)

             poller = self.doc_intelligence_client.begin_analyze_document(
                 "prebuilt-layout",
@@ -425,7 +426,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
             )
         else:
             raise RuntimeError(
-                f"Unsupported mime type: {record.mime_type}. AzureDocIntelligencePredictionProvider supports 'application/pdf' and 'image/png'"
+                f"Unsupported mime type: {record.mime_type}. AzureDocIntelligencePredictionProvider supports 'application/pdf', 'image/png', 'image/jpeg', and 'image/jpg'"
             )
         # Convert the prediction to doclingDocument
         pred_doc, pred_segmented_pages = self.convert_azure_output_to_docling(
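The Azure change keeps PNG inputs encoded as PNG but serializes JPEG inputs as JPEG before upload. A self-contained sketch of that format selection (encode_page_image is an assumed name; the mode conversion is an extra guard, since JPEG cannot store an alpha channel):

# Standalone sketch of the mime-type -> Pillow save-format choice from the Azure diff.
# encode_page_image is an assumed helper name, not repository code.
from io import BytesIO

from PIL import Image

def encode_page_image(image: Image.Image, mime_type: str) -> bytes:
    """Serialize a page image as PNG or JPEG to match the record's mime type."""
    save_format = "PNG" if mime_type == "image/png" else "JPEG"
    if save_format == "JPEG" and image.mode not in ("RGB", "L"):
        image = image.convert("RGB")  # JPEG has no alpha channel
    buf = BytesIO()
    image.save(buf, format=save_format)
    return buf.getvalue()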

tests/__init__.py (whitespace-only changes)

tests/test_ocr_aws_funsd.py (7 additions, 1 deletion)

@@ -10,6 +10,7 @@
 from docling_eval.prediction_providers.aws_prediction_provider import (
     AWSTextractPredictionProvider,
 )
+from tests.test_utils import validate_evaluation_results

 IS_CI = os.getenv("RUN_IN_CI") == "1"

@@ -33,7 +34,7 @@ def test_run_funsd_builder():
     dataset = FUNSDDatasetBuilder(
         dataset_source=dataset_source,
         target=target_path / "gt_dataset",
-        end_index=4,
+        end_index=1,
     )
     dataset.retrieve_input_dataset()
     dataset.save_to_disk()
@@ -52,6 +53,11 @@ def test_run_funsd_builder():
         odir=target_path / "evaluations" / EvaluationModality.OCR.value,
     )

+    validate_evaluation_results(
+        target_path=target_path,
+        benchmark=BenchMarkNames.FUNSD.value,
+        modality=EvaluationModality.OCR.value,
+    )
     visualize(
         modality=EvaluationModality.OCR,
         benchmark=BenchMarkNames.FUNSD,

tests/test_ocr_google.py (6 additions, 1 deletion)

@@ -11,6 +11,7 @@
 from docling_eval.prediction_providers.google_prediction_provider import (
     GoogleDocAIPredictionProvider,
 )
+from tests.test_utils import validate_evaluation_results

 IS_CI = bool(os.getenv("CI"))

@@ -90,7 +91,11 @@ def test_run_xfund_builder():
         odir=target_path / "evaluations" / EvaluationModality.OCR.value,
         split="val",
     )
-
+    validate_evaluation_results(
+        target_path=target_path,
+        benchmark=BenchMarkNames.XFUND.value,
+        modality=EvaluationModality.OCR.value,
+    )
     visualize(
         modality=EvaluationModality.OCR,
         benchmark=BenchMarkNames.XFUND,

tests/test_utils.py (36 additions, 0 deletions)

@@ -0,0 +1,36 @@
+import json
+from pathlib import Path
+
+from docling_eval.evaluators.ocr.evaluation_models import OcrDatasetEvaluationResult
+
+
+def validate_evaluation_results(
+    target_path: Path,
+    benchmark: str,
+    modality: str,
+    evaluation_type: str = "ocr",
+) -> OcrDatasetEvaluationResult:
+    eval_json_filename = f"evaluation_{benchmark}_{modality}.json"
+    eval_json_path = target_path / "evaluations" / evaluation_type / eval_json_filename
+
+    assert eval_json_path.exists(), f"Evaluation JSON file not found: {eval_json_path}"
+
+    with open(eval_json_path, "r") as f:
+        result = json.load(f)
+
+    assert result is not None, "Evaluation JSON file is empty or invalid."
+    assert result, "Overall metrics not found in evaluation results."
+
+    metrics = OcrDatasetEvaluationResult(**result)
+
+    assert (
+        metrics.f1_score > 0
+    ), f"F1 score ({metrics.f1_score}) must be greater than 0."
+    assert (
+        metrics.precision > 0
+    ), f"Precision score ({metrics.precision}) must be greater than 0."
+    assert (
+        metrics.recall > 0
+    ), f"Recall score ({metrics.recall}) must be greater than 0."
+
+    return metrics
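For reference, the new helper can also be called outside the updated AWS and Google tests. A minimal usage sketch, assuming a target directory produced by an earlier evaluation run and that the benchmark and modality enum values render as the plain strings shown (both are assumptions):

# Minimal usage sketch; the path and the literal "FUNSD"/"ocr" strings are assumptions
# standing in for BenchMarkNames.FUNSD.value and EvaluationModality.OCR.value.
from pathlib import Path

from tests.test_utils import validate_evaluation_results

metrics = validate_evaluation_results(
    target_path=Path("scratch/funsd_run"),  # hypothetical output directory of a prior run
    benchmark="FUNSD",
    modality="ocr",
)
print(metrics.f1_score, metrics.precision, metrics.recall)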
