separate batch ocr from evaluation into 2 pipelines

marwan37 · marwan37 · commit 8e709237c6ce · 2025-04-03T18:03:08.000-05:00
diff --git a/omni-reader/pipelines/__init__.py b/omni-reader/pipelines/__init__.py
@@ -14,8 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""OCR comparison pipelines package."""
+"""OCR pipelines package."""
 
-from pipelines.ocr_pipeline import ocr_comparison_pipeline
-
-__all__ = ["ocr_comparison_pipeline"]
+from pipelines.batch_pipeline import ocr_batch_pipeline, run_ocr_batch_pipeline
+from pipelines.evaluation_pipeline import ocr_evaluation_pipeline, run_ocr_evaluation_pipeline
diff --git a/omni-reader/pipelines/batch_pipeline.py b/omni-reader/pipelines/batch_pipeline.py
@@ -0,0 +1,117 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2025. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""OCR Batch Pipeline implementation for processing images with multiple models."""
+
+from typing import Any, Dict, List, Optional
+
+from dotenv import load_dotenv
+from zenml import pipeline
+from zenml.logger import get_logger
+
+from steps import (
+    load_images,
+    run_ocr,
+    save_ocr_results,
+)
+
+load_dotenv()
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def ocr_batch_pipeline(
+    image_paths: Optional[List[str]] = None,
+    image_folder: Optional[str] = None,
+    custom_prompt: Optional[str] = None,
+    models: Optional[List[str]] = None,
+    save_ocr_results_data: bool = False,
+    ocr_results_output_dir: str = "ocr_results",
+) -> None:
+    """Run OCR batch processing pipeline with multiple models.
+
+    Args:
+        image_paths: Optional list of specific image paths to process
+        image_folder: Optional folder to search for images
+        custom_prompt: Optional custom prompt to use for the models
+        models: List of model names to use for OCR (at least one is required)
+        save_ocr_results_data: Whether to save OCR results
+        ocr_results_output_dir: Directory to save OCR results
+
+    Raises:
+        ValueError: If `models` is None or empty
+    """
+    if not models:
+        raise ValueError("At least one model must be specified for the batch pipeline")
+
+    images = load_images(
+        image_paths=image_paths,
+        image_folder=image_folder,
+    )
+    model_results = run_ocr(
+        images=images,
+        models=models,
+        custom_prompt=custom_prompt,
+    )
+
+    if save_ocr_results_data:
+        save_ocr_results(
+            ocr_results=model_results,
+            model_names=models,
+            output_dir=ocr_results_output_dir,
+        )
+
+
+def run_ocr_batch_pipeline(config: Dict[str, Any]) -> None:
+    """Run the OCR batch pipeline from a configuration dictionary.
+
+    Args:
+        config: Dictionary containing configuration
+
+    Raises:
+        ValueError: If no models are selected in the configuration
+    """
+    # A section whose key is present but has no value (for instance an empty key
+    # in a YAML file) is None rather than missing, so `.get(key, {})` would still
+    # hand back None. Normalise every section to a dict before reading from it.
+    pipeline_params = config.get("parameters") or {}
+    pipeline_steps = config.get("steps") or {}
+    run_ocr_params = (pipeline_steps.get("run_ocr") or {}).get("parameters") or {}
+    save_ocr_results_params = (
+        (pipeline_steps.get("save_ocr_results") or {}).get("parameters") or {}
+    )
+
+    # Check pipeline mode
+    mode = pipeline_params.get("mode", "batch")
+    if mode != "batch":
+        logger.warning(f"Expected mode 'batch', but got '{mode}'. Proceeding anyway.")
+
+    # Get selected models from config
+    selected_models = pipeline_params.get("selected_models") or []
+    if not selected_models:
+        raise ValueError(
+            "No models selected in configuration. Add 'selected_models' to parameters section."
+        )
+
+    # Create pipeline instance
+    pipeline_instance = ocr_batch_pipeline.with_options(
+        enable_cache=config.get("enable_cache", False),
+    )
+
+    # Run the pipeline
+    pipeline_instance(
+        image_paths=pipeline_params.get("input_image_paths", []),
+        image_folder=pipeline_params.get("input_image_folder"),
+        custom_prompt=run_ocr_params.get("custom_prompt"),
+        models=selected_models,
+        save_ocr_results_data=save_ocr_results_params.get("save_locally", False),
+        ocr_results_output_dir=save_ocr_results_params.get("output_dir", "ocr_results"),
+    )
diff --git a/omni-reader/pipelines/evaluation_pipeline.py b/omni-reader/pipelines/evaluation_pipeline.py
@@ -0,0 +1,142 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2025. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""OCR Evaluation Pipeline implementation for comparing models using existing results."""
+
+from typing import Any, Dict, List, Optional
+
+import polars as pl
+from dotenv import load_dotenv
+from zenml import pipeline, step
+from zenml.logger import get_logger
+
+from steps import (
+    evaluate_models,
+    load_ground_truth_texts,
+    load_ocr_results,
+    save_visualization,
+)
+
+load_dotenv()
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def ocr_evaluation_pipeline(
+    model_names: Optional[List[str]] = None,
+    results_dir: str = "ocr_results",
+    result_files: Optional[List[str]] = None,
+    ground_truth_folder: Optional[str] = None,
+    ground_truth_files: Optional[List[str]] = None,
+    save_visualization_data: bool = False,
+    visualization_output_dir: str = "visualizations",
+) -> None:
+    """Run OCR evaluation pipeline comparing existing model results.
+
+    Args:
+        model_names: Names of the models to compare (at least two are required)
+        results_dir: Directory passed to `load_ocr_results` to locate existing results
+        result_files: Optional list of result files passed to `load_ocr_results`
+        ground_truth_folder: Optional folder containing ground truth texts
+        ground_truth_files: Optional list of ground truth files
+        save_visualization_data: Whether to save the evaluation visualization
+        visualization_output_dir: Directory to save the visualization
+
+    Raises:
+        ValueError: If fewer than two models are given, or if neither
+            `ground_truth_folder` nor `ground_truth_files` is provided
+    """
+    if not model_names or len(model_names) < 2:
+        raise ValueError("At least two models are required for comparison")
+
+    if not ground_truth_folder and not ground_truth_files:
+        raise ValueError(
+            "Either ground_truth_folder or ground_truth_files must be provided for evaluation"
+        )
+
+    model_results = load_ocr_results(
+        model_names=model_names,
+        results_dir=results_dir,
+        result_files=result_files,
+    )
+
+    ground_truth_df = load_ground_truth_texts(
+        model_results=model_results,
+        ground_truth_folder=ground_truth_folder,
+        ground_truth_files=ground_truth_files,
+    )
+
+    visualization = evaluate_models(
+        model_results=model_results,
+        ground_truth_df=ground_truth_df,
+    )
+
+    if save_visualization_data:
+        save_visualization(
+            visualization,
+            output_dir=visualization_output_dir,
+        )
+
+
+def run_ocr_evaluation_pipeline(config: Dict[str, Any]) -> None:
+    """Run the OCR evaluation pipeline from a configuration dictionary.
+
+    Args:
+        config: Dictionary containing configuration
+
+    Raises:
+        ValueError: If fewer than two models are selected, or if
+            `models_registry` is missing or empty in the config
+    """
+    # A section whose key is present but has no value (for instance an empty key
+    # in a YAML file) is None rather than missing, so `.get(key, {})` would still
+    # hand back None. Normalise every section to a dict before reading from it.
+    pipeline_params = config.get("parameters") or {}
+    pipeline_steps = config.get("steps") or {}
+
+    mode = pipeline_params.get("mode", "evaluation")
+    if mode != "evaluation":
+        logger.warning(f"Expected mode 'evaluation', but got '{mode}'. Proceeding anyway.")
+
+    selected_models = pipeline_params.get("selected_models") or []
+    if len(selected_models) < 2:
+        raise ValueError("At least two models are required for evaluation")
+
+    model_registry = config.get("models_registry") or []
+    if not model_registry:
+        raise ValueError("models_registry is required in the config")
+
+    # Get model names from registry by using the passed models (may be shorthands or full names)
+    shorthand_to_name = {
+        m.get("shorthand"): m.get("name") for m in model_registry if "shorthand" in m
+    }
+
+    model_names = []
+    for model_id in selected_models:
+        if model_id in shorthand_to_name:
+            model_names.append(shorthand_to_name[model_id])
+            continue
+        # Not a shorthand: treat it as a full name, warning if the registry does not list it.
+        if not any(m.get("name") == model_id for m in model_registry):
+            logger.warning(f"Model '{model_id}' not found in registry, using as-is")
+        model_names.append(model_id)
+    # Each selected model yields exactly one resolved name, so the
+    # "at least two models" check above also covers `model_names`.
+
+    # Set up pipeline options
+    pipeline_instance = ocr_evaluation_pipeline.with_options(
+        enable_cache=config.get("enable_cache", False),
+        enable_artifact_metadata=config.get("enable_artifact_metadata", True),
+        enable_artifact_visualization=config.get("enable_artifact_visualization", True),
+    )
+
+    evaluate_models_params = (
+        (pipeline_steps.get("evaluate_models") or {}).get("parameters") or {}
+    )
+    save_visualization_params = (
+        (pipeline_steps.get("save_visualization") or {}).get("parameters") or {}
+    )
+
+    pipeline_instance(
+        model_names=model_names,
+        results_dir=evaluate_models_params.get("results_dir", "ocr_results"),
+        result_files=evaluate_models_params.get("result_files"),
+        ground_truth_folder=evaluate_models_params.get("ground_truth_folder"),
+        ground_truth_files=evaluate_models_params.get("ground_truth_files", []),
+        save_visualization_data=save_visualization_params.get("save_locally", False),
+        visualization_output_dir=save_visualization_params.get("output_dir", "visualizations"),
+    )
diff --git a/omni-reader/pipelines/ocr_pipeline.py b/omni-reader/pipelines/ocr_pipeline.py