@@ -235,8 +235,8 @@ def create_summary_visualization(
                 <div class="text-right font-medium">{avg_metrics["avg_models_similarity"]:.4f}</div>
                 <div class="text-gray-600">Time Diff:</div>
                 <div class="text-right font-medium">{time_comparison["time_difference"]:.2f}s</div>
-                <div class="text-gray-600">Faster Model:</div>
-                <div class="text-right font-medium">{time_comparison["faster_model"]}</div>
+                <div class="text-gray-600">Fastest Model:</div>
+                <div class="text-right font-medium">{time_comparison["fastest_model"]}</div>
                 <div class="text-gray-600">Better CER:</div>
                 <div class="text-right font-medium">
                     {model1_display if avg_metrics[f"avg_{model1_prefix}_cer"] < avg_metrics[f"avg_{model2_prefix}_cer"] else model2_display}
@@ -267,8 +267,8 @@ def create_summary_visualization(
                     <div class="text-xl font-bold">{time_comparison["time_difference"]:.2f}s</div>
                 </div>
                 <div>
-                    <div class="text-gray-600 mb-1">Faster Model</div>
-                    <div class="text-xl font-bold">{time_comparison["faster_model"]}</div>
+                    <div class="text-gray-600 mb-1">Fastest Model</div>
+                    <div class="text-xl font-bold">{time_comparison["fastest_model"]}</div>
                 </div>
             </div>
         </div>
@@ -325,34 +325,62 @@ def create_summary_visualization(
 
 @step()
 def evaluate_models(
-    model1_df: pl.DataFrame,
-    model2_df: pl.DataFrame,
+    model_results: Dict[str, pl.DataFrame],
     ground_truth_df: Optional[pl.DataFrame] = None,
-    model1_name: str = "ollama/gemma3:27b",
-    model2_name: str = "pixtral-12b-2409",
+    primary_models: Optional[List[str]] = None,
 ) -> Annotated[HTMLString, "ocr_visualization"]:
-    """Compare the performance of two configurable models with visualization.
+    """Compare the performance of multiple configurable models with visualization.
 
     Args:
-        model1_df: First model results DataFrame
-        model2_df: Second model results DataFrame
+        model_results: Dictionary mapping model names to their results DataFrames
         ground_truth_df: Optional ground truth results DataFrame
-        model1_name: Name of the first model (default: ollama/gemma3:27b)
-        model2_name: Name of the second model (default: pixtral-12b-2409)
-        model1_display: Display name for the first model (default: Gemma)
-        model2_display: Display name for the second model (default: Mistral)
+        primary_models: Optional list of model names to highlight in the comparison.
+            If None or containing fewer than two names, the first two models in model_results are used.
 
     Returns:
         HTMLString visualization of the results
     """
+    # Ensure we have at least two models for comparison
+    if len(model_results) < 2:
+        raise ValueError("At least two models are required for comparison")
+
+    # If primary_models is not specified or too short, use the first two models
+    if not primary_models or len(primary_models) < 2:
+        primary_models = list(model_results.keys())[:2]
+
+    # Extract the primary models for the main comparison
+    model1_name = primary_models[0]
+    model2_name = primary_models[1]
+
+    model1_df = model_results[model1_name]
+    model2_df = model_results[model2_name]
+
     model1_display, model1_prefix = get_model_info(model1_name)
     model2_display, model2_prefix = get_model_info(model2_name)
 
     # Join results
-    results = model1_df.join(model2_df, on=["id", "image_name"], how="inner")
+    results = model1_df.join(model2_df, on=["id", "image_name"], how="inner", suffix="_right")
     evaluation_metrics = []
     processed_results = []
 
+    # Calculate average processing times for all models
+    all_model_times = {}
+    for model_name, df in model_results.items():
+        display, prefix = get_model_info(model_name)
+        time_key = f"avg_{prefix}_time"
+        all_model_times[time_key] = df.select("processing_time").to_series().mean()
+        all_model_times[f"{prefix}_display"] = display
+
+    # Find the fastest model: min() over (time, key) tuples picks the lowest average time
+    fastest_model_time = min(
+        [(time, model) for model, time in all_model_times.items() if not model.endswith("_display")]
+    )
+    fastest_model_key = fastest_model_time[1]
+    fastest_model_prefix = fastest_model_key.replace("avg_", "").replace("_time", "")
+    fastest_model_display = all_model_times.get(
+        f"{fastest_model_prefix}_display", fastest_model_prefix
+    )
+
     if ground_truth_df is not None:
         results = results.join(
             ground_truth_df,
@@ -412,38 +440,49 @@ def evaluate_models(
             ].mean(),
         }
 
-        model1_times = model1_df.select("processing_time").to_series().mean()
-        model2_times = model2_df.select("processing_time").to_series().mean()
         model1_time_key = f"avg_{model1_prefix}_time"
         model2_time_key = f"avg_{model2_prefix}_time"
+
+        # Combine processing times with other metrics
         time_comparison = {
-            model1_time_key: model1_times,
-            model2_time_key: model2_times,
-            "time_difference": abs(model1_times - model2_times),
-            "faster_model": model1_display if model1_times < model2_times else model2_display,
+            **all_model_times,
+            "time_difference": abs(
+                all_model_times[model1_time_key] - all_model_times[model2_time_key]
+            ),
+            "fastest_model": fastest_model_display,
         }
 
-        # Log metadata for ZenML dashboard
-        log_metadata(
-            metadata={
+        # Prepare metadata for ZenML dashboard
+        metadata_dict = {
+            **{
+                key: float(time)
+                for key, time in all_model_times.items()
+                if not key.endswith("_display")
+            },
+            "fastest_model": fastest_model_display,
+            "model_count": len(model_results),
+            "avg_models_similarity": float(avg_metrics["avg_models_similarity"]),
+        }
+
+        # Add accuracy metrics for primary models
+        metadata_dict.update(
+            {
                 f"avg_{model1_prefix}_cer": float(avg_metrics[f"avg_{model1_prefix}_cer"]),
                 f"avg_{model1_prefix}_wer": float(avg_metrics[f"avg_{model1_prefix}_wer"]),
                 f"avg_{model2_prefix}_cer": float(avg_metrics[f"avg_{model2_prefix}_cer"]),
                 f"avg_{model2_prefix}_wer": float(avg_metrics[f"avg_{model2_prefix}_wer"]),
-                "avg_models_similarity": float(avg_metrics["avg_models_similarity"]),
                 f"avg_{model1_prefix}_gt_similarity": float(
                     avg_metrics[f"avg_{model1_prefix}_gt_similarity"]
                 ),
                 f"avg_{model2_prefix}_gt_similarity": float(
                     avg_metrics[f"avg_{model2_prefix}_gt_similarity"]
                 ),
-                model1_time_key: float(time_comparison[model1_time_key]),
-                model2_time_key: float(time_comparison[model2_time_key]),
-                "time_difference": float(time_comparison["time_difference"]),
-                "faster_model": time_comparison["faster_model"],
             }
         )
 
+        # Log metadata for ZenML dashboard
+        log_metadata(metadata=metadata_dict)
+
         html_visualization = create_summary_visualization(
             avg_metrics=avg_metrics,
             time_comparison=time_comparison,
@@ -456,30 +495,33 @@ def evaluate_models(
         return html_visualization
 
     # FALLBACK: if no ground truth metrics, only use processing times.
-    model1_times = model1_df.select("processing_time").to_series().mean()
-    model2_times = model2_df.select("processing_time").to_series().mean()
-    model1_time_key = f"avg_{model1_prefix}_time"
-    model2_time_key = f"avg_{model2_prefix}_time"
     time_comparison = {
-        model1_time_key: model1_times,
-        model2_time_key: model2_times,
-        "time_difference": abs(model1_times - model2_times),
-        "faster_model": model1_display if model1_times < model2_times else model2_display,
+        **all_model_times,
+        "time_difference": abs(
+            all_model_times[f"avg_{model1_prefix}_time"]
+            - all_model_times[f"avg_{model2_prefix}_time"]
+        ),
+        "fastest_model": fastest_model_display,
    }
+
     html_visualization = create_summary_visualization(
         avg_metrics={},
         time_comparison=time_comparison,
         model1_name=model1_name,
         model2_name=model2_name,
     )
 
-    log_metadata(
-        metadata={
-            model1_time_key: float(time_comparison[model1_time_key]),
-            model2_time_key: float(time_comparison[model2_time_key]),
-            "time_difference": float(time_comparison["time_difference"]),
-            "faster_model": time_comparison["faster_model"],
-        }
-    )
+    # Prepare metadata for ZenML dashboard
+    metadata_dict = {
+        **{
+            key: float(time)
+            for key, time in all_model_times.items()
+            if not key.endswith("_display")
+        },
+        "fastest_model": fastest_model_display,
+        "model_count": len(model_results),
+    }
+
+    log_metadata(metadata=metadata_dict)
 
     return html_visualization
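
For context, a minimal sketch of how a pipeline might feed the refactored step after this change. The run_ocr_models step, its column layout, and the third model name are illustrative assumptions, not part of this commit; the real project presumably has its own OCR steps producing the per-model DataFrames.

from typing import Dict, List

import polars as pl
from zenml import pipeline, step


@step()
def run_ocr_models(model_names: List[str]) -> Dict[str, pl.DataFrame]:
    # Hypothetical stand-in for the project's OCR steps: each DataFrame only
    # needs the columns evaluate_models relies on here (id, image_name,
    # processing_time), plus whatever text columns the accuracy metrics use.
    return {
        name: pl.DataFrame(
            {
                "id": [1, 2],
                "image_name": ["page_1.png", "page_2.png"],
                "processing_time": [1.2 + i, 1.4 + i],
            }
        )
        for i, name in enumerate(model_names)
    }


@pipeline
def ocr_comparison_pipeline():
    # Any number of models now flows through a single Dict[str, pl.DataFrame]
    # artifact; primary_models selects the two that drive the detailed CER/WER
    # comparison, while every entry contributes to the processing-time and
    # fastest-model metadata.
    model_results = run_ocr_models(
        model_names=["ollama/gemma3:27b", "pixtral-12b-2409", "tesseract"]
    )
    evaluate_models(
        model_results=model_results,
        primary_models=["ollama/gemma3:27b", "pixtral-12b-2409"],
    )

Passing the dict as a single artifact keeps the step signature stable as models are added or removed; only the upstream step and the primary_models argument need to change.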