Skip to content

Commit 70c6f45

Browse files
author
marwan37
committed
update ocr pipeline
1 parent cb22d7d commit 70c6f45

File tree

1 file changed

+34
-50
lines changed

1 file changed

+34
-50
lines changed

omni-reader/pipelines/ocr_pipeline.py

Lines changed: 34 additions & 50 deletions
Original file line numberDiff line numberDiff line change
def ocr_comparison_pipeline(
    image_paths: Optional[List[str]] = None,
    image_folder: Optional[str] = None,
    custom_prompt: Optional[str] = None,
    model1: str = "ollama/gemma3:27b",
    model2: str = "pixtral-12b-2409",
    ground_truth_model: str = "gpt-4o-mini",
    ground_truth_source: Literal["openai", "manual", "file", "none"] = "none",
    ground_truth_file: Optional[str] = None,
    save_ground_truth_data: bool = False,
    ground_truth_output_dir: str = "ocr_results",
    save_ocr_results_data: bool = False,
    ocr_results_output_dir: str = "ocr_results",
    save_visualization_data: bool = False,
    visualization_output_dir: str = "visualizations",
) -> None:
    """Run OCR comparison pipeline between two configurable models.

    Args:
        image_paths: Optional list of specific image paths to process
        image_folder: Optional folder to search for images
        custom_prompt: Optional custom prompt to use for both models
        model1: Name of the first model to use (default: ollama/gemma3:27b)
        model2: Name of the second model to use (default: pixtral-12b-2409)
        ground_truth_model: Name of the model to use for ground truth when source is "openai"
        ground_truth_source: Source of ground truth - "openai" to use the configured
            ground-truth model, "file" to load from a saved JSON file, or "none" to skip
            ground truth evaluation. NOTE: "manual" is still accepted by the type for
            backward compatibility, but its handler (user-supplied texts) was removed;
            passing "manual" currently behaves the same as "none".
        ground_truth_file: Path to ground truth JSON file (used when ground_truth_source="file")
        save_ground_truth_data: Whether to save generated ground truth data for future use
        ground_truth_output_dir: Directory to save ground truth data
        save_ocr_results_data: Whether to save OCR results from both models
        ocr_results_output_dir: Directory to save OCR results
        save_visualization_data: Whether to save HTML visualization to local file
        visualization_output_dir: Directory to save HTML visualization

    Returns:
        None
    """
    images = load_images(image_paths=image_paths, image_folder=image_folder)
    # Track every model that actually ran, so saved artifacts can be labeled.
    model_names = []

    model1_results = run_ocr(images=images, model_name=model1, custom_prompt=custom_prompt)
    model_names.append(model1)

    model2_results = run_ocr(images=images, model_name=model2, custom_prompt=custom_prompt)
    model_names.append(model2)

    # Handle ground truth based on the selected source.
    ground_truth = None
    openai_results = None

    if ground_truth_source == "openai":
        # Use a third OCR pass with the dedicated ground-truth model as reference.
        openai_results = run_ocr(
            images=images, model_name=ground_truth_model, custom_prompt=custom_prompt
        )
        ground_truth = openai_results
        model_names.append(ground_truth_model)
    elif ground_truth_source == "file" and ground_truth_file:
        ground_truth = load_ground_truth_file(filepath=ground_truth_file)
    # "manual" and "none" (and "file" without a path) fall through: ground_truth stays None.

    # Evaluate models; ground_truth_df=None means evaluation runs without a reference.
    visualization = evaluate_models(
        model1_df=model1_results,
        model2_df=model2_results,
        ground_truth_df=ground_truth,
        model1_name=model1,
        model2_name=model2,
    )

    # Save OCR results if requested (either flag triggers the combined save step).
    if save_ocr_results_data or save_ground_truth_data:
        save_ocr_results(
            model1_results=model1_results,
            model2_results=model2_results,
            ground_truth_results=ground_truth,
            model_names=model_names,
            output_dir=ocr_results_output_dir,
            ground_truth_output_dir=ground_truth_output_dir,
        )

    # Save HTML visualization if requested.
    if save_visualization_data:
        save_visualization(visualization, output_dir=visualization_output_dir)
141122

142123

143124
def run_ocr_pipeline(config: Dict[str, Any]) -> None:
@@ -152,15 +133,18 @@ def run_ocr_pipeline(config: Dict[str, Any]) -> None:
152133
ocr_comparison_pipeline(
153134
image_paths=config["input"].get("image_paths"),
154135
image_folder=config["input"].get("image_folder"),
155-
image_patterns=config["input"].get("image_patterns"),
156136
custom_prompt=config["models"].get("custom_prompt"),
157-
ground_truth_texts=config["ground_truth"].get("texts"),
137+
model1=config["models"].get("model1", "ollama/gemma3:27b"),
138+
model2=config["models"].get("model2", "pixtral-12b-2409"),
139+
ground_truth_model=config["models"].get("ground_truth_model", "gpt-4o-mini"),
158140
ground_truth_source=config["ground_truth"].get("source", "none"),
159141
ground_truth_file=config["ground_truth"].get("file"),
160142
save_ground_truth_data=config["output"]["ground_truth"].get("save", False),
161143
ground_truth_output_dir=config["output"]["ground_truth"].get("directory", "ocr_results"),
162144
save_ocr_results_data=config["output"]["ocr_results"].get("save", False),
163145
ocr_results_output_dir=config["output"]["ocr_results"].get("directory", "ocr_results"),
164146
save_visualization_data=config["output"]["visualization"].get("save", False),
165-
visualization_output_dir=config["output"]["visualization"].get("directory", "visualizations"),
147+
visualization_output_dir=config["output"]["visualization"].get(
148+
"directory", "visualizations"
149+
),
166150
)

0 commit comments

Comments
 (0)