import os
import time
from typing import Any, Dict, List, Optional

import instructor
from dotenv import load_dotenv
from litellm import completion
from mistralai import Mistral
from PIL import Image

from utils.encode_image import encode_image
from utils.prompt import ImageDescription, get_prompt

load_dotenv()

2419
def run_ocr_from_ui(
    image: str | Image.Image,
    model: str = "gemma3",
    custom_prompt: Optional[str] = None,
) -> Dict[str, Any]:
    """Extract text from an image using the selected OCR model.

    This function is designed for use in the streamlit app.

    Args:
        image: Path to image or PIL image.
        model: Name of the model to use. Gemma models are served through
            litellm/Ollama (e.g. "ollama/gemma3:27b"); Mistral/Pixtral models
            go through the Mistral API (e.g. "pixtral-12b-2409").
        custom_prompt: Optional custom prompt; falls back to ``get_prompt()``.

    Returns:
        Dict with keys "raw_text", "processing_time" and "model"; on failure
        the dict additionally carries an "error" message.

    Raises:
        ValueError: If the model name matches neither a Gemma nor a
            Mistral/Pixtral model.
    """
    start_time = time.time()
    content_type, image_base64 = encode_image(image)

    # Route to the right structured-output client for the requested model.
    if "gemma" in model.lower():
        # BUG FIX: instructor exposes no `from_ollama` factory, and
        # `completion` is litellm's entry point — Ollama-hosted Gemma is
        # reached via litellm, exactly as the previous
        # run_gemma3_ocr_direct implementation did.
        client = instructor.from_litellm(completion)
        # NOTE(review): the default model "gemma3" is not a valid litellm
        # model id — callers should pass e.g. "ollama/gemma3:27b"; confirm.
    elif "mistral" in model.lower() or "pixtral" in model.lower():
        mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
        client = instructor.from_mistral(mistral_client)
    else:
        raise ValueError(f"Unsupported model: {model}")

    prompt = custom_prompt if custom_prompt else get_prompt()

    try:
        response = client.chat.completions.create(
            model=model,
            response_model=ImageDescription,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": f"data:{content_type};base64,{image_base64}",
                        },
                    ],
                }
            ],
        )

        processing_time = time.time() - start_time

        result = {
            "raw_text": response.raw_text if response.raw_text else "No text found",
            "processing_time": processing_time,
            "model": model,
        }

        return result
    except Exception as e:
        # Best-effort: surface the failure in the result dict so the UI can
        # render it instead of crashing the app.
        error_message = f"An unexpected error occurred: {str(e)}"
        return {
            "raw_text": "Error: Failed to extract text",
            "error": error_message,
            "processing_time": time.time() - start_time,
            "model": model,
        }

def compare_models(
    image_paths: List[str],
    custom_prompt: Optional[str] = None,
) -> Dict[str, Any]:
    """Compare Gemma3 and Mistral OCR capabilities on a list of images.

    Args:
        image_paths: List of paths to images.
        custom_prompt: Optional custom prompt to use for both models.

    Returns:
        Dictionary with "gemma_results" and "mistral_results" lists, one
        entry per image.
    """
    # NOTE(review): the results init / loop header were elided in the diff
    # view this was reviewed from; reconstructed from surrounding context.
    results: Dict[str, Any] = {
        "gemma_results": [],
        "mistral_results": [],
    }

    for i, image_path in enumerate(image_paths):
        image_name = os.path.basename(image_path)
        print(f"Processing image {i + 1}/{len(image_paths)}: {image_name}")

        # Run both models.
        # BUG FIX: run_ocr_from_ui takes a `model` keyword, not `model_name`;
        # the previous calls raised TypeError.
        gemma_result = run_ocr_from_ui(
            image=image_path,
            model="ollama/gemma3:27b",
            custom_prompt=custom_prompt,
        )
        mistral_result = run_ocr_from_ui(
            image=image_path,
            model="pixtral-12b-2409",
            custom_prompt=custom_prompt,
        )

        # Store per-model results keyed by image index.
        gemma_entry = {
            "id": i,
            "image_name": image_name,
            "gemma_text": gemma_result["raw_text"],
            "gemma_processing_time": gemma_result.get("processing_time", 0),
        }

        mistral_entry = {
            "id": i,
            "image_name": image_name,
            "mistral_text": mistral_result["raw_text"],
            "mistral_processing_time": mistral_result.get("processing_time", 0),
        }

        results["gemma_results"].append(gemma_entry)
        results["mistral_results"].append(mistral_entry)

    return results
252140
253141
@@ -266,22 +154,19 @@ def compare_models(
266154
267155 if args .model .lower () == "both" :
268156 start_time = time .time ()
269- gemma_result = run_ocr (args .image , "gemma3" , args .prompt )
270- mistral_result = run_ocr (args .image , "mistral " , args .prompt )
157+ gemma_result = run_ocr_from_ui (args .image , "ollama/ gemma3:27b " , args .prompt )
158+ mistral_result = run_ocr_from_ui (args .image , "pixtral-12b-2409 " , args .prompt )
271159 print ("\n Gemma3 results:" )
272160 print (f"Text: { gemma_result ['raw_text' ]} " )
273- print (f"Entities: { gemma_result .get ('entities' , [])} " )
274161 print (f"Processing time: { gemma_result .get ('processing_time' , 0 ):.2f} s" )
275162
276163 print ("\n Mistral results:" )
277164 print (f"Text: { mistral_result ['raw_text' ]} " )
278- print (f"Entities: { mistral_result .get ('entities' , [])} " )
279165 print (f"Processing time: { mistral_result .get ('processing_time' , 0 ):.2f} s" )
280166
281167 print (f"\n Total time: { time .time () - start_time :.2f} s" )
282168 else :
283- result = run_ocr (args .image , args .model , args .prompt )
169+ result = run_ocr_from_ui (args .image , args .model , args .prompt )
284170 print (f"\n { args .model } results:" )
285171 print (f"Text: { result ['raw_text' ]} " )
286- print (f"Entities: { result .get ('entities' , [])} " )
287172 print (f"Processing time: { result .get ('processing_time' , 0 ):.2f} s" )
0 commit comments