refactor: revert to using litellm+instructor for ollama models

marwan37 · marwan37 · commit b65894b33f5e · 2025-03-31T13:34:48.000-05:00
diff --git a/omni-reader/app.py b/omni-reader/app.py
@@ -176,13 +176,8 @@ def has_no_text(result):
                     col1, col2 = st.columns(2)
 
                     start = time.time()
-                    # gemma_result = run_ocr_from_ui(
-                    #     image=image, model="ollama/gemma3:27b", custom_prompt=prompt_param
-                    # )
-                    gemma_result = run_ollama_ocr_from_ui(
-                        image,
-                        model="gemma3:27b",
-                        custom_prompt=prompt_param,
+                    gemma_result = run_ocr_from_ui(
+                        image=image, model="ollama/gemma3:27b", custom_prompt=prompt_param
                     )
                     gemma_time = time.time() - start
 
@@ -266,14 +261,9 @@ def has_no_text(result):
             with st.spinner(f"Processing image with {model_choice}..."):
                 try:
                     start = time.time()
-                    if "gemma" in model_param.lower():
-                        response = run_ollama_ocr_from_ui(
-                            image, model="gemma3:27b", custom_prompt=prompt_param
-                        )
-                    else:
-                        response = run_ocr_from_ui(
-                            image=image, model=model_param, custom_prompt=prompt_param
-                        )
+                    response = run_ocr_from_ui(
+                        image=image, model=model_param, custom_prompt=prompt_param
+                    )
                     proc_time = time.time() - start
                     st.session_state["ocr_result"] = response
 
diff --git a/omni-reader/configs/ocr_config.yaml b/omni-reader/configs/ocr_config.yaml
@@ -3,7 +3,7 @@
 # Image input configuration
 input:
   image_paths: [] # List of specific image paths to process
-  image_folder: "assets/handwriting" # Folder containing images to process
+  image_folder: "assets/street_signs" # Folder containing images to process
 
 models:
   custom_prompt: null # Optional custom prompt to use for both models
diff --git a/omni-reader/run_compare_ocr.py b/omni-reader/run_compare_ocr.py
@@ -85,7 +85,7 @@ def run_ocr_from_ui(
 
 def run_ollama_ocr_from_ui(
     image: str | Image.Image,
-    model: str = "gemma3:27b",
+    model: str = "ollama/gemma3:27b",
     custom_prompt: Optional[str] = None,
 ) -> Dict[str, Any]:
     """Run OCR using Ollama.
diff --git a/omni-reader/utils/ocr_model_utils.py b/omni-reader/utils/ocr_model_utils.py
@@ -23,7 +23,6 @@
 import time
 from typing import Dict, List, Optional
 
-import ollama
 import polars as pl
 from dotenv import load_dotenv
 from zenml import log_metadata
@@ -157,100 +156,6 @@ def log_summary_metadata(
     )
 
 
-def process_with_ollama(
-    model_name: str,
-    image_path: str,
-    prompt: str,
-    model_config: ModelConfig,
-) -> Dict:
-    """Process an image with Ollama.
-
-    Args:
-        model_name: Name of the Ollama model
-        image_path: Path to the image file
-        prompt: Prompt text
-        model_config: Model configuration
-
-    Returns:
-        Dict with OCR results
-    """
-    _, image_base64 = encode_image(image_path)
-
-    ollama_params = {
-        "model": model_name,
-        "messages": [
-            {
-                "role": "user",
-                "content": prompt,
-                "images": [image_base64],
-                "format": ImageDescription.model_json_schema(),
-            }
-        ],
-    }
-
-    if model_config.additional_params:
-        ollama_params.update(model_config.additional_params)
-
-    try:
-        response = ollama.chat(**ollama_params)
-        result = try_extract_json_from_response(response.message.content)
-        return result
-    except Exception as e:
-        error_msg = f"Error with Ollama OCR: {str(e)}"
-        logger.error(error_msg)
-        return {
-            "raw_text": f"Error: {error_msg}",
-            "confidence": 0.0,
-        }
-
-
-def process_with_client(
-    client,
-    model_name: str,
-    image_path: str,
-    prompt: str,
-    model_config: ModelConfig,
-) -> ImageDescription | Dict:
-    """Process images with an API client (OpenAI, Mistral, etc.).
-
-    Args:
-        client: API client
-        model_name: Name of the model
-        image_path: Path to the image file
-        prompt: Prompt text
-        model_config: Model configuration
-
-    Returns:
-        API response processed into ImageDescription or Dict
-    """
-    content_type, image_base64 = encode_image(image_path)
-
-    params = {
-        "model": model_name,
-        "response_model": ImageDescription,
-        **({"max_tokens": model_config.max_tokens} if model_config.max_tokens else {}),
-        **(model_config.additional_params or {}),
-    }
-
-    return client.chat.completions.create(
-        **params,
-        messages=[
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": prompt},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:{content_type};base64,{image_base64}",
-                        },
-                    },
-                ],
-            }
-        ],
-    )
-
-
 def process_images_with_model(
     model_config: ModelConfig,
     images: List[str],
@@ -277,18 +182,38 @@ def process_images_with_model(
     processing_times = []
     confidence_scores = []
 
-    if "ollama" not in model_name:
-        client = model_config.client_factory()
+    client = model_config.client_factory()
 
     for i, image_path in enumerate(images):
         start_time = time.time()
         image_name = os.path.basename(image_path)
 
+        content_type, image_base64 = encode_image(image_path)
+        params = {
+            "model": model_name,
+            "response_model": ImageDescription,
+            **({"max_tokens": model_config.max_tokens} if model_config.max_tokens else {}),
+            **(model_config.additional_params or {}),
+        }
+
         try:
-            if "gemma" in model_name:
-                response = process_with_ollama(model_name, image_path, prompt, model_config)
-            else:
-                response = process_with_client(client, model_name, image_path, prompt, model_config)
+            response = client.chat.completions.create(
+                **params,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": prompt},
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:{content_type};base64,{image_base64}",
+                                },
+                            },
+                        ],
+                    }
+                ],
+            )
 
             processing_time = time.time() - start_time
             processing_times.append(processing_time)