
Commit c4086c2

mypy fixes and reasoning fixes
1 parent 9223611 commit c4086c2

9 files changed: +64 -26 lines changed


app/backend/approaches/approach.py

Lines changed: 2 additions & 0 deletions

@@ -150,7 +150,9 @@ class Approach(ABC):
     # List of GPT reasoning models support
     GPT_REASONING_MODELS = {
         "o1": GPTReasoningModelSupport(streaming=False),
+        "o3": GPTReasoningModelSupport(streaming=True),
         "o3-mini": GPTReasoningModelSupport(streaming=True),
+        "o4-mini": GPTReasoningModelSupport(streaming=True),
     }
     # Set a higher token limit for GPT reasoning models
     RESPONSE_DEFAULT_TOKEN_LIMIT = 1024
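
For orientation only, here is a minimal sketch (not the repository's code) of how a registry like `GPT_REASONING_MODELS` can be consulted to decide whether streaming is allowed for a given model; the `supports_streaming` helper and the exact-match lookup are assumptions.

```python
from dataclasses import dataclass


@dataclass
class GPTReasoningModelSupport:
    streaming: bool


# Mirrors the registry in the diff above; the lookup helper below is hypothetical
GPT_REASONING_MODELS = {
    "o1": GPTReasoningModelSupport(streaming=False),
    "o3": GPTReasoningModelSupport(streaming=True),
    "o3-mini": GPTReasoningModelSupport(streaming=True),
    "o4-mini": GPTReasoningModelSupport(streaming=True),
}


def supports_streaming(model: str) -> bool:
    # Unknown (non-reasoning) models are treated as non-streaming here for illustration
    support = GPT_REASONING_MODELS.get(model)
    return support.streaming if support is not None else False


print(supports_streaming("o4-mini"))  # True
print(supports_streaming("o1"))       # False
```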

app/backend/prepdocslib/mediadescriber.py

Lines changed: 2 additions & 1 deletion

@@ -1,6 +1,7 @@
 import base64
 import logging
 from abc import ABC
+from typing import Optional

 import aiohttp
 from azure.core.credentials_async import AsyncTokenCredential
@@ -109,7 +110,7 @@ async def describe_image(self, image_bytes: bytes) -> str:


 class MultimodalModelDescriber(MediaDescriber):
-    def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: str):
+    def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: Optional[str] = None):
         self.openai_client = openai_client
         self.model = model
         self.deployment = deployment
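
As a usage illustration only (not part of this commit), here is a sketch of how the now-optional `deployment` might be consumed: Azure OpenAI callers pass the deployment name where the `model` argument goes, while openai.com callers have no deployment and fall back to the model name. The request body below is a simplified assumption, not the file's actual `describe_image` implementation.

```python
import base64
from typing import Optional

from openai import AsyncOpenAI


class MultimodalModelDescriberSketch:
    def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: Optional[str] = None):
        self.openai_client = openai_client
        self.model = model
        self.deployment = deployment

    async def describe_image(self, image_bytes: bytes) -> str:
        # Azure OpenAI expects the deployment name in the `model` field;
        # when no deployment is configured, use the plain model name instead.
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
        response = await self.openai_client.chat.completions.create(
            model=self.deployment or self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Describe this image in detail."},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                    ],
                }
            ],
        )
        return response.choices[0].message.content or ""
```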

app/backend/prepdocslib/page.py

Lines changed: 4 additions & 3 deletions

@@ -1,16 +1,17 @@
 from dataclasses import dataclass, field
+from typing import Optional


 @dataclass
 class ImageOnPage:
     bytes: bytes
-    bbox: list[float, float, float, float]  # Pixels
+    bbox: tuple[float, float, float, float]  # Pixels
     filename: str
     description: str
     figure_id: str
     page_num: int  # 0-indexed
-    url: str | None = None
-    embedding: list[float] | None = None
+    url: Optional[str] = None
+    embedding: Optional[list[float]] = None


 @dataclass

app/backend/prepdocslib/pdfparser.py

Lines changed: 15 additions & 10 deletions

@@ -1,6 +1,7 @@
 import html
 import io
 import logging
+import uuid
 from collections.abc import AsyncGenerator
 from enum import Enum
 from typing import IO, Optional, Union
@@ -221,23 +222,27 @@ class ObjectType(Enum):
         offset += len(page_text)

     @staticmethod
-    async def process_figure(doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber) -> str:
+    async def process_figure(
+        doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber
+    ) -> ImageOnPage:
         figure_title = (figure.caption and figure.caption.content) or ""
-        figure_filename = f"figure{figure.id.replace('.', '_')}.png"
+        # Generate a random UUID if figure.id is None
+        figure_id = figure.id or f"fig_{uuid.uuid4().hex[:8]}"
+        figure_filename = f"figure{figure_id.replace('.', '_')}.png"
         logger.info(
-            "Describing figure %s with title '%s' using %s", figure.id, figure_title, type(media_describer).__name__
+            "Describing figure %s with title '%s' using %s", figure_id, figure_title, type(media_describer).__name__
         )
         if not figure.bounding_regions:
             return ImageOnPage(
                 bytes=b"",
                 page_num=0,  # 0-indexed
-                figure_id=figure.id,
-                bbox=[0, 0, 0, 0],
+                figure_id=figure_id,
+                bbox=(0, 0, 0, 0),
                 filename=figure_filename,
                 description=f"<figure><figcaption>{figure_title}</figcaption></figure>",
             )
         if len(figure.bounding_regions) > 1:
-            logger.warning("Figure %s has more than one bounding region, using the first one", figure.id)
+            logger.warning("Figure %s has more than one bounding region, using the first one", figure_id)
         first_region = figure.bounding_regions[0]
         # To learn more about bounding regions, see https://aka.ms/bounding-region
         bounding_box = (
@@ -252,7 +257,7 @@ async def process_figure(doc: pymupdf.Document, figure: DocumentFigure, media_de
         return ImageOnPage(
             bytes=cropped_img,
             page_num=page_number - 1,  # Convert to 0-indexed
-            figure_id=figure.id,
+            figure_id=figure_id,
             bbox=bbox_pixels,
             filename=figure_filename,
             description=f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>",
@@ -282,18 +287,18 @@ def table_to_html(table: DocumentTable):
     @staticmethod
     def crop_image_from_pdf_page(
         doc: pymupdf.Document, page_number: int, bbox_inches: tuple[float, float, float, float]
-    ) -> tuple[bytes, list[float]]:
+    ) -> tuple[bytes, tuple[float, float, float, float]]:
         """
         Crops a region from a given page in a PDF and returns it as an image.

         :param pdf_path: Path to the PDF file.
         :param page_number: The page number to crop from (0-indexed).
         :param bbox_inches: A tuple of (x0, y0, x1, y1) coordinates for the bounding box, in inches.
-        :return: A PIL Image of the cropped area.
+        :return: A tuple of (image_bytes, bbox_pixels).
         """
         # Scale the bounding box to 72 DPI
         bbox_dpi = 72
-        bbox_pixels = [x * bbox_dpi for x in bbox_inches]
+        bbox_pixels = tuple(x * bbox_dpi for x in bbox_inches)  # Convert to tuple
         rect = pymupdf.Rect(bbox_pixels)
         # Assume that the PDF has 300 DPI,
         # and use the matrix to convert between the 2 DPIs
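
The comments in this hunk describe converting the bounding box from inches to 72-DPI points and then rendering at an assumed 300 DPI via a transform matrix. The following is a minimal sketch of that conversion with PyMuPDF, assuming `page.get_pixmap` with a clip rectangle is how the crop is produced; it is not the file's exact implementation.

```python
import pymupdf  # PyMuPDF


def crop_image_from_pdf_page_sketch(
    doc: pymupdf.Document, page_number: int, bbox_inches: tuple[float, float, float, float]
) -> tuple[bytes, tuple[float, ...]]:
    # PDF user space is 72 points per inch, so inches * 72 gives page coordinates
    bbox_dpi = 72
    bbox_pixels = tuple(x * bbox_dpi for x in bbox_inches)
    rect = pymupdf.Rect(bbox_pixels)

    # Render the clipped region at an assumed 300 DPI by scaling 300/72 in both axes
    page_dpi = 300
    zoom = page_dpi / bbox_dpi
    page = doc[page_number]
    pix = page.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom), clip=rect)
    return pix.tobytes("png"), bbox_pixels
```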

docs/deploy_features.md

Lines changed: 1 addition & 3 deletions

@@ -135,14 +135,12 @@ This process does *not* delete your previous model deployment. If you want to de

 ## Using reasoning models

-⚠️ This feature is not currently compatible with [multimodal feature](./multimodal.md). TODO: OR IS IT?
-
 This feature allows you to use reasoning models to generate responses based on retrieved content. These models spend more time processing and understanding the user's request.
 To enable reasoning models, follow the steps in [the reasoning models guide](./reasoning.md).

 ## Using agentic retrieval

-⚠️ This feature is not currently compatible with [multimodal feature](./multimodal.md). TODO: OR IS IT?
+⚠️ This feature is not fully compatible with [multimodal feature](./multimodal.md).

 This feature allows you to use agentic retrieval in place of the Search API. To enable agentic retrieval, follow the steps in [the agentic retrieval guide](./agentic_retrieval.md)

docs/multimodal.md

Lines changed: 7 additions & 0 deletions

@@ -104,3 +104,10 @@ For more details on how this feature works, read [this blog post](https://techco

 You can also modify those settings in the "Developer Settings" in the chat UI,
 to experiment with different options before committing to them.
+
+## Compatibility
+
+* This feature is not fully compatible with the [agentic retrieval](./agentic_retrieval.md) feature.
+The agent *will* perform the multimodal vector embedding search, but it will not return images in the response,
+so we cannot send the images to the chat completion model.
+* This feature is compatible with the [reasoning models](./reasoning.md) feature, as long as you use a model that [supports image inputs](https://learn.microsoft.com/azure/ai-services/openai/how-to/reasoning?tabs=python-secure%2Cpy#api--feature-support).

docs/reasoning.md

Lines changed: 16 additions & 6 deletions

@@ -19,17 +19,27 @@ This repository includes an optional feature that uses reasoning models to gener

 Set the environment variables for your Azure OpenAI GPT deployments to your reasoning model

-For o3-mini:
+For o4-mini:

 ```shell
-azd env set AZURE_OPENAI_CHATGPT_MODEL o3-mini
-azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT o3-mini
-azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION 2025-01-31
+azd env set AZURE_OPENAI_CHATGPT_MODEL o4-mini
+azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT o4-mini
+azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION 2025-04-16
 azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT_SKU GlobalStandard
-azd env set AZURE_OPENAI_API_VERSION 2024-12-01-preview
+azd env set AZURE_OPENAI_API_VERSION 2025-04-01-preview
+```
+
+For o3:
+
+```shell
+azd env set AZURE_OPENAI_CHATGPT_MODEL o3
+azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT o3
+azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION 2025-04-16
+azd env set AZURE_OPENAI_CHATGPT_DEPLOYMENT_SKU GlobalStandard
+azd env set AZURE_OPENAI_API_VERSION 2025-04-01-preview
 ```

-For o1:
+For o1: (No streaming support)

 ```shell
 azd env set AZURE_OPENAI_CHATGPT_MODEL o1

infra/main.bicep

Lines changed: 13 additions & 3 deletions

@@ -703,14 +703,14 @@ module computerVision 'br/public:avm/res/cognitive-services/account:0.7.2' = if
   params: {
     name: !empty(computerVisionServiceName)
       ? computerVisionServiceName
-      : '${abbrs.cognitiveServicesComputerVision}${resourceToken}2'
-    kind: 'AIServices'
+      : '${abbrs.cognitiveServicesComputerVision}${resourceToken}cs'
+    kind: 'CognitiveServices'
     networkAcls: {
       defaultAction: 'Allow'
     }
     customSubDomainName: !empty(computerVisionServiceName)
       ? computerVisionServiceName
-      : '${abbrs.cognitiveServicesComputerVision}${resourceToken}'
+      : '${abbrs.cognitiveServicesComputerVision}${resourceToken}cs'
     location: computerVisionResourceGroupLocation
     tags: tags
     sku: 'S0'
@@ -1065,6 +1065,16 @@ module openAiRoleSearchService 'core/security/role.bicep' = if (isAzureOpenAiHos
   }
 }

+module computerVisionRoleSearchService 'core/security/role.bicep' = if (useMultimodal) {
+  scope: computerVisionResourceGroup
+  name: 'computervision-role-searchservice'
+  params: {
+    principalId: searchService.outputs.principalId
+    roleDefinitionId: 'a97b65f3-24c7-4388-baec-2e87135dc908'
+    principalType: 'ServicePrincipal'
+  }
+}
+
 module storageRoleBackend 'core/security/role.bicep' = {
   scope: storageResourceGroup
   name: 'storage-role-backend'

todo.txt

Lines changed: 4 additions & 0 deletions

@@ -9,6 +9,10 @@ TODO:
 * Test with integrated vectorization
 * Update all TODOs in the code/docs

+
 Decide:
 * In conftest, should I make a new env for vision? Currently I mashed it into the existing env, but it might be cleaner to have a separate one, as now I have to pass llm_inputs explicitly in the tests to turn off image responses.
 * LLMInputType and VectorFields have inconsistently named values
+
+Later:
+Agentic: Incompatible since it doesnt retrieve images. We would need to do a follow-up search query to get each document, like filter: id eq 'x' or id eq 'y' or....
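
For the "Later" note above, a follow-up lookup by document id could look roughly like this with the azure-search-documents async client; the `id` field name comes from the note itself, while the function shape and client setup are assumptions.

```python
from azure.search.documents.aio import SearchClient


async def fetch_documents_by_id(search_client: SearchClient, doc_ids: list[str]) -> list[dict]:
    # Build an OData filter like: id eq 'x' or id eq 'y' or ...
    filter_expr = " or ".join(f"id eq '{doc_id}'" for doc_id in doc_ids)
    results = await search_client.search(search_text="*", filter=filter_expr)
    return [doc async for doc in results]
```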
