
Commit e0a8843

Rename vision variables, fix mypy
1 parent c4086c2 commit e0a8843

18 files changed: 83 additions & 81 deletions

.azdo/pipelines/azure-dev.yml

Lines changed: 4 additions & 4 deletions
```diff
@@ -89,10 +89,10 @@ steps:
         USE_MULTIMODAL: $(USE_MULTIMODAL)
         AZURE_VISION_ENDPOINT: $(AZURE_VISION_ENDPOINT)
         VISION_SECRET_NAME: $(VISION_SECRET_NAME)
-        AZURE_COMPUTER_VISION_SERVICE: $(AZURE_COMPUTER_VISION_SERVICE)
-        AZURE_COMPUTER_VISION_RESOURCE_GROUP: $(AZURE_COMPUTER_VISION_RESOURCE_GROUP)
-        AZURE_COMPUTER_VISION_LOCATION: $(AZURE_COMPUTER_VISION_LOCATION)
-        AZURE_COMPUTER_VISION_SKU: $(AZURE_COMPUTER_VISION_SKU)
+        AZURE_VISION_SERVICE: $(AZURE_VISION_SERVICE)
+        AZURE_VISION_RESOURCE_GROUP: $(AZURE_VISION_RESOURCE_GROUP)
+        AZURE_VISION_LOCATION: $(AZURE_VISION_LOCATION)
+        AZURE_VISION_SKU: $(AZURE_VISION_SKU)
         ENABLE_LANGUAGE_PICKER: $(ENABLE_LANGUAGE_PICKER)
         USE_SPEECH_INPUT_BROWSER: $(USE_SPEECH_INPUT_BROWSER)
         USE_SPEECH_OUTPUT_BROWSER: $(USE_SPEECH_OUTPUT_BROWSER)
```

.github/workflows/azure-dev.yml

Lines changed: 4 additions & 4 deletions
```diff
@@ -37,10 +37,10 @@ jobs:
       AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP }}
       AZURE_DOCUMENTINTELLIGENCE_SKU: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_SKU }}
       AZURE_DOCUMENTINTELLIGENCE_LOCATION: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_LOCATION }}
-      AZURE_COMPUTER_VISION_SERVICE: ${{ vars.AZURE_COMPUTER_VISION_SERVICE }}
-      AZURE_COMPUTER_VISION_RESOURCE_GROUP: ${{ vars.AZURE_COMPUTER_VISION_RESOURCE_GROUP }}
-      AZURE_COMPUTER_VISION_LOCATION: ${{ vars.AZURE_COMPUTER_VISION_LOCATION }}
-      AZURE_COMPUTER_VISION_SKU: ${{ vars.AZURE_COMPUTER_VISION_SKU }}
+      AZURE_VISION_SERVICE: ${{ vars.AZURE_VISION_SERVICE }}
+      AZURE_VISION_RESOURCE_GROUP: ${{ vars.AZURE_VISION_RESOURCE_GROUP }}
+      AZURE_VISION_LOCATION: ${{ vars.AZURE_VISION_LOCATION }}
+      AZURE_VISION_SKU: ${{ vars.AZURE_VISION_SKU }}
       AZURE_SEARCH_INDEX: ${{ vars.AZURE_SEARCH_INDEX }}
       AZURE_SEARCH_SERVICE: ${{ vars.AZURE_SEARCH_SERVICE }}
       AZURE_SEARCH_SERVICE_RESOURCE_GROUP: ${{ vars.AZURE_SEARCH_SERVICE_RESOURCE_GROUP }}
```

.github/workflows/evaluate.yaml

Lines changed: 4 additions & 4 deletions
```diff
@@ -35,10 +35,10 @@ jobs:
       AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP }}
       AZURE_DOCUMENTINTELLIGENCE_SKU: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_SKU }}
       AZURE_DOCUMENTINTELLIGENCE_LOCATION: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_LOCATION }}
-      AZURE_COMPUTER_VISION_SERVICE: ${{ vars.AZURE_COMPUTER_VISION_SERVICE }}
-      AZURE_COMPUTER_VISION_RESOURCE_GROUP: ${{ vars.AZURE_COMPUTER_VISION_RESOURCE_GROUP }}
-      AZURE_COMPUTER_VISION_LOCATION: ${{ vars.AZURE_COMPUTER_VISION_LOCATION }}
-      AZURE_COMPUTER_VISION_SKU: ${{ vars.AZURE_COMPUTER_VISION_SKU }}
+      AZURE_VISION_SERVICE: ${{ vars.AZURE_VISION_SERVICE }}
+      AZURE_VISION_RESOURCE_GROUP: ${{ vars.AZURE_VISION_RESOURCE_GROUP }}
+      AZURE_VISION_LOCATION: ${{ vars.AZURE_VISION_LOCATION }}
+      AZURE_VISION_SKU: ${{ vars.AZURE_VISION_SKU }}
       AZURE_SEARCH_INDEX: ${{ vars.AZURE_SEARCH_INDEX }}
       AZURE_SEARCH_SERVICE: ${{ vars.AZURE_SEARCH_SERVICE }}
       AZURE_SEARCH_SERVICE_RESOURCE_GROUP: ${{ vars.AZURE_SEARCH_SERVICE_RESOURCE_GROUP }}
```

app/backend/approaches/approach.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -1,6 +1,6 @@
 from abc import ABC
 from collections.abc import AsyncGenerator, Awaitable
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, Callable, Optional, TypedDict, Union, cast
 from urllib.parse import urljoin
@@ -116,7 +116,7 @@ class DataPoints:
 @dataclass
 class ExtraInfo:
     data_points: DataPoints
-    thoughts: Optional[list[ThoughtStep]] = None
+    thoughts: list[ThoughtStep] = field(default_factory=list)
     followup_questions: Optional[list[Any]] = None
 
 
@@ -395,6 +395,8 @@ def nonewlines(s: str) -> str:
             text_sources.append(f"{citation}: {nonewlines(doc.content or '')}")
 
             if use_image_sources and hasattr(doc, "images") and doc.images:
+                if self.images_blob_container_client is None:
+                    raise ValueError("The images blob container client must be set to use image sources.")
                 for img in doc.images:
                     # Skip if we've already processed this URL
                     if img["url"] in seen_urls:
@@ -440,11 +442,15 @@ class ExtraArgs(TypedDict, total=False):
         return VectorizedQuery(vector=query_vector, k_nearest_neighbors=50, fields=self.embedding_field)
 
     async def compute_image_embedding(self, q: str):
+        if not self.vision_endpoint:
+            raise ValueError("Azure AI Vision endpoint must be set to compute image embedding.")
         endpoint = urljoin(self.vision_endpoint, "computervision/retrieval:vectorizeText")
         headers = {"Content-Type": "application/json"}
         params = {"api-version": "2024-02-01", "model-version": "2023-04-15"}
         data = {"text": q}
 
+        if not self.vision_token_provider:
+            raise ValueError("Azure AI Vision token provider must be set to compute image embedding.")
         headers["Authorization"] = "Bearer " + await self.vision_token_provider()
 
         async with aiohttp.ClientSession() as session:
```
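A note on the `thoughts` change above: moving from `Optional[list[ThoughtStep]] = None` to `field(default_factory=list)` matters because dataclasses reject a bare mutable default like `[]`, and a factory gives each instance its own list while sparing callers the None check. A minimal sketch of the difference (class name illustrative, not from this repo):

```python
from dataclasses import dataclass, field

@dataclass
class Example:
    # items: list[str] = []        # ValueError: mutable default not allowed
    items: list[str] = field(default_factory=list)  # fresh list per instance

a, b = Example(), Example()
a.items.append("x")
print(b.items)  # [] -- instances don't share state, and callers never see None
```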

app/backend/prepdocs.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -331,7 +331,7 @@ def setup_image_embeddings_service(
     image_embeddings_service: Optional[ImageEmbeddings] = None
     if use_multimodal:
         if vision_endpoint is None:
-            raise ValueError("A computer vision endpoint is required when GPT-4-vision is enabled.")
+            raise ValueError("An Azure AI Vision endpoint must be provided to use multimodal features.")
         image_embeddings_service = ImageEmbeddings(
             endpoint=vision_endpoint,
             token_provider=get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default"),
```
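The raise-on-`None` guard used here and in approach.py is presumably the "fix mypy" half of the commit title: after an explicit check, mypy narrows the variable from `Optional[str]` to `str`, so later calls type-check. A minimal sketch of the pattern under that assumption (function name and message are illustrative):

```python
from typing import Optional

def build_vectorize_url(vision_endpoint: Optional[str]) -> str:
    if vision_endpoint is None:
        raise ValueError("An Azure AI Vision endpoint must be provided to use multimodal features.")
    # mypy narrows vision_endpoint to str from here on
    return vision_endpoint.rstrip("/") + "/computervision/retrieval:vectorizeText"
```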

app/backend/prepdocslib/mediadescriber.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -19,7 +19,7 @@ async def describe_image(self, image_bytes) -> str:
         raise NotImplementedError  # pragma: no cover
 
 
-class ContentUnderstandingDescriber:
+class ContentUnderstandingDescriber(MediaDescriber):
     CU_API_VERSION = "2024-12-01-preview"
 
     analyzer_schema = {
```
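Adding `MediaDescriber` as the base class is more than cosmetic: a type checker only accepts `ContentUnderstandingDescriber` where a `MediaDescriber` is expected if it actually inherits from the ABC. A minimal sketch of the pattern (the stub body is illustrative):

```python
from abc import ABC

class MediaDescriber(ABC):
    async def describe_image(self, image_bytes: bytes) -> str:
        raise NotImplementedError  # pragma: no cover

class ContentUnderstandingDescriber(MediaDescriber):
    async def describe_image(self, image_bytes: bytes) -> str:
        return "description from Content Understanding"  # placeholder

async def describe(describer: MediaDescriber, data: bytes) -> str:
    # Accepts ContentUnderstandingDescriber because it subclasses MediaDescriber
    return await describer.describe_image(data)
```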

app/backend/prepdocslib/searchmanager.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -150,6 +150,8 @@ async def create_index(self):
         )
 
         if self.search_images:
+            if not self.search_info.azure_vision_endpoint:
+                raise ValueError("Azure AI Vision endpoint must be provided to use image embeddings")
             image_vector_algorithm = HnswAlgorithmConfiguration(
                 name="images_hnsw_config",
                 parameters=HnswParameters(metric="cosine"),
@@ -366,7 +368,11 @@ async def create_index(self):
                 existing_index.vector_search.compressions.append(text_vector_compression)
             await search_index_client.create_or_update_index(existing_index)
 
-            if images_field and not any(field.name == "images" for field in existing_index.fields):
+            if (
+                images_field
+                and images_field.fields
+                and not any(field.name == "images" for field in existing_index.fields)
+            ):
                 logger.info("Adding %s field for image embeddings", images_field.name)
                 images_field.fields[0].stored = True
                 existing_index.fields.append(images_field)
```
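For context on the `HnswAlgorithmConfiguration` being built above: in azure-search-documents, an HNSW algorithm configuration is typically paired with a `VectorSearchProfile` that vector fields reference by name. A minimal sketch of that wiring, assuming a profile name that does not appear in this diff:

```python
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearch,
    VectorSearchProfile,
)

# Cosine-metric HNSW graph, matching the configuration in the diff above
image_vector_algorithm = HnswAlgorithmConfiguration(
    name="images_hnsw_config",
    parameters=HnswParameters(metric="cosine"),
)

# A profile ties vector fields to the algorithm by name (profile name assumed)
vector_search = VectorSearch(
    algorithms=[image_vector_algorithm],
    profiles=[
        VectorSearchProfile(
            name="images_hnsw_profile",
            algorithm_configuration_name="images_hnsw_config",
        )
    ],
)
```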

azure.yaml

Lines changed: 4 additions & 4 deletions
```diff
@@ -93,10 +93,10 @@ pipeline:
     - USE_MULTIMODAL
     - AZURE_VISION_ENDPOINT
     - VISION_SECRET_NAME
-    - AZURE_COMPUTER_VISION_SERVICE
-    - AZURE_COMPUTER_VISION_RESOURCE_GROUP
-    - AZURE_COMPUTER_VISION_LOCATION
-    - AZURE_COMPUTER_VISION_SKU
+    - AZURE_VISION_SERVICE
+    - AZURE_VISION_RESOURCE_GROUP
+    - AZURE_VISION_LOCATION
+    - AZURE_VISION_SKU
     - ENABLE_LANGUAGE_PICKER
     - USE_SPEECH_INPUT_BROWSER
    - USE_SPEECH_OUTPUT_BROWSER
```

docs/customization.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -53,7 +53,7 @@ TODO FIX THIS!
 If you followed the instructions in [the multimodal guide](multimodal.md) to enable the vision approach and the "Use GPT vision model" option is selected, then the chat tab will use the `chatreadretrievereadvision.py` approach instead. This approach is similar to the `chatreadretrieveread.py` approach, with a few differences:
 
 1. Step 1 is the same as before, except it uses the GPT-4 Vision model instead of the default GPT-3.5 model.
-2. For this step, it also calculates a vector embedding for the user question using [the Computer Vision vectorize text API](https://learn.microsoft.com/azure/ai-services/computer-vision/how-to/image-retrieval#call-the-vectorize-text-api), and passes that to the Azure AI Search to compare against the `imageEmbeddings` fields in the indexed documents. For each matching document, it downloads the image blob and converts it to a base 64 encoding.
+2. For this step, it also calculates a vector embedding for the user question using [the Azure AI Vision vectorize text API](https://learn.microsoft.com/azure/ai-services/computer-vision/how-to/image-retrieval#call-the-vectorize-text-api), and passes that to the Azure AI Search to compare against the `imageEmbeddings` fields in the indexed documents. For each matching document, it downloads the image blob and converts it to a base 64 encoding.
 3. When it combines the search results and user question, it includes the base 64 encoded images, and sends along both the text and images to the GPT4 Vision model (similar to this [documentation example](https://platform.openai.com/docs/guides/vision/quick-start)). The model generates a response that includes citations to the images, and the UI renders the base64 encoded images when a citation is clicked.
 
 The prompt for step 2 is currently tailored to the sample data since it starts with "You are an intelligent assistant helping analyze the Annual Financial Report of Contoso Ltd.". Modify the [chat_answer_question_vision.prompty](https://github.com/Azure-Samples/azure-search-openai-demo/blob/main/app/backend/approaches/prompts/chat_answer_question_vision.prompty) prompt to match your data.
@@ -72,7 +72,7 @@ The prompt for step 2 is currently tailored to the sample data since it starts w
 TODO FIX THIS!
 If you followed the instructions in [the multimodal guide](multimodal.md) to enable the vision approach and the "Use GPT vision model" option is selected, then the ask tab will use the `retrievethenreadvision.py` approach instead. This approach is similar to the `retrievethenread.py` approach, with a few differences:
 
-1. For this step, it also calculates a vector embedding for the user question using [the Computer Vision vectorize text API](https://learn.microsoft.com/azure/ai-services/computer-vision/how-to/image-retrieval#call-the-vectorize-text-api), and passes that to the Azure AI Search to compare against the `imageEmbeddings` fields in the indexed documents. For each matching document, it downloads the image blob and converts it to a base 64 encoding.
+1. For this step, it also calculates a vector embedding for the user question using [the Azure AI Vision vectorize text API](https://learn.microsoft.com/azure/ai-services/computer-vision/how-to/image-retrieval#call-the-vectorize-text-api), and passes that to the Azure AI Search to compare against the `imageEmbeddings` fields in the indexed documents. For each matching document, it downloads the image blob and converts it to a base 64 encoding.
 2. When it combines the search results and user question, it includes the base 64 encoded images, and sends along both the text and images to the GPT4 Vision model (similar to this [documentation example](https://platform.openai.com/docs/guides/vision/quick-start)). The model generates a response that includes citations to the images, and the UI renders the base64 encoded images when a citation is clicked.
 
 The prompt for step 2 is currently tailored to the sample data since it starts with "You are an intelligent assistant helping analyze the Annual Financial Report of Contoso Ltd". Modify the [ask_answer_question_vision.prompty](https://github.com/Azure-Samples/azure-search-openai-demo/blob/main/app/backend/approaches/prompts/ask_answer_question_vision.prompty) prompt to match your data.
```
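For readers tracing the vectorize-text step described in this doc, it is the same call `compute_image_embedding` makes in the approach.py diff above: a POST to `computervision/retrieval:vectorizeText` whose returned vector is handed to Azure AI Search. A minimal sketch, assuming the caller already holds a bearer token and that the response carries the embedding in a `vector` field:

```python
import aiohttp
from urllib.parse import urljoin

async def vectorize_text(vision_endpoint: str, token: str, text: str) -> list[float]:
    # Endpoint path and API versions taken from the approach.py diff in this commit
    url = urljoin(vision_endpoint, "computervision/retrieval:vectorizeText")
    params = {"api-version": "2024-02-01", "model-version": "2023-04-15"}
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}
    async with aiohttp.ClientSession() as session:
        async with session.post(url, params=params, headers=headers, json={"text": text}) as resp:
            resp.raise_for_status()
            body = await resp.json()
            return body["vector"]  # assumed response field for the embedding
```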

docs/deploy_existing.md

Lines changed: 6 additions & 6 deletions
```diff
@@ -9,7 +9,7 @@ You should set these values before running `azd up`. Once you've set them, retur
 * [Azure AI Search resource](#azure-ai-search-resource)
 * [Azure App Service Plan and App Service resources](#azure-app-service-plan-and-app-service-resources)
 * [Azure Application Insights and related resources](#azure-application-insights-and-related-resources)
-* [Azure Computer Vision resources](#azure-computer-vision-resources)
+* [Azure AI Vision resources](#azure-ai-vision-resources)
 * [Azure Document Intelligence resource](#azure-document-intelligence-resource)
 * [Azure Speech resource](#azure-speech-resource)
 * [Other Azure resources](#other-azure-resources)
@@ -78,12 +78,12 @@ You can also customize the search service (new or existing) for non-English sear
 1. Run `azd env set AZURE_APPLICATION_INSIGHTS_DASHBOARD {Name of existing Azure App Insights Dashboard}`.
 1. Run `azd env set AZURE_LOG_ANALYTICS {Name of existing Azure Log Analytics Workspace Name}`.
 
-## Azure Computer Vision resources
+## Azure AI Vision resources
 
-1. Run `azd env set AZURE_COMPUTER_VISION_SERVICE {Name of existing Azure Computer Vision Service Name}`
-1. Run `azd env set AZURE_COMPUTER_VISION_RESOURCE_GROUP {Name of existing Azure Computer Vision Resource Group Name}`
-1. Run `azd env set AZURE_COMPUTER_VISION_LOCATION {Name of existing Azure Computer Vision Location}`
-1. Run `azd env set AZURE_COMPUTER_VISION_SKU {SKU of Azure Computer Vision service, defaults to F0}`
+1. Run `azd env set AZURE_VISION_SERVICE {Name of existing Azure AI Vision Service Name}`
+1. Run `azd env set AZURE_VISION_RESOURCE_GROUP {Name of existing Azure AI Vision Resource Group Name}`
+1. Run `azd env set AZURE_VISION_LOCATION {Name of existing Azure AI Vision Location}`
+1. Run `azd env set AZURE_VISION_SKU {SKU of Azure AI Vision service, defaults to F0}`
 
 ## Azure Document Intelligence resource
 
```