Skip to content

Commit 806828e

Browse files
committed
Integrated vectorization and user upload work
1 parent 29c44c8 commit 806828e

File tree

10 files changed

+153
-41
lines changed

10 files changed

+153
-41
lines changed

app/backend/app.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@
9898
clean_key_if_exists,
9999
setup_embeddings_service,
100100
setup_file_processors,
101+
setup_image_embeddings_service,
101102
setup_search_info,
102103
)
103104
from prepdocslib.filestrategy import UploadUserFileStrategy
@@ -596,11 +597,18 @@ async def setup_clients():
596597
openai_org=OPENAI_ORGANIZATION,
597598
disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false",
598599
)
600+
image_embeddings_service = setup_image_embeddings_service(
601+
azure_credential=azure_credential,
602+
vision_endpoint=AZURE_VISION_ENDPOINT,
603+
use_multimodal=USE_MULTIMODAL,
604+
)
599605
ingester = UploadUserFileStrategy(
600606
search_info=search_info,
601-
embeddings=text_embeddings_service,
602607
file_processors=file_processors,
608+
embeddings=text_embeddings_service,
609+
image_embeddings=image_embeddings_service,
603610
search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
611+
blob_manager=user_blob_container_client,
604612
)
605613
current_app.config[CONFIG_INGESTER] = ingester
606614

app/backend/prepdocs.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -393,11 +393,6 @@ async def main(strategy: Strategy, setup_index: bool = True):
393393
required=False,
394394
help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)",
395395
)
396-
parser.add_argument(
397-
"--searchserviceassignedid",
398-
required=False,
399-
help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)",
400-
)
401396

402397
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
403398
args = parser.parse_args()
@@ -526,10 +521,15 @@ async def main(strategy: Strategy, setup_index: bool = True):
526521
embeddings=openai_embeddings_service,
527522
search_field_name_embedding=os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"],
528523
subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
529-
search_service_user_assigned_id=args.searchserviceassignedid,
530524
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
531525
use_acls=use_acls,
532526
category=args.category,
527+
use_multimodal=use_multimodal,
528+
image_embeddings=setup_image_embeddings_service(
529+
azure_credential=azd_credential,
530+
vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"),
531+
use_multimodal=use_multimodal,
532+
),
533533
)
534534
else:
535535
file_processors = setup_file_processors(

app/backend/prepdocslib/blobmanager.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ async def upload_blob(self, file: File) -> Optional[list[str]]:
4848
if file.url is None:
4949
with open(file.content.name, "rb") as reopened_file:
5050
blob_name = BlobManager.blob_name_from_file_name(file.content.name)
51-
logger.info("Uploading blob for document %s", blob_name)
51+
logger.info("Uploading blob for document '%s'", blob_name)
5252
blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
5353
file.url = blob_client.url
5454
return None
@@ -108,7 +108,7 @@ async def upload_document_image(
108108
blob_name = (
109109
f"{self.blob_name_from_file_name(document_file.content.name)}/page{image_page_num}/{image_filename}"
110110
)
111-
logger.info("Uploading blob for document image %s", blob_name)
111+
logger.info("Uploading blob for document image '%s'", blob_name)
112112
blob_client = await container_client.upload_blob(blob_name, output, overwrite=True)
113113
return blob_client.url
114114
return None

app/backend/prepdocslib/filestrategy.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,11 +145,13 @@ def __init__(
145145
embeddings: Optional[OpenAIEmbeddings] = None,
146146
image_embeddings: Optional[ImageEmbeddings] = None,
147147
search_field_name_embedding: Optional[str] = None,
148+
blob_manager: Optional[BlobManager] = None,
148149
):
149150
self.file_processors = file_processors
150151
self.embeddings = embeddings
151152
self.image_embeddings = image_embeddings
152153
self.search_info = search_info
154+
self.blob_manager = blob_manager
153155
self.search_manager = SearchManager(
154156
search_info=self.search_info,
155157
search_analyzer_name=None,
@@ -164,7 +166,7 @@ def __init__(
164166
async def add_file(self, file: File):
165167
if self.image_embeddings:
166168
logging.warning("Image embeddings are not currently supported for the user upload feature")
167-
sections = await parse_file(file, self.file_processors)
169+
sections = await parse_file(file, self.file_processors, None, self.blob_manager, self.image_embeddings)
168170
if sections:
169171
await self.search_manager.update_content(sections, url=file.url)
170172

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 71 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@
55
NativeBlobSoftDeleteDeletionDetectionPolicy,
66
)
77
from azure.search.documents.indexes.models import (
8+
AIServicesAccountIdentity,
89
AzureOpenAIEmbeddingSkill,
10+
BlobIndexerImageAction,
11+
IndexingParameters,
12+
IndexingParametersConfiguration,
913
IndexProjectionMode,
1014
InputFieldMappingEntry,
1115
OutputFieldMappingEntry,
@@ -16,12 +20,17 @@
1620
SearchIndexerIndexProjection,
1721
SearchIndexerIndexProjectionSelector,
1822
SearchIndexerIndexProjectionsParameters,
23+
SearchIndexerKnowledgeStore,
24+
SearchIndexerKnowledgeStoreFileProjectionSelector,
25+
SearchIndexerKnowledgeStoreProjection,
1926
SearchIndexerSkillset,
27+
ShaperSkill,
2028
SplitSkill,
29+
VisionVectorizeSkill,
2130
)
2231

2332
from .blobmanager import BlobManager
24-
from .embeddings import AzureOpenAIEmbeddingService
33+
from .embeddings import AzureOpenAIEmbeddingService, ImageEmbeddings
2534
from .listfilestrategy import ListFileStrategy
2635
from .searchmanager import SearchManager
2736
from .strategy import DocumentAction, SearchInfo, Strategy
@@ -42,20 +51,20 @@ def __init__(
4251
embeddings: AzureOpenAIEmbeddingService,
4352
search_field_name_embedding: str,
4453
subscription_id: str,
45-
search_service_user_assigned_id: str,
4654
document_action: DocumentAction = DocumentAction.Add,
4755
search_analyzer_name: Optional[str] = None,
4856
use_acls: bool = False,
4957
category: Optional[str] = None,
58+
use_multimodal: bool = False,
59+
image_embeddings: Optional[ImageEmbeddings] = None,
5060
):
51-
5261
self.list_file_strategy = list_file_strategy
5362
self.blob_manager = blob_manager
5463
self.document_action = document_action
5564
self.embeddings = embeddings
65+
self.image_embeddings = image_embeddings
5666
self.search_field_name_embedding = search_field_name_embedding
5767
self.subscription_id = subscription_id
58-
self.search_user_assigned_identity = search_service_user_assigned_id
5968
self.search_analyzer_name = search_analyzer_name
6069
self.use_acls = use_acls
6170
self.category = category
@@ -64,6 +73,7 @@ def __init__(
6473
self.skillset_name = f"{prefix}-skillset"
6574
self.indexer_name = f"{prefix}-indexer"
6675
self.data_source_name = f"{prefix}-blob"
76+
self.use_multimodal = use_multimodal and image_embeddings is not None
6777

6878
async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset:
6979
"""
@@ -97,6 +107,23 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
97107
outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
98108
)
99109

110+
vision_embedding_skill = VisionVectorizeSkill(
111+
name="vision-embedding-skill",
112+
description="Skill to generate image embeddings via Azure AI Vision",
113+
context="/document/normalized_images/*",
114+
model_version="2023-04-15",
115+
inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
116+
outputs=[OutputFieldMappingEntry(name="vector", target_name="image_vector")],
117+
)
118+
vision_embedding_shaper_skill = ShaperSkill(
119+
name="vision-embedding-shaper-skill",
120+
description="Shaper skill to ensure image embeddings are in the correct format",
121+
context="/document/normalized_images/*",
122+
inputs=[InputFieldMappingEntry(name="embedding", source="/document/normalized_images/*/image_vector")],
123+
outputs=[OutputFieldMappingEntry(name="output", target_name="images")],
124+
)
125+
# TODO: project images into a container
126+
100127
index_projection = SearchIndexerIndexProjection(
101128
selectors=[
102129
SearchIndexerIndexProjectionSelector(
@@ -111,6 +138,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
111138
InputFieldMappingEntry(
112139
name=self.search_field_name_embedding, source="/document/pages/*/vector"
113140
),
141+
InputFieldMappingEntry(name="images", source="/document/normalized_images/*/images"),
114142
],
115143
),
116144
],
@@ -119,11 +147,36 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
119147
),
120148
)
121149

150+
indexer_skills = [split_skill, embedding_skill]
151+
if self.use_multimodal:
152+
indexer_skills.extend([vision_embedding_skill, vision_embedding_shaper_skill])
153+
extra_params = {}
154+
if self.use_multimodal:
155+
extra_params = {
156+
"cognitive_services_account": AIServicesAccountIdentity(subdomain_url=self.image_embeddings.endpoint),
157+
"knowledge_store": SearchIndexerKnowledgeStore(
158+
storage_connection_string=self.blob_manager.get_managedidentity_connectionstring(),
159+
projections=[
160+
SearchIndexerKnowledgeStoreProjection(
161+
files=[
162+
SearchIndexerKnowledgeStoreFileProjectionSelector(
163+
storage_container=self.blob_manager.image_container,
164+
source="/document/normalized_images/*",
165+
)
166+
]
167+
)
168+
],
169+
),
170+
}
171+
172+
# We still need to map the images onto url in the images complex field type
173+
# something about key path
122174
skillset = SearchIndexerSkillset(
123175
name=self.skillset_name,
124176
description="Skillset to chunk documents and generate embeddings",
125-
skills=[split_skill, embedding_skill],
177+
skills=indexer_skills,
126178
index_projection=index_projection,
179+
**extra_params,
127180
)
128181

129182
return skillset
@@ -137,7 +190,7 @@ async def setup(self):
137190
use_int_vectorization=True,
138191
embeddings=self.embeddings,
139192
field_name_embedding=self.search_field_name_embedding,
140-
search_images=False,
193+
search_images=self.use_multimodal,
141194
)
142195

143196
await search_manager.create_index()
@@ -175,12 +228,24 @@ async def run(self):
175228
await self.blob_manager.remove_blob()
176229

177230
# Create an indexer
231+
extra_params = {}
232+
if self.use_multimodal:
233+
extra_params = {
234+
"parameters": IndexingParameters(
235+
configuration=IndexingParametersConfiguration(
236+
query_timeout=None, # Current bug in AI Search SDK
237+
image_action=BlobIndexerImageAction.GENERATE_NORMALIZED_IMAGES,
238+
),
239+
)
240+
}
241+
178242
indexer = SearchIndexer(
179243
name=self.indexer_name,
180244
description="Indexer to index documents and generate embeddings",
181245
skillset_name=self.skillset_name,
182246
target_index_name=self.search_info.index_name,
183247
data_source_name=self.data_source_name,
248+
**extra_params,
184249
)
185250

186251
indexer_client = self.search_info.create_search_indexer_client()

app/backend/prepdocslib/listfilestrategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def check_md5(self, path: str) -> bool:
102102
stored_hash = md5_f.read()
103103

104104
if stored_hash and stored_hash.strip() == existing_hash.strip():
105-
logger.info("Skipping %s, no changes detected.", path)
105+
logger.info("Skipping '%s', no changes detected.", path)
106106
return True
107107

108108
# Write the hash

app/backend/prepdocslib/mediadescriber.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,16 @@
66
import aiohttp
77
from azure.core.credentials_async import AsyncTokenCredential
88
from azure.identity.aio import get_bearer_token_provider
9-
from openai import AsyncOpenAI
9+
from openai import AsyncOpenAI, RateLimitError
1010
from rich.progress import Progress
11-
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
11+
from tenacity import (
12+
AsyncRetrying,
13+
retry,
14+
retry_if_exception_type,
15+
stop_after_attempt,
16+
wait_fixed,
17+
wait_random_exponential,
18+
)
1219

1320
logger = logging.getLogger("scripts")
1421

@@ -116,29 +123,39 @@ def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: Optional[
116123
self.deployment = deployment
117124

118125
async def describe_image(self, image_bytes: bytes) -> str:
126+
def before_retry_sleep(retry_state):
127+
logger.info("Rate limited on the OpenAI chat completions API, sleeping before retrying...")
128+
119129
image_base64 = base64.b64encode(image_bytes).decode("utf-8")
120130
image_datauri = f"data:image/png;base64,{image_base64}"
121131

122-
response = await self.openai_client.chat.completions.create(
123-
model=self.model if self.deployment is None else self.deployment,
124-
max_tokens=500,
125-
messages=[
126-
{
127-
"role": "system",
128-
"content": "You are a helpful assistant that describes images from organizational documents.",
129-
},
130-
{
131-
"role": "user",
132-
"content": [
132+
async for attempt in AsyncRetrying(
133+
retry=retry_if_exception_type(RateLimitError),
134+
wait=wait_random_exponential(min=15, max=60),
135+
stop=stop_after_attempt(15),
136+
before_sleep=before_retry_sleep,
137+
):
138+
with attempt:
139+
response = await self.openai_client.chat.completions.create(
140+
model=self.model if self.deployment is None else self.deployment,
141+
max_tokens=500,
142+
messages=[
133143
{
134-
"text": "Describe image with no more than 5 sentences. Do not speculate about anything you don't know.",
135-
"type": "text",
144+
"role": "system",
145+
"content": "You are a helpful assistant that describes images from organizational documents.",
146+
},
147+
{
148+
"role": "user",
149+
"content": [
150+
{
151+
"text": "Describe image with no more than 5 sentences. Do not speculate about anything you don't know.",
152+
"type": "text",
153+
},
154+
{"image_url": {"url": image_datauri, "detail": "auto"}, "type": "image_url"},
155+
],
136156
},
137-
{"image_url": {"url": image_datauri, "detail": "auto"}, "type": "image_url"},
138157
],
139-
},
140-
],
141-
)
158+
)
142159
description = ""
143160
if response.choices and response.choices[0].message.content:
144161
description = response.choices[0].message.content.strip()

docs/multimodal.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,13 @@ For more details on how this feature works, read [this blog post](https://techco
2727

2828
### Prerequisites
2929

30-
* Create a [AI Vision account in Azure Portal first](https://ms.portal.azure.com/#create/Microsoft.CognitiveServicesComputerVision), so that you can agree to the Responsible AI terms for that resource. You can delete that account after agreeing. (TODO: Is this still needed?)
31-
* The use of a chat completion model that supports multimodal inputs. The default model for the repository is currently `gpt-4.1-mini`, which does support multimodal inputs.
30+
* The use of a chat completion model that supports multimodal inputs. The default model for the repository is currently `gpt-4.1-mini`, which does support multimodal inputs. The `gpt-4o-mini` model technically supports multimodal inputs, but due to how image tokens are calculated, you need a much higher deployment capacity to use it effectively. Please try `gpt-4.1-mini` first, and experiment with other models later.
3231

3332
### Deployment
3433

3534
1. **Enable multimodal capabilities:**
3635

37-
First, make sure you do *not* have integrated vectorization enabled, since that is currently incompatible:
36+
First, make sure you do *not* have integrated vectorization enabled, since that is currently incompatible: (TODO!)
3837

3938
```shell
4039
azd env set USE_FEATURE_INT_VECTORIZATION false

infra/main.bicep

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1109,6 +1109,16 @@ module storageRoleSearchService 'core/security/role.bicep' = if (useIntegratedVe
11091109
}
11101110
}
11111111

1112+
module storageRoleContributorSearchService 'core/security/role.bicep' = if (useIntegratedVectorization && useMultimodal) {
1113+
scope: storageResourceGroup
1114+
name: 'storage-role-contributor-searchservice'
1115+
params: {
1116+
principalId: searchService.outputs.principalId
1117+
roleDefinitionId: 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' // Storage Blob Data Contributor
1118+
principalType: 'ServicePrincipal'
1119+
}
1120+
}
1121+
11121122
// Used to issue search queries
11131123
// https://learn.microsoft.com/azure/search/search-security-rbac
11141124
module searchRoleBackend 'core/security/role.bicep' = {

0 commit comments

Comments
 (0)