Commit 5ed3e8d

Revert integrated vectorization changes, using a different strategy
1 parent 11c2d29 commit 5ed3e8d

6 files changed: 12 additions, 214 deletions


.github/copilot-instructions.md

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@
 
 # Adding new data
 
-New files should be added to the `data` folder, and then either run scripts/prepdocs.sh or script/prepdocs.ps1 to ingest the data.
+New files should be added to the `data` folder, and then either run scripts/prepdocs.sh or scripts/prepdocs.ps1 to ingest the data.
 
 # Adding a new azd environment variable
 
app/backend/prepdocs.py

Lines changed: 0 additions & 6 deletions
@@ -559,12 +559,6 @@ async def main(strategy: Strategy, setup_index: bool = True):
             search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
             use_acls=use_acls,
             category=args.category,
-            use_multimodal=use_multimodal,
-            image_embeddings=setup_image_embeddings_service(
-                azure_credential=azd_credential,
-                vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"),
-                use_multimodal=use_multimodal,
-            ),
         )
     else:
         file_processors = setup_file_processors(
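
For orientation, here is a rough sketch (not the repository's actual code) of how the integrated vectorization strategy is wired up once the multimodal arguments are dropped. The helper function, the placeholder values, and the AZURE_SEARCH_SERVICE_ASSIGNED_USERID variable name are illustrative assumptions; only keyword arguments visible in this commit are shown.

```python
import os

from prepdocslib.integratedvectorizerstrategy import IntegratedVectorizerStrategy
from prepdocslib.strategy import DocumentAction


def build_text_only_strategy(search_info, list_file_strategy, blob_manager, embeddings):
    """Hypothetical helper: wires up the text-only strategy that remains after this revert."""
    return IntegratedVectorizerStrategy(
        search_info=search_info,                  # assumption: SearchInfo built earlier in main()
        list_file_strategy=list_file_strategy,    # file listing strategy from prepdocs setup
        blob_manager=blob_manager,                # blob manager from prepdocs setup
        embeddings=embeddings,                    # AzureOpenAIEmbeddingService
        search_field_name_embedding="embedding",  # placeholder field name
        subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
        search_service_user_assigned_id=os.environ["AZURE_SEARCH_SERVICE_ASSIGNED_USERID"],  # assumed variable name
        document_action=DocumentAction.Add,
        search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
        use_acls=False,
        category=None,
    )
```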

app/backend/prepdocslib/goals.json

Lines changed: 0 additions & 16 deletions
This file was deleted.

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 10 additions & 179 deletions
@@ -5,15 +5,9 @@
     NativeBlobSoftDeleteDeletionDetectionPolicy,
 )
 from azure.search.documents.indexes.models import (
-    AIServicesAccountIdentity,
     AzureOpenAIEmbeddingSkill,
-    DocumentIntelligenceLayoutSkill,
-    DocumentIntelligenceLayoutSkillChunkingProperties,
-    IndexingParameters,
-    IndexingParametersConfiguration,
     IndexProjectionMode,
     InputFieldMappingEntry,
-    MergeSkill,
     OutputFieldMappingEntry,
     SearchIndexer,
     SearchIndexerDataContainer,
@@ -22,17 +16,12 @@
     SearchIndexerIndexProjection,
     SearchIndexerIndexProjectionSelector,
     SearchIndexerIndexProjectionsParameters,
-    SearchIndexerKnowledgeStore,
-    SearchIndexerKnowledgeStoreFileProjectionSelector,
-    SearchIndexerKnowledgeStoreProjection,
     SearchIndexerSkillset,
-    ShaperSkill,
     SplitSkill,
-    VisionVectorizeSkill,
 )
 
 from .blobmanager import BlobManager
-from .embeddings import AzureOpenAIEmbeddingService, ImageEmbeddings
+from .embeddings import AzureOpenAIEmbeddingService
 from .listfilestrategy import ListFileStrategy
 from .searchmanager import SearchManager
 from .strategy import DocumentAction, SearchInfo, Strategy
@@ -53,20 +42,20 @@ def __init__(
         embeddings: AzureOpenAIEmbeddingService,
         search_field_name_embedding: str,
         subscription_id: str,
+        search_service_user_assigned_id: str,
         document_action: DocumentAction = DocumentAction.Add,
         search_analyzer_name: Optional[str] = None,
         use_acls: bool = False,
         category: Optional[str] = None,
-        use_multimodal: bool = False,
-        image_embeddings: Optional[ImageEmbeddings] = None,
     ):
+
         self.list_file_strategy = list_file_strategy
         self.blob_manager = blob_manager
         self.document_action = document_action
         self.embeddings = embeddings
-        self.image_embeddings = image_embeddings
         self.search_field_name_embedding = search_field_name_embedding
         self.subscription_id = subscription_id
+        self.search_user_assigned_identity = search_service_user_assigned_id
         self.search_analyzer_name = search_analyzer_name
         self.use_acls = use_acls
         self.category = category
@@ -75,12 +64,12 @@ def __init__(
         self.skillset_name = f"{prefix}-skillset"
         self.indexer_name = f"{prefix}-indexer"
         self.data_source_name = f"{prefix}-blob"
-        self.use_multimodal = use_multimodal and image_embeddings is not None
 
-    async def create_skillset(self, index_name: str) -> SearchIndexerSkillset:
+    async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset:
         """
         Create a skillset for the indexer to chunk documents and generate embeddings
         """
+
         split_skill = SplitSkill(
             name="split-skill",
             description="Split skill to chunk documents",
@@ -139,152 +128,6 @@ async def create_skillset(self, index_name: str) -> SearchIndexerSkillset:
 
         return skillset
 
-    async def create_multimodal_skillset(self, index_name: str) -> SearchIndexerSkillset:
-        if self.image_embeddings is None:
-            raise ValueError("Image embeddings client must be provided for multimodal skillset creation.")
-        if self.blob_manager.image_container is None:
-            raise ValueError("Blob manager must have an image container set for multimodal skillset creation.")
-
-        document_layout_skill = DocumentIntelligenceLayoutSkill(
-            description="Layout skill to read documents",
-            context="/document",
-            output_mode="oneToMany",
-            output_format="text",
-            markdown_header_depth="",  # Necessary so that SDK doesnt send a header depth
-            extraction_options=["images", "locationMetadata"],
-            chunking_properties=DocumentIntelligenceLayoutSkillChunkingProperties(
-                unit="characters",
-                maximum_length=2000,
-                overlap_length=200,
-            ),
-            inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
-            outputs=[
-                OutputFieldMappingEntry(name="text_sections", target_name="text_sections"),
-                OutputFieldMappingEntry(name="normalized_images", target_name="normalized_images"),
-            ],
-        )
-
-        split_skill = SplitSkill(
-            description="Split skill to chunk pages of documents",
-            text_split_mode="pages",
-            context="/document/text_sections/*",
-            maximum_page_length=2000,
-            page_overlap_length=500,
-            inputs=[
-                InputFieldMappingEntry(name="text", source="/document/text_sections/*/content"),
-            ],
-            outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
-        )
-
-        embedding_skill = AzureOpenAIEmbeddingSkill(
-            name="embedding-skill",
-            description="Skill to generate embeddings via Azure OpenAI",
-            context="/document/text_sections/*/pages/*",
-            resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
-            deployment_name=self.embeddings.open_ai_deployment,
-            model_name=self.embeddings.open_ai_model_name,
-            dimensions=self.embeddings.open_ai_dimensions,
-            inputs=[
-                InputFieldMappingEntry(name="text", source="/document/text_sections/*/pages/*"),
-            ],
-            outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
-        )
-
-        vision_embedding_skill = VisionVectorizeSkill(
-            name="vision-embedding-skill",
-            description="Skill to generate image embeddings via Azure AI Vision",
-            context="/document/normalized_images/*",
-            model_version="2023-04-15",
-            inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
-            outputs=[OutputFieldMappingEntry(name="vector", target_name="image_vector")],
-        )
-
-        vision_embedding_shaper_skill = ShaperSkill(
-            name="vision-embedding-shaper-skill",
-            description="Shaper skill to ensure image embeddings are in the correct format",
-            context="/document/normalized_images/*",
-            inputs=[
-                InputFieldMappingEntry(name="embedding", source="/document/normalized_images/*/image_vector"),
-                InputFieldMappingEntry(
-                    name="url",
-                    source=f'="{self.blob_manager.endpoint}/images/"+$(/document/normalized_images/*/imagePath)',
-                ),
-            ],
-            outputs=[OutputFieldMappingEntry(name="output", target_name="images")],
-        )
-
-        merge_skill = MergeSkill(
-            name="merge-skill",
-            description="Merge skill to create source page",
-            insert_post_tag="",
-            insert_pre_tag="",
-            context="/document/text_sections/*/locationMetadata",
-            inputs=[
-                InputFieldMappingEntry(
-                    name="itemsToInsert",
-                    source='=[$(/document/metadata_storage_name), "#page=", $(/document/text_sections/*/locationMetadata/pageNumber)]',
-                )
-            ],
-            outputs=[OutputFieldMappingEntry(name="mergedText", target_name="citation")],
-        )
-
-        indexer_skills = [
-            document_layout_skill,
-            split_skill,
-            embedding_skill,
-            vision_embedding_skill,
-            vision_embedding_shaper_skill,
-            merge_skill,
-        ]
-
-        index_projection = SearchIndexerIndexProjection(
-            selectors=[
-                SearchIndexerIndexProjectionSelector(
-                    target_index_name=index_name,
-                    parent_key_field_name="parent_id",
-                    source_context="/document/text_sections/*/pages/*",
-                    mappings=[
-                        InputFieldMappingEntry(name="content", source="/document/text_sections/*/pages/*"),
-                        InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
-                        InputFieldMappingEntry(
-                            name="sourcepage", source="/document/text_sections/*/locationMetadata/citation"
-                        ),
-                        InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"),
-                        InputFieldMappingEntry(
-                            name=self.search_field_name_embedding, source="/document/text_sections/*/pages/*/vector"
-                        ),
-                        InputFieldMappingEntry(name="images", source="/document/normalized_images/*/images"),
-                    ],
-                )
-            ],
-            parameters=SearchIndexerIndexProjectionsParameters(
-                projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
-            ),
-        )
-
-        skillset = SearchIndexerSkillset(
-            name=self.skillset_name,
-            description="Skillset to chunk documents and generate embeddings",
-            skills=indexer_skills,
-            index_projection=index_projection,
-            cognitive_services_account=AIServicesAccountIdentity(subdomain_url=self.image_embeddings.endpoint),
-            knowledge_store=SearchIndexerKnowledgeStore(
-                storage_connection_string=self.blob_manager.get_managedidentity_connectionstring(),
-                projections=[
-                    SearchIndexerKnowledgeStoreProjection(
-                        files=[
-                            SearchIndexerKnowledgeStoreFileProjectionSelector(
-                                storage_container=self.blob_manager.image_container,
-                                source="/document/normalized_images/*",
-                            )
-                        ]
-                    )
-                ],
-            ),
-        )
-
-        return skillset
-
     async def setup(self):
         logger.info("Setting up search index using integrated vectorization...")
         search_manager = SearchManager(
@@ -294,7 +137,7 @@ async def setup(self):
             use_int_vectorization=True,
             embeddings=self.embeddings,
             field_name_embedding=self.search_field_name_embedding,
-            search_images=self.use_multimodal,
+            search_images=False,
         )
 
         await search_manager.create_index()
@@ -311,11 +154,8 @@ async def setup(self):
 
         await ds_client.create_or_update_data_source_connection(data_source_connection)
 
-        if self.use_multimodal:
-            skillset = await self.create_multimodal_skillset(self.search_info.index_name)
-        else:
-            skillset = await self.create_skillset(self.search_info.index_name)
-        await ds_client.create_or_update_skillset(skillset)
+        embedding_skillset = await self.create_embedding_skill(self.search_info.index_name)
+        await ds_client.create_or_update_skillset(embedding_skillset)
         await ds_client.close()
 
     async def run(self):
@@ -334,22 +174,13 @@ async def run(self):
         elif self.document_action == DocumentAction.RemoveAll:
             await self.blob_manager.remove_blob()
 
-        indexing_parameters = None
-        if self.use_multimodal:
-            indexing_parameters = IndexingParameters(
-                configuration=IndexingParametersConfiguration(
-                    query_timeout=None, allow_skillset_to_read_file_data=True  # type: ignore
-                ),
-                max_failed_items=-1,
-            )
-
+        # Create an indexer
         indexer = SearchIndexer(
             name=self.indexer_name,
             description="Indexer to index documents and generate embeddings",
             skillset_name=self.skillset_name,
             target_index_name=self.search_info.index_name,
            data_source_name=self.data_source_name,
-            parameters=indexing_parameters,  # Properly pass the parameters
         )
 
         indexer_client = self.search_info.create_search_indexer_client()
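
For reference, the surviving create_embedding_skill path boils down to a split skill plus an Azure OpenAI embedding skill whose output is projected into the index. The sketch below is an approximation assembled from the context lines above, not the repository's exact code; the index name, skillset name, chunk sizes, deployment name, and field mappings are placeholder assumptions.

```python
from azure.search.documents.indexes.models import (
    AzureOpenAIEmbeddingSkill,
    IndexProjectionMode,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    SearchIndexerSkillset,
    SplitSkill,
)

# Sketch only: chunk each document, embed each chunk, and project the chunks
# into the target index as child documents keyed by parent_id.
split_skill = SplitSkill(
    name="split-skill",
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    maximum_page_length=2048,  # assumed chunk size
    page_overlap_length=20,    # assumed overlap
    inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

embedding_skill = AzureOpenAIEmbeddingSkill(
    name="embedding-skill",
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document/pages/*",
    resource_url="https://<openai-service>.openai.azure.com",  # taken from self.embeddings in the repo
    deployment_name="text-embedding-3-large",                  # assumed deployment
    model_name="text-embedding-3-large",
    dimensions=3072,
    inputs=[InputFieldMappingEntry(name="text", source="/document/pages/*")],
    outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
)

index_projection = SearchIndexerIndexProjection(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name="gptkbindex",  # placeholder index name
            parent_key_field_name="parent_id",
            source_context="/document/pages/*",
            mappings=[
                InputFieldMappingEntry(name="content", source="/document/pages/*"),
                InputFieldMappingEntry(name="embedding", source="/document/pages/*/vector"),
                InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
            ],
        )
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

skillset = SearchIndexerSkillset(
    name="gptkbindex-skillset",  # placeholder skillset name
    description="Skillset to chunk documents and generate embeddings",
    skills=[split_skill, embedding_skill],
    index_projection=index_projection,
)
```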

docs/multimodal.md

Lines changed: 1 addition & 1 deletion
@@ -98,8 +98,8 @@ With this feature enabled, the following changes are made:
 
 ## Compatibility
 
+* This feature is **not** compatible with [integrated vectorization](./deploy_features.md#enabling-integrated-vectorization), as the currently configured built-in skills do not process images or store image embeddings. Azure AI Search does now offer built-in skills for multimodal support, as demonstrated in [azure-ai-search-multimodal-sample](https://github.com/Azure-Samples/azure-ai-search-multimodal-sample), but we were not able to customize them enough to meet the requirements of this feature. Instead, we are working on making a custom skill based off the data ingestion code in this repository, and hosting that skill on Azure Functions. Stay tuned to the releases to find out when that's available.
 * This feature is **not** fully compatible with the [agentic retrieval](./agentic_retrieval.md) feature.
   The agent *will* perform the multimodal vector embedding search, but it will not return images in the response,
   so we cannot send the images to the chat completion model.
 * This feature *is* compatible with the [reasoning models](./reasoning.md) feature, as long as you use a model that [supports image inputs](https://learn.microsoft.com/azure/ai-services/openai/how-to/reasoning?tabs=python-secure%2Cpy#api--feature-support).
-* This feature is *mostly* compatible with [integrated vectorization](./deploy_features.md#enabling-integrated-vectorization). The extraction process will not be exactly the same, so the chunks will not be identical, and the extracted images will not contain citations.

todo.txt

Lines changed: 0 additions & 11 deletions
This file was deleted.
