from azure.search.documents.indexes.models import (
    AIServicesAccountIdentity,
    AzureOpenAIEmbeddingSkill,
-    BlobIndexerImageAction,
+    DocumentIntelligenceLayoutSkill,
+    DocumentIntelligenceLayoutSkillChunkingProperties,
    IndexingParameters,
    IndexingParametersConfiguration,
    IndexProjectionMode,
    InputFieldMappingEntry,
+    MergeSkill,
    OutputFieldMappingEntry,
    SearchIndexer,
    SearchIndexerDataContainer,
@@ -75,11 +77,10 @@ def __init__(
        self.data_source_name = f"{prefix}-blob"
        self.use_multimodal = use_multimodal and image_embeddings is not None

-    async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset:
+    async def create_skillset(self, index_name: str) -> SearchIndexerSkillset:
        """
        Create a skillset for the indexer to chunk documents and generate embeddings
        """
-
        split_skill = SplitSkill(
            name="split-skill",
            description="Split skill to chunk documents",
@@ -107,6 +108,83 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
            outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
        )

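+        # Project each chunk in /document/pages/* to its own search document; the parent document itself is not indexed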
+        index_projection = SearchIndexerIndexProjection(
+            selectors=[
+                SearchIndexerIndexProjectionSelector(
+                    target_index_name=index_name,
+                    parent_key_field_name="parent_id",
+                    source_context="/document/pages/*",
+                    mappings=[
+                        InputFieldMappingEntry(name="content", source="/document/pages/*"),
+                        InputFieldMappingEntry(name="sourcepage", source="/document/metadata_storage_name"),
+                        InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
+                        InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"),
+                        InputFieldMappingEntry(
+                            name=self.search_field_name_embedding, source="/document/pages/*/vector"
+                        ),
+                    ],
+                ),
+            ],
+            parameters=SearchIndexerIndexProjectionsParameters(
+                projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
+            ),
+        )
+
+        skillset = SearchIndexerSkillset(
+            name=self.skillset_name,
+            description="Skillset to chunk documents and generate embeddings",
+            skills=[split_skill, embedding_skill],
+            index_projection=index_projection,
+        )
+
+        return skillset
+
+    async def create_multimodal_skillset(self, index_name: str) -> SearchIndexerSkillset:
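+        """
+        Create a skillset that uses the Document Intelligence layout skill to extract text and images,
+        chunks the text, and generates both text and image embeddings
+        """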
+        document_layout_skill = DocumentIntelligenceLayoutSkill(
+            description="Layout skill to read documents",
+            context="/document",
+            output_mode="oneToMany",
+            output_format="text",
+            markdown_header_depth="",  # Necessary so that the SDK doesn't send a header depth
+            extraction_options=["images", "locationMetadata"],
+            chunking_properties=DocumentIntelligenceLayoutSkillChunkingProperties(
+                unit="characters",
+                maximum_length=2000,
+                overlap_length=200,
+            ),
+            inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
+            outputs=[
+                OutputFieldMappingEntry(name="text_sections", target_name="text_sections"),
+                OutputFieldMappingEntry(name="normalized_images", target_name="normalized_images"),
+            ],
+        )
+
+        split_skill = SplitSkill(
+            description="Split skill to chunk pages of documents",
+            text_split_mode="pages",
+            context="/document/text_sections/*",
+            maximum_page_length=2000,
+            page_overlap_length=500,
+            inputs=[
+                InputFieldMappingEntry(name="text", source="/document/text_sections/*/content"),
+            ],
+            outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
+        )
+
+        embedding_skill = AzureOpenAIEmbeddingSkill(
+            name="embedding-skill",
+            description="Skill to generate embeddings via Azure OpenAI",
+            context="/document/text_sections/*/pages/*",
+            resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
+            deployment_name=self.embeddings.open_ai_deployment,
+            model_name=self.embeddings.open_ai_model_name,
+            dimensions=self.embeddings.open_ai_dimensions,
+            inputs=[
+                InputFieldMappingEntry(name="text", source="/document/text_sections/*/pages/*"),
+            ],
+            outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
+        )
+
        vision_embedding_skill = VisionVectorizeSkill(
            name="vision-embedding-skill",
            description="Skill to generate image embeddings via Azure AI Vision",
@@ -115,6 +193,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
            inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
            outputs=[OutputFieldMappingEntry(name="vector", target_name="image_vector")],
        )
+
        vision_embedding_shaper_skill = ShaperSkill(
            name="vision-embedding-shaper-skill",
            description="Shaper skill to ensure image embeddings are in the correct format",
@@ -123,70 +202,80 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
                InputFieldMappingEntry(name="embedding", source="/document/normalized_images/*/image_vector"),
                InputFieldMappingEntry(
                    name="url",
-                    # source=f'="{self.blob_manager.endpoint}/images/"+$(/document/normalized_images/*/imagePath)'
-                    source="=$(/document/normalized_images/*/imagePath)",
+                    source=f'="{self.blob_manager.endpoint}/images/"+$(/document/normalized_images/*/imagePath)',
                ),
            ],
            outputs=[OutputFieldMappingEntry(name="output", target_name="images")],
        )

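+        # Build a per-chunk citation by concatenating the blob name, "#page=", and the chunk's page number (e.g. "mydoc.pdf#page=2")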
+        merge_skill = MergeSkill(
+            name="merge-skill",
+            description="Merge skill to create source page",
+            insert_post_tag="",
+            insert_pre_tag="",
+            context="/document/text_sections/*/locationMetadata",
+            inputs=[
+                InputFieldMappingEntry(
+                    name="itemsToInsert",
+                    source='=[$(/document/metadata_storage_name), "#page=", $(/document/text_sections/*/locationMetadata/pageNumber)]',
+                )
+            ],
+            outputs=[OutputFieldMappingEntry(name="mergedText", target_name="citation")],
+        )
+
+        indexer_skills = [
+            document_layout_skill,
+            split_skill,
+            embedding_skill,
+            vision_embedding_skill,
+            vision_embedding_shaper_skill,
+            merge_skill,
+        ]
+
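+        # Project each text chunk (/document/text_sections/*/pages/*) to its own search document, mapping in its citation, embedding, and the extracted images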
        index_projection = SearchIndexerIndexProjection(
            selectors=[
                SearchIndexerIndexProjectionSelector(
                    target_index_name=index_name,
                    parent_key_field_name="parent_id",
-                    source_context="/document/pages/*",
+                    source_context="/document/text_sections/*/pages/*",
                    mappings=[
-                        InputFieldMappingEntry(name="content", source="/document/pages/*"),
-                        InputFieldMappingEntry(name="sourcepage", source="/document/metadata_storage_name"),
+                        InputFieldMappingEntry(name="content", source="/document/text_sections/*/pages/*"),
                        InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
+                        InputFieldMappingEntry(
+                            name="sourcepage", source="/document/text_sections/*/locationMetadata/citation"
+                        ),
                        InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"),
                        InputFieldMappingEntry(
-                            name=self.search_field_name_embedding, source="/document/pages/*/vector"
+                            name=self.search_field_name_embedding, source="/document/text_sections/*/pages/*/vector"
                        ),
                        InputFieldMappingEntry(name="images", source="/document/normalized_images/*/images"),
                    ],
-                ),
+                )
            ],
            parameters=SearchIndexerIndexProjectionsParameters(
                projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
            ),
        )

-        indexer_skills = [split_skill, embedding_skill]
-        if self.use_multimodal:
-            indexer_skills.extend([vision_embedding_skill, vision_embedding_shaper_skill])
-        extra_params = {}
-        if self.use_multimodal:
-            extra_params = {
-                "cognitive_services_account": AIServicesAccountIdentity(subdomain_url=self.image_embeddings.endpoint),
-                "knowledge_store": SearchIndexerKnowledgeStore(
-                    storage_connection_string=self.blob_manager.get_managedidentity_connectionstring(),
-                    projections=[
-                        SearchIndexerKnowledgeStoreProjection(
-                            files=[
-                                SearchIndexerKnowledgeStoreFileProjectionSelector(
-                                    storage_container=self.blob_manager.image_container,
-                                    source="/document/normalized_images/*",
-                                )
-                            ]
-                        )
-                    ],
-                ),
-            }
-
-        # We still need to map the images onto url in the images complex field type
-        # something about key path
-        # id = "feb5e192afb6_aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L05vcnRod2luZF9IZWFsdGhfUGx1c19CZW5lZml0c19EZXRhaWxzLnBkZg2_pages_65",
-        # parent_id = is the folder name
-        # https://stxxk4qzq3tahic2.blob.core.windows.net/images/aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L0JlbmVmaXRfT3B0aW9ucy5wZGY1/normalized_images_1.jpg
-
        skillset = SearchIndexerSkillset(
            name=self.skillset_name,
            description="Skillset to chunk documents and generate embeddings",
            skills=indexer_skills,
            index_projection=index_projection,
-            **extra_params,
+            cognitive_services_account=AIServicesAccountIdentity(subdomain_url=self.image_embeddings.endpoint),
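+            # Write each extracted image to the image container via the knowledge store, using the blob manager's managed identity connection string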
+            knowledge_store=SearchIndexerKnowledgeStore(
+                storage_connection_string=self.blob_manager.get_managedidentity_connectionstring(),
+                projections=[
+                    SearchIndexerKnowledgeStoreProjection(
+                        files=[
+                            SearchIndexerKnowledgeStoreFileProjectionSelector(
+                                storage_container=self.blob_manager.image_container,
+                                source="/document/normalized_images/*",
+                            )
+                        ]
+                    )
+                ],
+            ),
        )

        return skillset
@@ -217,8 +306,11 @@ async def setup(self):

        await ds_client.create_or_update_data_source_connection(data_source_connection)

-        embedding_skillset = await self.create_embedding_skill(self.search_info.index_name)
-        await ds_client.create_or_update_skillset(embedding_skillset)
+        if self.use_multimodal:
+            skillset = await self.create_multimodal_skillset(self.search_info.index_name)
+        else:
+            skillset = await self.create_skillset(self.search_info.index_name)
+        await ds_client.create_or_update_skillset(skillset)
        await ds_client.close()

    async def run(self):
@@ -237,15 +329,14 @@ async def run(self):
        elif self.document_action == DocumentAction.RemoveAll:
            await self.blob_manager.remove_blob()

-        # Create an indexer
        extra_params = {}
        if self.use_multimodal:
            extra_params = {
                "parameters": IndexingParameters(
                    configuration=IndexingParametersConfiguration(
-                        query_timeout=None,  # Current bug in AI Search SDK
-                        image_action=BlobIndexerImageAction.GENERATE_NORMALIZED_IMAGES,
-                    )
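+                        # allow_skillset_to_read_file_data exposes the original blob to the skillset as /document/file_data (consumed by the layout skill)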
+                        query_timeout=None, allow_skillset_to_read_file_data=True  # query_timeout=None works around a current bug in the AI Search SDK
+                    ),
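+                    # max_failed_items=-1: keep the indexer running even if individual documents fail to process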
+                    max_failed_items=-1,
                )
            }