Update ai search

BenConstable9 · BenConstable9 · commit 8e6b61d00bf3 · 2025-01-14T19:37:56.000Z
diff --git a/deploy_ai_search/src/deploy_ai_search/ai_search.py b/deploy_ai_search/src/deploy_ai_search/ai_search.py
@@ -322,8 +322,10 @@ def get_text_split_skill(
 
         return semantic_text_chunker_skill
 
-    def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill:
-        """Get the custom skill for adi.
+    def get_layout_analysis_skill(
+        self, chunk_by_page=False, extract_figures=True
+    ) -> WebApiSkill:
+        """Get the custom skill for layout analysis.
 
         Args:
         -----
@@ -343,25 +345,24 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill:
 
         if chunk_by_page:
             output = [
-                OutputFieldMappingEntry(name="extracted_content", target_name="chunks")
+                OutputFieldMappingEntry(name="layout", target_name="page_wise_layout")
             ]
         else:
-            output = [
-                OutputFieldMappingEntry(
-                    name="extracted_content", target_name="extracted_content"
-                )
-            ]
+            output = [OutputFieldMappingEntry(name="layout", target_name="layout")]
 
-        adi_skill = WebApiSkill(
-            name="ADI Skill",
+        layout_analysis_skill = WebApiSkill(
+            name="Layout Analysis Skill",
             description="Skill to generate ADI",
             context="/document",
-            uri=self.environment.get_custom_skill_function_url("adi"),
+            uri=self.environment.get_custom_skill_function_url("layout_analysis"),
             timeout="PT230S",
             batch_size=batch_size,
             degree_of_parallelism=degree_of_parallelism,
             http_method="POST",
-            http_headers={"chunk_by_page": chunk_by_page},
+            http_headers={
+                "chunk_by_page": chunk_by_page,
+                "extract_figures": extract_figures,
+            },
             inputs=[
                 InputFieldMappingEntry(
                     name="source", source="/document/metadata_storage_path"
@@ -371,100 +372,150 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill:
         )
 
         if self.environment.identity_type != IdentityType.KEY:
-            adi_skill.auth_identity = (
+            layout_analysis_skill.auth_identity = (
                 self.environment.function_app_app_registration_resource_id
             )
 
         if self.environment.identity_type == IdentityType.USER_ASSIGNED:
-            adi_skill.auth_identity = self.environment.ai_search_user_assigned_identity
+            layout_analysis_skill.auth_identity = (
+                self.environment.ai_search_user_assigned_identity
+            )
 
-        return adi_skill
+        return layout_analysis_skill
 
-    def get_vector_skill(
-        self, context, source, target_name="vector"
-    ) -> AzureOpenAIEmbeddingSkill:
-        """Get the vector skill for the indexer.
+    def get_figure_analysis_skill(self, figure_source) -> WebApiSkill:
+        """Get the custom skill for figure analysis.
+
+        Args:
+        -----
+            figure_source: Used as the skill context and as the source of the "figure" input.
 
         Returns:
-            AzureOpenAIEmbeddingSkill: The vector skill for the indexer"""
+        --------
+            WebApiSkill: The custom skill for figure analysis"""
 
-        embedding_skill_inputs = [
-            InputFieldMappingEntry(name="text", source=source),
-        ]
-        embedding_skill_outputs = [
-            OutputFieldMappingEntry(name="embedding", target_name=target_name)
+        if self.test:
+            batch_size = 1
+            degree_of_parallelism = 4
+        else:
+            # Depending on your GPT Token limit, you may need to adjust the batch size and degree of parallelism
+            batch_size = 1
+            degree_of_parallelism = 8
+
+        output = [
+            OutputFieldMappingEntry(name="updated_figure", target_name="updated_figure")
         ]
 
-        vector_skill = AzureOpenAIEmbeddingSkill(
-            name="Vector Skill",
-            description="Skill to generate embeddings",
-            context=context,
-            deployment_name=self.environment.open_ai_embedding_deployment,
-            model_name=self.environment.open_ai_embedding_model,
-            resource_url=self.environment.open_ai_endpoint,
-            inputs=embedding_skill_inputs,
-            outputs=embedding_skill_outputs,
-            dimensions=self.environment.open_ai_embedding_dimensions,
+        figure_analysis_skill = WebApiSkill(
+            name="Figure Analysis Skill",
+            description="Skill to generate figure analysis",
+            context=figure_source,
+            uri=self.environment.get_custom_skill_function_url("figure_analysis"),
+            timeout="PT230S",
+            batch_size=batch_size,
+            degree_of_parallelism=degree_of_parallelism,
+            http_method="POST",
+            inputs=[InputFieldMappingEntry(name="figure", source=figure_source)],
+            outputs=output,
         )
 
-        if self.environment.identity_type == IdentityType.KEY:
-            vector_skill.api_key = self.environment.open_ai_api_key
-        elif self.environment.identity_type == IdentityType.USER_ASSIGNED:
-            vector_skill.auth_identity = (
+        if self.environment.identity_type != IdentityType.KEY:
+            figure_analysis_skill.auth_identity = (
+                self.environment.function_app_app_registration_resource_id
+            )
+
+        if self.environment.identity_type == IdentityType.USER_ASSIGNED:
+            figure_analysis_skill.auth_identity = (
                 self.environment.ai_search_user_assigned_identity
             )
 
-        return vector_skill
+        return figure_analysis_skill
 
-    def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill:
-        """Get the key phrase extraction skill.
+    def get_layout_and_figure_merger_skill(self, figure_source) -> WebApiSkill:
+        """Get the custom skill for layout and figure merger.
 
         Args:
         -----
-            context (str): The context of the skill
-            source (str): The source of the skill
+            figure_source: Used as the skill context and as the source of the "figure" input.
 
         Returns:
         --------
-            WebApiSkill: The key phrase extraction skill"""
+            WebApiSkill: The custom skill for layout and figure merger"""
 
         if self.test:
-            batch_size = 4
+            batch_size = 1
             degree_of_parallelism = 4
         else:
-            batch_size = 16
-            degree_of_parallelism = 16
+            # Depending on your GPT Token limit, you may need to adjust the batch size and degree of parallelism
+            batch_size = 1
+            degree_of_parallelism = 8
 
-        key_phrase_extraction_skill_inputs = [
-            InputFieldMappingEntry(name="text", source=source),
-        ]
-        key_phrase_extraction__skill_outputs = [
-            OutputFieldMappingEntry(name="key_phrases", target_name="keywords")
+        output = [
+            OutputFieldMappingEntry(name="updated_figure", target_name="updated_figure")
         ]
-        key_phrase_extraction_skill = WebApiSkill(
-            name="Key phrase extraction API",
-            description="Skill to extract keyphrases",
-            context=context,
-            uri=self.environment.get_custom_skill_function_url("key_phrase_extraction"),
+
+        figure_analysis_skill = WebApiSkill(
+            name="Layout and Figure Merger Skill",
+            description="Skill to merge layout and figure analysis",
+            context=figure_source,
+            uri=self.environment.get_custom_skill_function_url(
+                "layout_and_figure_merger"
+            ),
             timeout="PT230S",
             batch_size=batch_size,
             degree_of_parallelism=degree_of_parallelism,
             http_method="POST",
-            inputs=key_phrase_extraction_skill_inputs,
-            outputs=key_phrase_extraction__skill_outputs,
+            inputs=[InputFieldMappingEntry(name="figure", source=figure_source)],
+            outputs=output,
         )
 
         if self.environment.identity_type != IdentityType.KEY:
-            key_phrase_extraction_skill.auth_identity = (
+            figure_analysis_skill.auth_identity = (
                 self.environment.function_app_app_registration_resource_id
             )
 
         if self.environment.identity_type == IdentityType.USER_ASSIGNED:
-            key_phrase_extraction_skill.auth_identity = (
+            figure_analysis_skill.auth_identity = (
+                self.environment.ai_search_user_assigned_identity
+            )
+
+        return figure_analysis_skill
+
+    def get_vector_skill(
+        self, context, source, target_name="vector"
+    ) -> AzureOpenAIEmbeddingSkill:
+        """Get the vector skill for the indexer.
+
+        Returns:
+            AzureOpenAIEmbeddingSkill: The vector skill for the indexer"""
+
+        embedding_skill_inputs = [
+            InputFieldMappingEntry(name="text", source=source),
+        ]
+        embedding_skill_outputs = [
+            OutputFieldMappingEntry(name="embedding", target_name=target_name)
+        ]
+
+        vector_skill = AzureOpenAIEmbeddingSkill(
+            name="Vector Skill",
+            description="Skill to generate embeddings",
+            context=context,
+            deployment_name=self.environment.open_ai_embedding_deployment,
+            model_name=self.environment.open_ai_embedding_model,
+            resource_url=self.environment.open_ai_endpoint,
+            inputs=embedding_skill_inputs,
+            outputs=embedding_skill_outputs,
+            dimensions=self.environment.open_ai_embedding_dimensions,
+        )
+
+        if self.environment.identity_type == IdentityType.KEY:
+            vector_skill.api_key = self.environment.open_ai_api_key
+        elif self.environment.identity_type == IdentityType.USER_ASSIGNED:
+            vector_skill.auth_identity = (
                 self.environment.ai_search_user_assigned_identity
             )
 
-        return key_phrase_extraction_skill
+        return vector_skill
 
     def get_vector_search(self) -> VectorSearch:
         """Get the vector search configuration for compass.
diff --git a/deploy_ai_search/src/deploy_ai_search/rag_documents.py b/deploy_ai_search/src/deploy_ai_search/rag_documents.py
@@ -98,7 +98,7 @@ def get_index_fields(self) -> list[SearchableField]:
                 facetable=True,
             ),
             ComplexField(
-                name="Figures",
+                name="ChunkFigures",
                 collection=True,
                 fields=[
                     SearchableField(
@@ -107,31 +107,11 @@ def get_index_fields(self) -> list[SearchableField]:
                         collection=True,
                         searchable=False,
                     ),
-                    SimpleField(
-                        name="Container",
-                        type=SearchFieldDataType.String,
-                        filterable=True,
-                    ),
-                    SimpleField(
-                        name="ImageBlob",
-                        type=SearchFieldDataType.String,
-                        filterable=True,
-                    ),
                     SimpleField(
                         name="Caption",
                         type=SearchFieldDataType.String,
                         filterable=True,
                     ),
-                    SimpleField(
-                        name="Offset",
-                        type=SearchFieldDataType.Int64,
-                        filterable=True,
-                    ),
-                    SimpleField(
-                        name="Length",
-                        type=SearchFieldDataType.Int64,
-                        filterable=True,
-                    ),
                     SimpleField(
                         name="PageNumber",
                         type=SearchFieldDataType.Int64,
@@ -258,16 +238,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
             ),
             InputFieldMappingEntry(
                 name="Figures",
-                source_context="/document/chunks/*/figures/*",
-                inputs=[
-                    InputFieldMappingEntry(
-                        name="FigureId", source="/document/chunks/*/figures/*/figure_id"
-                    ),
-                    InputFieldMappingEntry(
-                        name="FigureUri",
-                        source="/document/chunks/*/figures/*/figure_uri",
-                    ),
-                ],
+                source_context="/document/chunks/*/chunk_figures/*",
             ),
             InputFieldMappingEntry(
                 name="DateLastModified", source="/document/DateLastModified"
diff --git a/image_processing/src/image_processing/layout_analysis.py b/image_processing/src/image_processing/layout_analysis.py
@@ -247,7 +247,7 @@ async def process_figures_from_extracted_content(
                         )
                     )
 
-                    image_blob = f"{self.blob}/{figure.id}.png"
+                    blob = f"{self.blob}/{figure.id}.png"
 
                     caption = (
                         figure.caption.content if figure.caption is not None else None
@@ -257,15 +257,15 @@ async def process_figures_from_extracted_content(
                     uri = "{}/{}/{}".format(
                         storage_account_helper.account_url,
                         self.images_container,
-                        image_blob,
+                        blob,
                     )
 
                     offset = figure.spans[0].offset - text_holder.page_offsets
 
                     image_processing_data = FigureHolder(
                         figure_id=figure.id,
                         container=self.images_container,
-                        image_blob=image_blob,
+                        blob=blob,
                         caption=caption,
                         offset=offset,
                         length=figure.spans[0].length,
@@ -293,7 +293,7 @@ async def process_figures_from_extracted_content(
             figure_upload_tasks.append(
                 storage_account_helper.upload_blob(
                     figure_processing_data.container,
-                    figure_processing_data.image_blob,
+                    figure_processing_data.blob,
                     image_data,
                     "image/png",
                 )
diff --git a/image_processing/src/image_processing/layout_holders.py b/image_processing/src/image_processing/layout_holders.py
@@ -9,16 +9,16 @@ class FigureHolder(BaseModel):
 
     """A class to hold the figure extracted from the document."""
 
-    figure_id: str = Field(alias="FigureId")
-    container: str = Field(default="Container")
-    image_blob: str = Field(default="ImageBlob")
-    caption: Optional[str] = Field(default=None, alias="Caption")
-    offset: int = Field(alias="Offset")
-    length: int = Field(alias="Length")
-    page_number: Optional[int] = Field(default=None, alias="PageNumber")
-    uri: str = Field(alias="Uri")
-    description: Optional[str] = Field(default="", alias="Description")
-    data: Optional[str] = Field(default=None, alias="Data")
+    figure_id: str
+    container: str = Field(exclude=True)
+    blob: str = Field(exclude=True)
+    caption: Optional[str] = Field(default=None)
+    offset: int
+    length: int
+    page_number: Optional[int] = Field(default=None)
+    uri: str
+    description: Optional[str] = Field(default="")
+    data: Optional[str] = Field(default=None)
 
     @property
     def markdown(self) -> str:
diff --git a/image_processing/src/image_processing/mark_up_cleaner.py b/image_processing/src/image_processing/mark_up_cleaner.py