58 changes: 58 additions & 0 deletions code/backend/batch/combine_pages_chunknos.py
@@ -0,0 +1,58 @@
import logging
import azure.functions as func
import json

bp_combine_pages_and_chunknos = func.Blueprint()


@bp_combine_pages_and_chunknos.route(
    route="combine_pages_and_chunknos",
    methods=["POST"],
    auth_level=func.AuthLevel.ANONYMOUS,
)
def combine_pages_and_chunknos(req: func.HttpRequest) -> func.HttpResponse:
    """
    This function is designed to be called by an Azure Cognitive Search WebApiSkill.
    It expects the standard custom-skill payload: a "values" array of records whose
    "data" holds two parallel arrays ("pages" and "chunk_nos"), and it zips them
    into a single array of objects per record.
    """
    logging.info("Combine pages and chunk numbers function processed a request.")

    try:
        req_body = req.get_json()
        values = req_body.get("values", [])

        response_values = []

        for value in values:
            record_id = value.get("recordId")
            data = value.get("data", {})

            pages = data.get("pages", [])
            chunk_nos = data.get("chunk_nos", [])

            # Zip the two parallel arrays into one object per chunk
            zipped_data = [
                {"page_text": page, "chunk_no": chunk}
                for page, chunk in zip(pages, chunk_nos)
            ]

            response_values.append(
                {
                    "recordId": record_id,
                    "data": {"pages_with_chunks": zipped_data},
                    "errors": None,
                    "warnings": None,
                }
            )

        # Return the response in the format expected by the WebApiSkill
        return func.HttpResponse(
            body=json.dumps({"values": response_values}),
            mimetype="application/json",
            status_code=200,
        )

    except Exception as e:
        logging.error(f"Error in combine_pages_and_chunknos function: {e}")
        return func.HttpResponse(
            body=json.dumps(
                {
                    "values": [
                        {
                            "recordId": "error",
                            "data": {},
                            "errors": [{"message": str(e)}],
                            "warnings": [],
                        }
                    ]
                }
            ),
            mimetype="application/json",
            status_code=500,
        )
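For reference, a minimal local smoke test of the payload round-trip (a sketch, not part of this PR; the sample record is hypothetical, and unwrapping the blueprint-decorated function via .build().get_user_function() is an assumption about the azure-functions v2 Python programming model):

import json

import azure.functions as func

from combine_pages_chunknos import combine_pages_and_chunknos

# One record shaped the way the WebApiSkill sends it (hypothetical content).
payload = {
    "values": [
        {
            "recordId": "1",
            "data": {
                "pages": ["first chunk text", "second chunk text"],
                "chunk_nos": [0, 1],
            },
        }
    ]
}

request = func.HttpRequest(
    method="POST",
    url="/api/combine_pages_and_chunknos",
    body=json.dumps(payload).encode("utf-8"),
)

# Assumption: v2-model decorators return a FunctionBuilder, unwrapped like this.
handler = combine_pages_and_chunknos.build().get_user_function()
response = handler(request)
print(json.loads(response.get_body()))
# Expected shape: {"values": [{"recordId": "1", "data": {"pages_with_chunks": [
#     {"page_text": "first chunk text", "chunk_no": 0},
#     {"page_text": "second chunk text", "chunk_no": 1}]},
#     "errors": None, "warnings": None}]}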
2 changes: 2 additions & 0 deletions code/backend/batch/function_app.py
@@ -5,6 +5,7 @@
from batch_push_results import bp_batch_push_results
from batch_start_processing import bp_batch_start_processing
from get_conversation_response import bp_get_conversation_response
+from combine_pages_chunknos import bp_combine_pages_and_chunknos
from azure.monitor.opentelemetry import configure_azure_monitor

logging.captureWarnings(True)
@@ -20,3 +21,4 @@
app.register_functions(bp_batch_push_results)
app.register_functions(bp_batch_start_processing)
app.register_functions(bp_get_conversation_response)
+app.register_functions(bp_combine_pages_and_chunknos)
@@ -1,5 +1,5 @@
import logging
-from azure.search.documents.indexes.models import SearchIndexer, FieldMapping
+from azure.search.documents.indexes.models import SearchIndexer, FieldMapping, FieldMappingFunction
from azure.search.documents.indexes import SearchIndexerClient
from ..helpers.env_helper import EnvHelper
from ..helpers.azure_credential_utils import get_azure_credential
@@ -35,6 +35,13 @@ def create_or_update_indexer(self, indexer_name: str, skillset_name: str):
                }
            },
            field_mappings=[
+                FieldMapping(
+                    source_field_name="metadata_storage_path",
+                    target_field_name="id",
+                    mapping_function=FieldMappingFunction(
+                        name="base64Encode", parameters={"useHttpServerUtilityUrlTokenEncode": False}
+                    )
+                ),
                FieldMapping(
                    source_field_name="metadata_storage_path",
                    target_field_name="source",
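Side note on the new id mapping: blob URLs contain characters that are not valid in a search document key, so the storage path is Base64-encoded before being used as the key. A rough Python illustration of the idea (the exact variant base64Encode applies when useHttpServerUtilityUrlTokenEncode is False is an assumption here, treated as URL-safe unpadded Base64; verify against a live index before depending on it):

import base64

storage_path = "https://myaccount.blob.core.windows.net/docs/report.pdf"  # hypothetical
# URL-safe alphabet, padding stripped: yields only letters, digits, "-", "_"
key = base64.urlsafe_b64encode(storage_path.encode("utf-8")).decode("ascii").rstrip("=")
print(key)  # usable as the "id" key of the projected search documents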
@@ -6,6 +6,8 @@
    AzureOpenAIEmbeddingSkill,
    OcrSkill,
    MergeSkill,
+    ShaperSkill,
+    WebApiSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
@@ -83,12 +85,30 @@ def create_skillset(self):
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/merged_content"),
            ],
-            outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
+            outputs=[
+                OutputFieldMappingEntry(name="textItems", target_name="pages"),
+                OutputFieldMappingEntry(name="ordinalPositions", target_name="chunk_nos"),
+            ],
        )

+        # Custom WebApi skill to combine pages and chunk numbers into a single structure
+        combine_pages_and_chunk_nos_skill = WebApiSkill(
+            description="Combine pages and chunk numbers together",
+            context="/document",
+            uri=f"{self.env_helper.BACKEND_URL}/api/combine_pages_and_chunknos",
+            http_method="POST",
+            inputs=[
+                InputFieldMappingEntry(name="pages", source="/document/pages"),
+                InputFieldMappingEntry(name="chunk_nos", source="/document/chunk_nos"),
+            ],
+            outputs=[
+                OutputFieldMappingEntry(name="pages_with_chunks", target_name="pages_with_chunks")
+            ],
+        )

        embedding_skill = AzureOpenAIEmbeddingSkill(
            description="Skill to generate embeddings via Azure OpenAI",
-            context="/document/pages/*",
+            context="/document/pages_with_chunks/*",
            resource_uri=self.env_helper.AZURE_OPENAI_ENDPOINT,
            deployment_id=self.env_helper.AZURE_OPENAI_EMBEDDING_MODEL,
            api_key=(
@@ -104,31 +124,49 @@ def create_skillset(self):
                )
            ),
            inputs=[
-                InputFieldMappingEntry(name="text", source="/document/pages/*"),
+                InputFieldMappingEntry(name="text", source="/document/pages_with_chunks/*/page_text"),
            ],
            outputs=[
                OutputFieldMappingEntry(name="embedding", target_name="content_vector")
            ],
        )

+        metadata_shaper = ShaperSkill(
+            description="Structure metadata fields into a complex object",
+            context="/document/pages_with_chunks/*",
+            inputs=[
+                InputFieldMappingEntry(name="id", source="/document/id"),
+                InputFieldMappingEntry(name="source", source="/document/metadata_storage_path"),
+                InputFieldMappingEntry(name="title", source="/document/title"),
+                InputFieldMappingEntry(name="chunk", source="/document/pages_with_chunks/*/chunk_no"),
+            ],
+            outputs=[
+                OutputFieldMappingEntry(name="output", target_name="metadata_object")
+            ],
+        )
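For one chunk, the shaper emits a complex object per /document/pages_with_chunks/* context roughly like this (hand-written illustration; all values are hypothetical, and "id" is the Base64-encoded storage path from the new indexer field mapping):

# Shape of one metadata_object, later projected into the "metadata" index field.
metadata_object = {
    "id": "aHR0cHM6Ly9teWFjY291bnQu...",  # truncated Base64 of the storage path
    "source": "https://myaccount.blob.core.windows.net/docs/report.pdf",
    "title": "report.pdf",
    "chunk": 3,
}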

        index_projections = SearchIndexerIndexProjections(
            selectors=[
                SearchIndexerIndexProjectionSelector(
                    target_index_name=self.env_helper.AZURE_SEARCH_INDEX,
                    parent_key_field_name="id",
-                    source_context="/document/pages/*",
+                    source_context="/document/pages_with_chunks/*",
                    mappings=[
                        InputFieldMappingEntry(
-                            name="content", source="/document/pages/*"
+                            name="content", source="/document/pages_with_chunks/*/page_text"
                        ),
                        InputFieldMappingEntry(
                            name="content_vector",
-                            source="/document/pages/*/content_vector",
+                            source="/document/pages_with_chunks/*/content_vector",
                        ),
                        InputFieldMappingEntry(name="title", source="/document/title"),
                        InputFieldMappingEntry(
                            name="source", source="/document/metadata_storage_path"
                        ),
+                        InputFieldMappingEntry(
+                            name="metadata",
+                            source="/document/pages_with_chunks/*/metadata_object",
+                        ),
                    ],
                ),
            ],
@@ -140,7 +178,7 @@ def create_skillset(self):
        skillset = SearchIndexerSkillset(
            name=skillset_name,
            description="Skillset to chunk documents and generating embeddings",
-            skills=[ocr_skill, merge_skill, split_skill, embedding_skill],
+            skills=[ocr_skill, merge_skill, split_skill, combine_pages_and_chunk_nos_skill, embedding_skill, metadata_shaper],
            index_projections=index_projections,
        )

15 changes: 7 additions & 8 deletions code/create_app.py
@@ -56,14 +56,17 @@ def get_citations(citation_list):
            else citation["url"]
        )
        title = citation["title"]
-        url = get_markdown_url(metadata["source"], title, container_sas)
+        source = metadata["source"]
+        if "_SAS_TOKEN_PLACEHOLDER_" not in source:
+            source += "_SAS_TOKEN_PLACEHOLDER_"
+        url = get_markdown_url(source, title, container_sas)
        citations_dict["citations"].append(
            {
                "content": url + "\n\n\n" + citation["content"],
                "id": metadata["id"],
                "chunk_id": (
                    re.findall(r"\d+", metadata["chunk_id"])[-1]
-                    if metadata["chunk_id"] is not None
+                    if metadata.get("chunk_id") is not None
                    else metadata["chunk"]
                ),
                "title": title,
@@ -196,7 +199,8 @@ def conversation_with_data(conversation: Request, env_helper: EnvHelper):
                    }
                    if env_helper.is_auth_type_keys()
                    else {
-                        "type": "system_assigned_managed_identity",
+                        "type": "user_assigned_managed_identity",
+                        "managed_identity_resource_id": env_helper.MANAGED_IDENTITY_RESOURCE_ID,
                    }
                ),
                "endpoint": env_helper.AZURE_SEARCH_SERVICE,
@@ -211,11 +215,6 @@
                    env_helper.AZURE_SEARCH_CONTENT_VECTOR_COLUMN
                ],
                "title_field": env_helper.AZURE_SEARCH_TITLE_COLUMN or None,
-                "source_field": env_helper.AZURE_SEARCH_SOURCE_COLUMN
-                or None,
-                "text_field": env_helper.AZURE_SEARCH_TEXT_COLUMN or None,
-                "layoutText_field": env_helper.AZURE_SEARCH_LAYOUT_TEXT_COLUMN
-                or None,
                "url_field": env_helper.AZURE_SEARCH_FIELDS_METADATA
                or None,
                "filepath_field": (