58 changes: 58 additions & 0 deletions code/backend/batch/combine_pages_chunknos.py
@@ -0,0 +1,58 @@
import logging
import azure.functions as func
import json

bp_combine_pages_and_chunknos = func.Blueprint()


@bp_combine_pages_and_chunknos.route(
    route="combine_pages_and_chunknos",
    methods=["POST"],
    auth_level=func.AuthLevel.ANONYMOUS,
)
def combine_pages_and_chunknos(req: func.HttpRequest) -> func.HttpResponse:
    """
    This function is designed to be called by an Azure Cognitive Search WebApiSkill.
    It expects the standard custom-skill payload: a "values" array of records whose
    "data" holds two parallel arrays ("pages" and "chunk_nos"), and it zips them
    into a single array of objects per record.
    """
    logging.info("Combine pages and chunk numbers function processed a request.")

    try:
        req_body = req.get_json()
        values = req_body.get("values", [])

        response_values = []

        for value in values:
            record_id = value.get("recordId")
            data = value.get("data", {})

            pages = data.get("pages", [])
            chunk_nos = data.get("chunk_nos", [])

            # Zip the two parallel arrays into one object per chunk
            zipped_data = [
                {"page_text": page, "chunk_no": chunk}
                for page, chunk in zip(pages, chunk_nos)
            ]

            response_values.append(
                {
                    "recordId": record_id,
                    "data": {"pages_with_chunks": zipped_data},
                    "errors": None,
                    "warnings": None,
                }
            )

        # Return the response in the format expected by the WebApiSkill
        return func.HttpResponse(
            body=json.dumps({"values": response_values}),
            mimetype="application/json",
            status_code=200,
        )

    except Exception as e:
        logging.error(f"Error in combine_pages_and_chunknos function: {e}")
        return func.HttpResponse(
            body=json.dumps(
                {
                    "values": [
                        {
                            "recordId": "error",
                            "data": {},
                            "errors": [{"message": str(e)}],
                            "warnings": [],
                        }
                    ]
                }
            ),
            mimetype="application/json",
            status_code=500,
        )
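For reference, a minimal local smoke test of the payload round-trip (a sketch, not part of this PR; the sample record is hypothetical, and unwrapping the blueprint-decorated function via .build().get_user_function() is an assumption about the azure-functions v2 Python programming model):

import json

import azure.functions as func

from combine_pages_chunknos import combine_pages_and_chunknos

# One record shaped the way the WebApiSkill sends it (hypothetical content).
payload = {
    "values": [
        {
            "recordId": "1",
            "data": {
                "pages": ["first chunk text", "second chunk text"],
                "chunk_nos": [0, 1],
            },
        }
    ]
}

request = func.HttpRequest(
    method="POST",
    url="/api/combine_pages_and_chunknos",
    body=json.dumps(payload).encode("utf-8"),
)

# Assumption: v2-model decorators return a FunctionBuilder, unwrapped like this.
handler = combine_pages_and_chunknos.build().get_user_function()
response = handler(request)
print(json.loads(response.get_body()))
# Expected shape: {"values": [{"recordId": "1", "data": {"pages_with_chunks": [
#     {"page_text": "first chunk text", "chunk_no": 0},
#     {"page_text": "second chunk text", "chunk_no": 1}]},
#     "errors": None, "warnings": None}]}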
2 changes: 2 additions & 0 deletions code/backend/batch/function_app.py
@@ -5,6 +5,7 @@
from batch_push_results import bp_batch_push_results
from batch_start_processing import bp_batch_start_processing
from get_conversation_response import bp_get_conversation_response
+from combine_pages_chunknos import bp_combine_pages_and_chunknos
from azure.monitor.opentelemetry import configure_azure_monitor

logging.captureWarnings(True)
@@ -20,3 +21,4 @@
app.register_functions(bp_batch_push_results)
app.register_functions(bp_batch_start_processing)
app.register_functions(bp_get_conversation_response)
+app.register_functions(bp_combine_pages_and_chunknos)
@@ -1,5 +1,5 @@
import logging
-from azure.search.documents.indexes.models import SearchIndexer, FieldMapping
+from azure.search.documents.indexes.models import SearchIndexer, FieldMapping, FieldMappingFunction
from azure.search.documents.indexes import SearchIndexerClient
from ..helpers.env_helper import EnvHelper
from ..helpers.azure_credential_utils import get_azure_credential
@@ -35,6 +35,13 @@ def create_or_update_indexer(self, indexer_name: str, skillset_name: str):
                }
            },
            field_mappings=[
+                FieldMapping(
+                    source_field_name="metadata_storage_path",
+                    target_field_name="id",
+                    mapping_function=FieldMappingFunction(
+                        name="base64Encode", parameters={"useHttpServerUtilityUrlTokenEncode": False}
+                    )
+                ),
                FieldMapping(
                    source_field_name="metadata_storage_path",
                    target_field_name="source",
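Side note on the new id mapping: blob URLs contain characters that are not valid in a search document key, so the storage path is Base64-encoded before being used as the key. A rough Python illustration of the idea (the exact variant base64Encode applies when useHttpServerUtilityUrlTokenEncode is False is an assumption here, treated as URL-safe unpadded Base64; verify against a live index before depending on it):

import base64

storage_path = "https://myaccount.blob.core.windows.net/docs/report.pdf"  # hypothetical
# URL-safe alphabet, padding stripped: yields only letters, digits, "-", "_"
key = base64.urlsafe_b64encode(storage_path.encode("utf-8")).decode("ascii").rstrip("=")
print(key)  # usable as the "id" key of the projected search documents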
@@ -6,6 +6,8 @@
    AzureOpenAIEmbeddingSkill,
    OcrSkill,
    MergeSkill,
+    ShaperSkill,
+    WebApiSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
@@ -83,12 +85,30 @@ def create_skillset(self):
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/merged_content"),
            ],
-            outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
+            outputs=[
+                OutputFieldMappingEntry(name="textItems", target_name="pages"),
+                OutputFieldMappingEntry(name="ordinalPositions", target_name="chunk_nos"),
+            ],
        )

+        # Custom WebApi skill to combine pages and chunk numbers into a single structure
+        combine_pages_and_chunk_nos_skill = WebApiSkill(
+            description="Combine pages and chunk numbers together",
+            context="/document",
+            uri=f"{self.env_helper.BACKEND_URL}/api/combine_pages_and_chunknos",
+            http_method="POST",
+            inputs=[
+                InputFieldMappingEntry(name="pages", source="/document/pages"),
+                InputFieldMappingEntry(name="chunk_nos", source="/document/chunk_nos"),
+            ],
+            outputs=[
+                OutputFieldMappingEntry(name="pages_with_chunks", target_name="pages_with_chunks")
+            ],
+        )

        embedding_skill = AzureOpenAIEmbeddingSkill(
            description="Skill to generate embeddings via Azure OpenAI",
-            context="/document/pages/*",
+            context="/document/pages_with_chunks/*",
            resource_uri=self.env_helper.AZURE_OPENAI_ENDPOINT,
            deployment_id=self.env_helper.AZURE_OPENAI_EMBEDDING_MODEL,
            api_key=(
@@ -104,31 +124,49 @@ def create_skillset(self):
                )
            ),
            inputs=[
-                InputFieldMappingEntry(name="text", source="/document/pages/*"),
+                InputFieldMappingEntry(name="text", source="/document/pages_with_chunks/*/page_text"),
            ],
            outputs=[
                OutputFieldMappingEntry(name="embedding", target_name="content_vector")
            ],
        )

+        metadata_shaper = ShaperSkill(
+            description="Structure metadata fields into a complex object",
+            context="/document/pages_with_chunks/*",
+            inputs=[
+                InputFieldMappingEntry(name="id", source="/document/id"),
+                InputFieldMappingEntry(name="source", source="/document/metadata_storage_path"),
+                InputFieldMappingEntry(name="title", source="/document/title"),
+                InputFieldMappingEntry(name="chunk", source="/document/pages_with_chunks/*/chunk_no"),
+            ],
+            outputs=[
+                OutputFieldMappingEntry(name="output", target_name="metadata_object")
+            ],
+        )
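For one chunk, the shaper emits a complex object per /document/pages_with_chunks/* context roughly like this (hand-written illustration; all values are hypothetical, and "id" is the Base64-encoded storage path from the new indexer field mapping):

# Shape of one metadata_object, later projected into the "metadata" index field.
metadata_object = {
    "id": "aHR0cHM6Ly9teWFjY291bnQu...",  # truncated Base64 of the storage path
    "source": "https://myaccount.blob.core.windows.net/docs/report.pdf",
    "title": "report.pdf",
    "chunk": 3,
}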

        index_projections = SearchIndexerIndexProjections(
            selectors=[
                SearchIndexerIndexProjectionSelector(
                    target_index_name=self.env_helper.AZURE_SEARCH_INDEX,
                    parent_key_field_name="id",
-                    source_context="/document/pages/*",
+                    source_context="/document/pages_with_chunks/*",
                    mappings=[
                        InputFieldMappingEntry(
-                            name="content", source="/document/pages/*"
+                            name="content", source="/document/pages_with_chunks/*/page_text"
                        ),
                        InputFieldMappingEntry(
                            name="content_vector",
-                            source="/document/pages/*/content_vector",
+                            source="/document/pages_with_chunks/*/content_vector",
                        ),
                        InputFieldMappingEntry(name="title", source="/document/title"),
                        InputFieldMappingEntry(
                            name="source", source="/document/metadata_storage_path"
                        ),
+                        InputFieldMappingEntry(
+                            name="metadata",
+                            source="/document/pages_with_chunks/*/metadata_object",
+                        ),
                    ],
                ),
            ],
@@ -140,7 +178,7 @@ def create_skillset(self):
        skillset = SearchIndexerSkillset(
            name=skillset_name,
            description="Skillset to chunk documents and generating embeddings",
-            skills=[ocr_skill, merge_skill, split_skill, embedding_skill],
+            skills=[ocr_skill, merge_skill, split_skill, combine_pages_and_chunk_nos_skill, embedding_skill, metadata_shaper],
            index_projections=index_projections,
        )

15 changes: 7 additions & 8 deletions code/create_app.py
@@ -56,14 +56,17 @@ def get_citations(citation_list):
            else citation["url"]
        )
        title = citation["title"]
-        url = get_markdown_url(metadata["source"], title, container_sas)
+        source = metadata["source"]
+        if "_SAS_TOKEN_PLACEHOLDER_" not in source:
+            source += "_SAS_TOKEN_PLACEHOLDER_"
+        url = get_markdown_url(source, title, container_sas)
        citations_dict["citations"].append(
            {
                "content": url + "\n\n\n" + citation["content"],
                "id": metadata["id"],
                "chunk_id": (
                    re.findall(r"\d+", metadata["chunk_id"])[-1]
-                    if metadata["chunk_id"] is not None
+                    if metadata.get("chunk_id") is not None
                    else metadata["chunk"]
                ),
                "title": title,
@@ -196,7 +199,8 @@ def conversation_with_data(conversation: Request, env_helper: EnvHelper):
                    }
                    if env_helper.is_auth_type_keys()
                    else {
-                        "type": "system_assigned_managed_identity",
+                        "type": "user_assigned_managed_identity",
+                        "managed_identity_resource_id": env_helper.MANAGED_IDENTITY_RESOURCE_ID,
                    }
                ),
                "endpoint": env_helper.AZURE_SEARCH_SERVICE,
@@ -211,11 +215,6 @@
                    env_helper.AZURE_SEARCH_CONTENT_VECTOR_COLUMN
                ],
                "title_field": env_helper.AZURE_SEARCH_TITLE_COLUMN or None,
-                "source_field": env_helper.AZURE_SEARCH_SOURCE_COLUMN
-                or None,
-                "text_field": env_helper.AZURE_SEARCH_TEXT_COLUMN or None,
-                "layoutText_field": env_helper.AZURE_SEARCH_LAYOUT_TEXT_COLUMN
-                or None,
                "url_field": env_helper.AZURE_SEARCH_FIELDS_METADATA
                or None,
                "filepath_field": (