Commit 28e0a1e

fix: fix byod flow and update integrated vectorization to work with byod flow (#1905)

1 parent baf6c6d

File tree: 9 files changed, +262 −106 lines changed

code/backend/batch/combine_pages_chunknos.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+import logging
+import azure.functions as func
+import json
+
+bp_combine_pages_and_chunknos = func.Blueprint()
+
+
+@bp_combine_pages_and_chunknos.route(route="combine_pages_and_chunknos", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
+def combine_pages_and_chunknos(req: func.HttpRequest) -> func.HttpResponse:
+    """
+    This function is designed to be called by an Azure Cognitive Search WebApiSkill.
+    It expects a JSON payload with two arrays ("pages" and "chunk_nos") and
+    combines them into a single array of objects.
+    """
+    logging.info("Combine pages and chunk numbers function processed a request.")
+
+    try:
+        req_body = req.get_json()
+        values = req_body.get("values", [])
+
+        response_values = []
+
+        for value in values:
+            record_id = value.get("recordId")
+            data = value.get("data", {})
+
+            pages = data.get("pages", [])
+            chunk_nos = data.get("chunk_nos", [])
+
+            # Zip the two arrays together
+            zipped_data = [
+                {"page_text": page, "chunk_no": chunk}
+                for page, chunk in zip(pages, chunk_nos)
+            ]
+
+            response_values.append(
+                {
+                    "recordId": record_id,
+                    "data": {"pages_with_chunks": zipped_data},
+                    "errors": None,
+                    "warnings": None,
+                }
+            )
+
+        # Return the response in the format expected by the WebApiSkill
+        return func.HttpResponse(
+            body=json.dumps({"values": response_values}),
+            mimetype="application/json",
+            status_code=200,
+        )
+
+    except Exception as e:
+        logging.error(f"Error in combine_pages_and_chunknos function: {e}")
+        return func.HttpResponse(
+            body=json.dumps({"values": [{"recordId": "error", "data": {}, "errors": [{"message": str(e)}], "warnings": []}]}),
+            mimetype="application/json",
+            status_code=500,
+        )
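
For reference, a minimal sketch of the round trip this endpoint performs. The payload shape follows the custom-skill contract implemented above; the record and page values below are hypothetical:

# Hypothetical skill input: one record carrying pages and their ordinal positions
request_body = {
    "values": [
        {
            "recordId": "1",
            "data": {
                "pages": ["First page text", "Second page text"],
                "chunk_nos": [0, 1],
            },
        }
    ]
}

# Replicating the function's zip step yields the "pages_with_chunks" array
# it returns for this record:
data = request_body["values"][0]["data"]
pages_with_chunks = [
    {"page_text": page, "chunk_no": chunk}
    for page, chunk in zip(data["pages"], data["chunk_nos"])
]
# -> [{"page_text": "First page text", "chunk_no": 0},
#     {"page_text": "Second page text", "chunk_no": 1}]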

code/backend/batch/function_app.py

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,7 @@
 from batch_push_results import bp_batch_push_results
 from batch_start_processing import bp_batch_start_processing
 from get_conversation_response import bp_get_conversation_response
+from combine_pages_chunknos import bp_combine_pages_and_chunknos
 from azure.monitor.opentelemetry import configure_azure_monitor
 
 logging.captureWarnings(True)
@@ -20,3 +21,4 @@
 app.register_functions(bp_batch_push_results)
 app.register_functions(bp_batch_start_processing)
 app.register_functions(bp_get_conversation_response)
+app.register_functions(bp_combine_pages_and_chunknos)

code/backend/batch/utilities/integrated_vectorization/azure_search_indexer.py

Lines changed: 8 additions & 1 deletion
@@ -1,5 +1,5 @@
 import logging
-from azure.search.documents.indexes.models import SearchIndexer, FieldMapping
+from azure.search.documents.indexes.models import SearchIndexer, FieldMapping, FieldMappingFunction
 from azure.search.documents.indexes import SearchIndexerClient
 from ..helpers.env_helper import EnvHelper
 from ..helpers.azure_credential_utils import get_azure_credential
@@ -35,6 +35,13 @@ def create_or_update_indexer(self, indexer_name: str, skillset_name: str):
                 }
             },
             field_mappings=[
+                FieldMapping(
+                    source_field_name="metadata_storage_path",
+                    target_field_name="id",
+                    mapping_function=FieldMappingFunction(
+                        name="base64Encode", parameters={"useHttpServerUtilityUrlTokenEncode": False}
+                    )
+                ),
                 FieldMapping(
                     source_field_name="metadata_storage_path",
                     target_field_name="source",
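
The new id mapping is needed because Azure AI Search document keys may only contain letters, digits, underscores, dashes, and equal signs, while metadata_storage_path is a full blob URL. A minimal sketch of what the base64Encode mapping function is assumed to produce when useHttpServerUtilityUrlTokenEncode is False (URL-safe Base64; verify the exact padding behavior against the field-mappings documentation):

import base64

def encode_document_key(storage_path: str) -> str:
    # Assumed equivalent of the indexer-side base64Encode mapping function:
    # URL-safe Base64 of the UTF-8 path with "=" padding stripped.
    return base64.urlsafe_b64encode(storage_path.encode("utf-8")).decode("ascii").rstrip("=")

# A blob URL becomes a key-safe string, e.g. "aHR0cHM6Ly9..."
print(encode_document_key("https://myaccount.blob.core.windows.net/documents/report.pdf"))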

code/backend/batch/utilities/integrated_vectorization/azure_search_skillset.py

Lines changed: 45 additions & 7 deletions
@@ -6,6 +6,8 @@
     AzureOpenAIEmbeddingSkill,
     OcrSkill,
     MergeSkill,
+    ShaperSkill,
+    WebApiSkill,
     SearchIndexerIndexProjections,
     SearchIndexerIndexProjectionSelector,
     SearchIndexerIndexProjectionsParameters,
@@ -83,12 +85,30 @@ def create_skillset(self):
             inputs=[
                 InputFieldMappingEntry(name="text", source="/document/merged_content"),
             ],
-            outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
+            outputs=[
+                OutputFieldMappingEntry(name="textItems", target_name="pages"),
+                OutputFieldMappingEntry(name="ordinalPositions", target_name="chunk_nos"),
+            ],
+        )
+
+        # Custom WebApi skill to combine pages and chunk numbers into a single structure
+        combine_pages_and_chunk_nos_skill = WebApiSkill(
+            description="Combine pages and chunk numbers together",
+            context="/document",
+            uri=f"{self.env_helper.BACKEND_URL}/api/combine_pages_and_chunknos",
+            http_method="POST",
+            inputs=[
+                InputFieldMappingEntry(name="pages", source="/document/pages"),
+                InputFieldMappingEntry(name="chunk_nos", source="/document/chunk_nos"),
+            ],
+            outputs=[
+                OutputFieldMappingEntry(name="pages_with_chunks", target_name="pages_with_chunks")
+            ]
         )
 
         embedding_skill = AzureOpenAIEmbeddingSkill(
             description="Skill to generate embeddings via Azure OpenAI",
-            context="/document/pages/*",
+            context="/document/pages_with_chunks/*",
             resource_uri=self.env_helper.AZURE_OPENAI_ENDPOINT,
             deployment_id=self.env_helper.AZURE_OPENAI_EMBEDDING_MODEL,
             api_key=(
@@ -104,31 +124,49 @@ def create_skillset(self):
                 )
             ),
             inputs=[
-                InputFieldMappingEntry(name="text", source="/document/pages/*"),
+                InputFieldMappingEntry(name="text", source="/document/pages_with_chunks/*/page_text"),
             ],
             outputs=[
                 OutputFieldMappingEntry(name="embedding", target_name="content_vector")
             ],
         )
 
+        metadata_shaper = ShaperSkill(
+            description="Structure metadata fields into a complex object",
+            context="/document/pages_with_chunks/*",
+            inputs=[
+                InputFieldMappingEntry(name="id", source="/document/id"),
+                InputFieldMappingEntry(name="source", source="/document/metadata_storage_path"),
+                InputFieldMappingEntry(name="title", source="/document/title"),
+                InputFieldMappingEntry(name="chunk", source="/document/pages_with_chunks/*/chunk_no"),
+            ],
+            outputs=[
+                OutputFieldMappingEntry(name="output", target_name="metadata_object")
+            ]
+        )
+
         index_projections = SearchIndexerIndexProjections(
             selectors=[
                 SearchIndexerIndexProjectionSelector(
                     target_index_name=self.env_helper.AZURE_SEARCH_INDEX,
                     parent_key_field_name="id",
-                    source_context="/document/pages/*",
+                    source_context="/document/pages_with_chunks/*",
                     mappings=[
                         InputFieldMappingEntry(
-                            name="content", source="/document/pages/*"
+                            name="content", source="/document/pages_with_chunks/*/page_text"
                         ),
                         InputFieldMappingEntry(
                             name="content_vector",
-                            source="/document/pages/*/content_vector",
+                            source="/document/pages_with_chunks/*/content_vector",
                         ),
                         InputFieldMappingEntry(name="title", source="/document/title"),
                         InputFieldMappingEntry(
                             name="source", source="/document/metadata_storage_path"
                         ),
+                        InputFieldMappingEntry(
+                            name="metadata",
+                            source="/document/pages_with_chunks/*/metadata_object",
+                        )
                     ],
                 ),
             ],
@@ -140,7 +178,7 @@ def create_skillset(self):
         skillset = SearchIndexerSkillset(
             name=skillset_name,
             description="Skillset to chunk documents and generating embeddings",
-            skills=[ocr_skill, merge_skill, split_skill, embedding_skill],
+            skills=[ocr_skill, merge_skill, split_skill, combine_pages_and_chunk_nos_skill, embedding_skill, metadata_shaper],
             index_projections=index_projections,
         )
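
With these changes the skill order matters: the split skill emits pages plus their ordinal positions, the WebApi skill zips them, the embedding skill vectorizes each page_text, and the shaper attaches per-chunk metadata. A purely illustrative sketch of one node under /document/pages_with_chunks/* after all skills have run (every value below is made up):

# Illustrative only: one node of the enrichment tree after the combine,
# embedding, and shaper skills have each executed.
enriched_chunk = {
    "page_text": "chunked text emitted by the split skill",
    "chunk_no": 3,                    # ordinal position from the split skill
    "content_vector": [0.01, -0.02],  # truncated; real vectors match the embedding model's dimensions
    "metadata_object": {              # produced by the ShaperSkill
        "id": "aHR0cHM6Ly9...",       # base64-encoded parent document key
        "source": "https://myaccount.blob.core.windows.net/documents/report.pdf",
        "title": "report.pdf",
        "chunk": 3,
    },
}

The index projection then maps page_text to content, content_vector to content_vector, and metadata_object to the index's metadata field for each chunk document.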

code/create_app.py

Lines changed: 7 additions & 8 deletions
@@ -56,14 +56,17 @@ def get_citations(citation_list):
             else citation["url"]
         )
         title = citation["title"]
-        url = get_markdown_url(metadata["source"], title, container_sas)
+        source = metadata["source"]
+        if "_SAS_TOKEN_PLACEHOLDER_" not in source:
+            source += "_SAS_TOKEN_PLACEHOLDER_"
+        url = get_markdown_url(source, title, container_sas)
         citations_dict["citations"].append(
             {
                 "content": url + "\n\n\n" + citation["content"],
                 "id": metadata["id"],
                 "chunk_id": (
                     re.findall(r"\d+", metadata["chunk_id"])[-1]
-                    if metadata["chunk_id"] is not None
+                    if metadata.get("chunk_id") is not None
                     else metadata["chunk"]
                 ),
                 "title": title,
@@ -196,7 +199,8 @@ def conversation_with_data(conversation: Request, env_helper: EnvHelper):
             }
             if env_helper.is_auth_type_keys()
             else {
-                "type": "system_assigned_managed_identity",
+                "type": "user_assigned_managed_identity",
+                "managed_identity_resource_id": env_helper.MANAGED_IDENTITY_RESOURCE_ID,
             }
         ),
         "endpoint": env_helper.AZURE_SEARCH_SERVICE,
@@ -211,11 +215,6 @@ def conversation_with_data(conversation: Request, env_helper: EnvHelper):
             env_helper.AZURE_SEARCH_CONTENT_VECTOR_COLUMN
         ],
         "title_field": env_helper.AZURE_SEARCH_TITLE_COLUMN or None,
-        "source_field": env_helper.AZURE_SEARCH_SOURCE_COLUMN
-        or None,
-        "text_field": env_helper.AZURE_SEARCH_TEXT_COLUMN or None,
-        "layoutText_field": env_helper.AZURE_SEARCH_LAYOUT_TEXT_COLUMN
-        or None,
         "url_field": env_helper.AZURE_SEARCH_FIELDS_METADATA
         or None,
         "filepath_field": (
