Skip to content

Commit e074113

Browse files
committed
Add latest int vect changes
1 parent 13e85ee commit e074113

File tree

4 files changed

+142
-56
lines changed

4 files changed

+142
-56
lines changed

app/backend/app.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -717,7 +717,8 @@ async def setup_clients():
717717
async def close_clients():
718718
await current_app.config[CONFIG_SEARCH_CLIENT].close()
719719
await current_app.config[CONFIG_GLOBAL_BLOB_MANAGER].close_clients()
720-
await current_app.config[CONFIG_USER_BLOB_MANAGER].close_clients()
720+
if user_blob_manager := current_app.config.get(CONFIG_USER_BLOB_MANAGER):
721+
await user_blob_manager.close_clients()
721722

722723

723724
def create_app():

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 136 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@
77
from azure.search.documents.indexes.models import (
88
AIServicesAccountIdentity,
99
AzureOpenAIEmbeddingSkill,
10-
BlobIndexerImageAction,
10+
DocumentIntelligenceLayoutSkill,
11+
DocumentIntelligenceLayoutSkillChunkingProperties,
1112
IndexingParameters,
1213
IndexingParametersConfiguration,
1314
IndexProjectionMode,
1415
InputFieldMappingEntry,
16+
MergeSkill,
1517
OutputFieldMappingEntry,
1618
SearchIndexer,
1719
SearchIndexerDataContainer,
@@ -75,11 +77,10 @@ def __init__(
7577
self.data_source_name = f"{prefix}-blob"
7678
self.use_multimodal = use_multimodal and image_embeddings is not None
7779

78-
async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset:
80+
async def create_skillset(self, index_name: str) -> SearchIndexerSkillset:
7981
"""
8082
Create a skillset for the indexer to chunk documents and generate embeddings
8183
"""
82-
8384
split_skill = SplitSkill(
8485
name="split-skill",
8586
description="Split skill to chunk documents",
@@ -107,6 +108,83 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
107108
outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
108109
)
109110

111+
index_projection = SearchIndexerIndexProjection(
112+
selectors=[
113+
SearchIndexerIndexProjectionSelector(
114+
target_index_name=index_name,
115+
parent_key_field_name="parent_id",
116+
source_context="/document/pages/*",
117+
mappings=[
118+
InputFieldMappingEntry(name="content", source="/document/pages/*"),
119+
InputFieldMappingEntry(name="sourcepage", source="/document/metadata_storage_name"),
120+
InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
121+
InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"),
122+
InputFieldMappingEntry(
123+
name=self.search_field_name_embedding, source="/document/pages/*/vector"
124+
),
125+
],
126+
),
127+
],
128+
parameters=SearchIndexerIndexProjectionsParameters(
129+
projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
130+
),
131+
)
132+
133+
skillset = SearchIndexerSkillset(
134+
name=self.skillset_name,
135+
description="Skillset to chunk documents and generate embeddings",
136+
skills=[split_skill, embedding_skill],
137+
index_projection=index_projection,
138+
)
139+
140+
return skillset
141+
142+
async def create_multimodal_skillset(self, index_name: str) -> SearchIndexerSkillset:
143+
document_layout_skill = DocumentIntelligenceLayoutSkill(
144+
description="Layout skill to read documents",
145+
context="/document",
146+
output_mode="oneToMany",
147+
output_format="text",
148+
markdown_header_depth="", # Necessary so that the SDK doesn't send a header depth
149+
extraction_options=["images", "locationMetadata"],
150+
chunking_properties=DocumentIntelligenceLayoutSkillChunkingProperties(
151+
unit="characters",
152+
maximum_length=2000,
153+
overlap_length=200,
154+
),
155+
inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
156+
outputs=[
157+
OutputFieldMappingEntry(name="text_sections", target_name="text_sections"),
158+
OutputFieldMappingEntry(name="normalized_images", target_name="normalized_images"),
159+
],
160+
)
161+
162+
split_skill = SplitSkill(
163+
description="Split skill to chunk pages of documents",
164+
text_split_mode="pages",
165+
context="/document/text_sections/*",
166+
maximum_page_length=2000,
167+
page_overlap_length=500,
168+
inputs=[
169+
InputFieldMappingEntry(name="text", source="/document/text_sections/*/content"),
170+
],
171+
outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
172+
)
173+
174+
embedding_skill = AzureOpenAIEmbeddingSkill(
175+
name="embedding-skill",
176+
description="Skill to generate embeddings via Azure OpenAI",
177+
context="/document/text_sections/*/pages/*",
178+
resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
179+
deployment_name=self.embeddings.open_ai_deployment,
180+
model_name=self.embeddings.open_ai_model_name,
181+
dimensions=self.embeddings.open_ai_dimensions,
182+
inputs=[
183+
InputFieldMappingEntry(name="text", source="/document/text_sections/*/pages/*"),
184+
],
185+
outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
186+
)
187+
110188
vision_embedding_skill = VisionVectorizeSkill(
111189
name="vision-embedding-skill",
112190
description="Skill to generate image embeddings via Azure AI Vision",
@@ -115,6 +193,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
115193
inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
116194
outputs=[OutputFieldMappingEntry(name="vector", target_name="image_vector")],
117195
)
196+
118197
vision_embedding_shaper_skill = ShaperSkill(
119198
name="vision-embedding-shaper-skill",
120199
description="Shaper skill to ensure image embeddings are in the correct format",
@@ -123,70 +202,80 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
123202
InputFieldMappingEntry(name="embedding", source="/document/normalized_images/*/image_vector"),
124203
InputFieldMappingEntry(
125204
name="url",
126-
# source=f'="{self.blob_manager.endpoint}/images/"+$(/document/normalized_images/*/imagePath)'
127-
source="=$(/document/normalized_images/*/imagePath)",
205+
source=f'="{self.blob_manager.endpoint}/images/"+$(/document/normalized_images/*/imagePath)',
128206
),
129207
],
130208
outputs=[OutputFieldMappingEntry(name="output", target_name="images")],
131209
)
132210

211+
merge_skill = MergeSkill(
212+
name="merge-skill",
213+
description="Merge skill to create source page",
214+
insert_post_tag="",
215+
insert_pre_tag="",
216+
context="/document/text_sections/*/locationMetadata",
217+
inputs=[
218+
InputFieldMappingEntry(
219+
name="itemsToInsert",
220+
source='=[$(/document/metadata_storage_name), "#page=", $(/document/text_sections/*/locationMetadata/pageNumber)]',
221+
)
222+
],
223+
outputs=[OutputFieldMappingEntry(name="mergedText", target_name="citation")],
224+
)
225+
226+
indexer_skills = [
227+
document_layout_skill,
228+
split_skill,
229+
embedding_skill,
230+
vision_embedding_skill,
231+
vision_embedding_shaper_skill,
232+
merge_skill,
233+
]
234+
133235
index_projection = SearchIndexerIndexProjection(
134236
selectors=[
135237
SearchIndexerIndexProjectionSelector(
136238
target_index_name=index_name,
137239
parent_key_field_name="parent_id",
138-
source_context="/document/pages/*",
240+
source_context="/document/text_sections/*/pages/*",
139241
mappings=[
140-
InputFieldMappingEntry(name="content", source="/document/pages/*"),
141-
InputFieldMappingEntry(name="sourcepage", source="/document/metadata_storage_name"),
242+
InputFieldMappingEntry(name="content", source="/document/text_sections/*/pages/*"),
142243
InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
244+
InputFieldMappingEntry(
245+
name="sourcepage", source="/document/text_sections/*/locationMetadata/citation"
246+
),
143247
InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"),
144248
InputFieldMappingEntry(
145-
name=self.search_field_name_embedding, source="/document/pages/*/vector"
249+
name=self.search_field_name_embedding, source="/document/text_sections/*/pages/*/vector"
146250
),
147251
InputFieldMappingEntry(name="images", source="/document/normalized_images/*/images"),
148252
],
149-
),
253+
)
150254
],
151255
parameters=SearchIndexerIndexProjectionsParameters(
152256
projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
153257
),
154258
)
155259

156-
indexer_skills = [split_skill, embedding_skill]
157-
if self.use_multimodal:
158-
indexer_skills.extend([vision_embedding_skill, vision_embedding_shaper_skill])
159-
extra_params = {}
160-
if self.use_multimodal:
161-
extra_params = {
162-
"cognitive_services_account": AIServicesAccountIdentity(subdomain_url=self.image_embeddings.endpoint),
163-
"knowledge_store": SearchIndexerKnowledgeStore(
164-
storage_connection_string=self.blob_manager.get_managedidentity_connectionstring(),
165-
projections=[
166-
SearchIndexerKnowledgeStoreProjection(
167-
files=[
168-
SearchIndexerKnowledgeStoreFileProjectionSelector(
169-
storage_container=self.blob_manager.image_container,
170-
source="/document/normalized_images/*",
171-
)
172-
]
173-
)
174-
],
175-
),
176-
}
177-
178-
# We still need to map the images onto url in the images complex field type
179-
# something about key path
180-
# id = "feb5e192afb6_aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L05vcnRod2luZF9IZWFsdGhfUGx1c19CZW5lZml0c19EZXRhaWxzLnBkZg2_pages_65",
181-
# parent_id = is the folder name
182-
# https://stxxk4qzq3tahic2.blob.core.windows.net/images/aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L0JlbmVmaXRfT3B0aW9ucy5wZGY1/normalized_images_1.jpg
183-
184260
skillset = SearchIndexerSkillset(
185261
name=self.skillset_name,
186262
description="Skillset to chunk documents and generate embeddings",
187263
skills=indexer_skills,
188264
index_projection=index_projection,
189-
**extra_params,
265+
cognitive_services_account=AIServicesAccountIdentity(subdomain_url=self.image_embeddings.endpoint),
266+
knowledge_store=SearchIndexerKnowledgeStore(
267+
storage_connection_string=self.blob_manager.get_managedidentity_connectionstring(),
268+
projections=[
269+
SearchIndexerKnowledgeStoreProjection(
270+
files=[
271+
SearchIndexerKnowledgeStoreFileProjectionSelector(
272+
storage_container=self.blob_manager.image_container,
273+
source="/document/normalized_images/*",
274+
)
275+
]
276+
)
277+
],
278+
),
190279
)
191280

192281
return skillset
@@ -217,8 +306,11 @@ async def setup(self):
217306

218307
await ds_client.create_or_update_data_source_connection(data_source_connection)
219308

220-
embedding_skillset = await self.create_embedding_skill(self.search_info.index_name)
221-
await ds_client.create_or_update_skillset(embedding_skillset)
309+
if self.use_multimodal:
310+
skillset = await self.create_multimodal_skillset(self.search_info.index_name)
311+
else:
312+
skillset = await self.create_skillset(self.search_info.index_name)
313+
await ds_client.create_or_update_skillset(skillset)
222314
await ds_client.close()
223315

224316
async def run(self):
@@ -237,15 +329,14 @@ async def run(self):
237329
elif self.document_action == DocumentAction.RemoveAll:
238330
await self.blob_manager.remove_blob()
239331

240-
# Create an indexer
241332
extra_params = {}
242333
if self.use_multimodal:
243334
extra_params = {
244335
"parameters": IndexingParameters(
245336
configuration=IndexingParametersConfiguration(
246-
query_timeout=None, # Current bug in AI Search SDK
247-
image_action=BlobIndexerImageAction.GENERATE_NORMALIZED_IMAGES,
248-
)
337+
query_timeout=None, allow_skillset_to_read_file_data=True # query_timeout=None works around a current bug in the AI Search SDK
338+
),
339+
max_failed_items=-1,
249340
)
250341
}
251342

0 commit comments

Comments
 (0)