from azure.search.documents.indexes.models import (
    AIServicesAccountIdentity,
    AzureOpenAIEmbeddingSkill,
-    BlobIndexerImageAction,
+    DocumentIntelligenceLayoutSkill,
+    DocumentIntelligenceLayoutSkillChunkingProperties,
    IndexingParameters,
    IndexingParametersConfiguration,
    IndexProjectionMode,
    InputFieldMappingEntry,
+    MergeSkill,
    OutputFieldMappingEntry,
    SearchIndexer,
    SearchIndexerDataContainer,
@@ -75,11 +77,10 @@ def __init__(
        self.data_source_name = f"{prefix}-blob"
        self.use_multimodal = use_multimodal and image_embeddings is not None

-    async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset:
+    async def create_skillset(self, index_name: str) -> SearchIndexerSkillset:
        """
        Create a skillset for the indexer to chunk documents and generate embeddings
        """
-
        split_skill = SplitSkill(
            name="split-skill",
            description="Split skill to chunk documents",
@@ -107,6 +108,83 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
            outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
        )

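+        # Project each chunk in /document/pages/* to its own search document; the parent document itself is not indexed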
+        index_projection = SearchIndexerIndexProjection(
+            selectors=[
+                SearchIndexerIndexProjectionSelector(
+                    target_index_name=index_name,
+                    parent_key_field_name="parent_id",
+                    source_context="/document/pages/*",
+                    mappings=[
+                        InputFieldMappingEntry(name="content", source="/document/pages/*"),
+                        InputFieldMappingEntry(name="sourcepage", source="/document/metadata_storage_name"),
+                        InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
+                        InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"),
+                        InputFieldMappingEntry(
+                            name=self.search_field_name_embedding, source="/document/pages/*/vector"
+                        ),
+                    ],
+                ),
+            ],
+            parameters=SearchIndexerIndexProjectionsParameters(
+                projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
+            ),
+        )
+
+        skillset = SearchIndexerSkillset(
+            name=self.skillset_name,
+            description="Skillset to chunk documents and generate embeddings",
+            skills=[split_skill, embedding_skill],
+            index_projection=index_projection,
+        )
+
+        return skillset
+
+    async def create_multimodal_skillset(self, index_name: str) -> SearchIndexerSkillset:
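+        """
+        Create a skillset that uses the Document Intelligence layout skill to extract text and images,
+        chunks the text, and generates both text and image embeddings
+        """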
+        document_layout_skill = DocumentIntelligenceLayoutSkill(
+            description="Layout skill to read documents",
+            context="/document",
+            output_mode="oneToMany",
+            output_format="text",
+            markdown_header_depth="",  # Necessary so that the SDK doesn't send a header depth
+            extraction_options=["images", "locationMetadata"],
+            chunking_properties=DocumentIntelligenceLayoutSkillChunkingProperties(
+                unit="characters",
+                maximum_length=2000,
+                overlap_length=200,
+            ),
+            inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
+            outputs=[
+                OutputFieldMappingEntry(name="text_sections", target_name="text_sections"),
+                OutputFieldMappingEntry(name="normalized_images", target_name="normalized_images"),
+            ],
+        )
+
+        split_skill = SplitSkill(
+            description="Split skill to chunk pages of documents",
+            text_split_mode="pages",
+            context="/document/text_sections/*",
+            maximum_page_length=2000,
+            page_overlap_length=500,
+            inputs=[
+                InputFieldMappingEntry(name="text", source="/document/text_sections/*/content"),
+            ],
+            outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
+        )
+
+        embedding_skill = AzureOpenAIEmbeddingSkill(
+            name="embedding-skill",
+            description="Skill to generate embeddings via Azure OpenAI",
+            context="/document/text_sections/*/pages/*",
+            resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
+            deployment_name=self.embeddings.open_ai_deployment,
+            model_name=self.embeddings.open_ai_model_name,
+            dimensions=self.embeddings.open_ai_dimensions,
+            inputs=[
+                InputFieldMappingEntry(name="text", source="/document/text_sections/*/pages/*"),
+            ],
+            outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
+        )
+
        vision_embedding_skill = VisionVectorizeSkill(
            name="vision-embedding-skill",
            description="Skill to generate image embeddings via Azure AI Vision",
@@ -115,6 +193,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
            inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
            outputs=[OutputFieldMappingEntry(name="vector", target_name="image_vector")],
        )
+
        vision_embedding_shaper_skill = ShaperSkill(
            name="vision-embedding-shaper-skill",
            description="Shaper skill to ensure image embeddings are in the correct format",
@@ -123,70 +202,80 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
                InputFieldMappingEntry(name="embedding", source="/document/normalized_images/*/image_vector"),
                InputFieldMappingEntry(
                    name="url",
-                    # source=f'="{self.blob_manager.endpoint}/images/"+$(/document/normalized_images/*/imagePath)'
-                    source="=$(/document/normalized_images/*/imagePath)",
+                    source=f'="{self.blob_manager.endpoint}/images/"+$(/document/normalized_images/*/imagePath)',
                ),
            ],
            outputs=[OutputFieldMappingEntry(name="output", target_name="images")],
        )

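+        # Build a per-chunk citation by concatenating the blob name, "#page=", and the chunk's page number (e.g. "mydoc.pdf#page=2")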
+        merge_skill = MergeSkill(
+            name="merge-skill",
+            description="Merge skill to create source page",
+            insert_post_tag="",
+            insert_pre_tag="",
+            context="/document/text_sections/*/locationMetadata",
+            inputs=[
+                InputFieldMappingEntry(
+                    name="itemsToInsert",
+                    source='=[$(/document/metadata_storage_name), "#page=", $(/document/text_sections/*/locationMetadata/pageNumber)]',
+                )
+            ],
+            outputs=[OutputFieldMappingEntry(name="mergedText", target_name="citation")],
+        )
+
+        indexer_skills = [
+            document_layout_skill,
+            split_skill,
+            embedding_skill,
+            vision_embedding_skill,
+            vision_embedding_shaper_skill,
+            merge_skill,
+        ]
+
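+        # Project each text chunk (/document/text_sections/*/pages/*) to its own search document, mapping in its citation, embedding, and the extracted images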
        index_projection = SearchIndexerIndexProjection(
            selectors=[
                SearchIndexerIndexProjectionSelector(
                    target_index_name=index_name,
                    parent_key_field_name="parent_id",
-                    source_context="/document/pages/*",
+                    source_context="/document/text_sections/*/pages/*",
                    mappings=[
-                        InputFieldMappingEntry(name="content", source="/document/pages/*"),
-                        InputFieldMappingEntry(name="sourcepage", source="/document/metadata_storage_name"),
+                        InputFieldMappingEntry(name="content", source="/document/text_sections/*/pages/*"),
                        InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
+                        InputFieldMappingEntry(
+                            name="sourcepage", source="/document/text_sections/*/locationMetadata/citation"
+                        ),
                        InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"),
                        InputFieldMappingEntry(
-                            name=self.search_field_name_embedding, source="/document/pages/*/vector"
+                            name=self.search_field_name_embedding, source="/document/text_sections/*/pages/*/vector"
                        ),
                        InputFieldMappingEntry(name="images", source="/document/normalized_images/*/images"),
                    ],
-                ),
+                )
            ],
            parameters=SearchIndexerIndexProjectionsParameters(
                projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
            ),
        )

-        indexer_skills = [split_skill, embedding_skill]
-        if self.use_multimodal:
-            indexer_skills.extend([vision_embedding_skill, vision_embedding_shaper_skill])
-        extra_params = {}
-        if self.use_multimodal:
-            extra_params = {
-                "cognitive_services_account": AIServicesAccountIdentity(subdomain_url=self.image_embeddings.endpoint),
-                "knowledge_store": SearchIndexerKnowledgeStore(
-                    storage_connection_string=self.blob_manager.get_managedidentity_connectionstring(),
-                    projections=[
-                        SearchIndexerKnowledgeStoreProjection(
-                            files=[
-                                SearchIndexerKnowledgeStoreFileProjectionSelector(
-                                    storage_container=self.blob_manager.image_container,
-                                    source="/document/normalized_images/*",
-                                )
-                            ]
-                        )
-                    ],
-                ),
-            }
-
-        # We still need to map the images onto url in the images complex field type
-        # something about key path
-        # id = "feb5e192afb6_aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L05vcnRod2luZF9IZWFsdGhfUGx1c19CZW5lZml0c19EZXRhaWxzLnBkZg2_pages_65",
-        # parent_id = is the folder name
-        # https://stxxk4qzq3tahic2.blob.core.windows.net/images/aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L0JlbmVmaXRfT3B0aW9ucy5wZGY1/normalized_images_1.jpg
-
        skillset = SearchIndexerSkillset(
            name=self.skillset_name,
            description="Skillset to chunk documents and generate embeddings",
            skills=indexer_skills,
            index_projection=index_projection,
-            **extra_params,
+            cognitive_services_account=AIServicesAccountIdentity(subdomain_url=self.image_embeddings.endpoint),
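+            # Write each extracted image to the image container via the knowledge store, using the blob manager's managed identity connection string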
+            knowledge_store=SearchIndexerKnowledgeStore(
+                storage_connection_string=self.blob_manager.get_managedidentity_connectionstring(),
+                projections=[
+                    SearchIndexerKnowledgeStoreProjection(
+                        files=[
+                            SearchIndexerKnowledgeStoreFileProjectionSelector(
+                                storage_container=self.blob_manager.image_container,
+                                source="/document/normalized_images/*",
+                            )
+                        ]
+                    )
+                ],
+            ),
        )

        return skillset
@@ -217,8 +306,11 @@ async def setup(self):

        await ds_client.create_or_update_data_source_connection(data_source_connection)

-        embedding_skillset = await self.create_embedding_skill(self.search_info.index_name)
-        await ds_client.create_or_update_skillset(embedding_skillset)
+        if self.use_multimodal:
+            skillset = await self.create_multimodal_skillset(self.search_info.index_name)
+        else:
+            skillset = await self.create_skillset(self.search_info.index_name)
+        await ds_client.create_or_update_skillset(skillset)
        await ds_client.close()

    async def run(self):
@@ -237,15 +329,14 @@ async def run(self):
        elif self.document_action == DocumentAction.RemoveAll:
            await self.blob_manager.remove_blob()

-        # Create an indexer
        extra_params = {}
        if self.use_multimodal:
            extra_params = {
                "parameters": IndexingParameters(
                    configuration=IndexingParametersConfiguration(
-                        query_timeout=None,  # Current bug in AI Search SDK
-                        image_action=BlobIndexerImageAction.GENERATE_NORMALIZED_IMAGES,
-                    )
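+                        # allow_skillset_to_read_file_data exposes the original blob to the skillset as /document/file_data (consumed by the layout skill)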
+                        query_timeout=None, allow_skillset_to_read_file_data=True  # query_timeout=None works around a current bug in the AI Search SDK
+                    ),
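+                    # max_failed_items=-1: keep the indexer running even if individual documents fail to process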
+                    max_failed_items=-1,
                )
            }