5
5
NativeBlobSoftDeleteDeletionDetectionPolicy ,
6
6
)
7
7
from azure .search .documents .indexes .models import (
8
- AIServicesAccountIdentity ,
9
8
AzureOpenAIEmbeddingSkill ,
10
- DocumentIntelligenceLayoutSkill ,
11
- DocumentIntelligenceLayoutSkillChunkingProperties ,
12
- IndexingParameters ,
13
- IndexingParametersConfiguration ,
14
9
IndexProjectionMode ,
15
10
InputFieldMappingEntry ,
16
- MergeSkill ,
17
11
OutputFieldMappingEntry ,
18
12
SearchIndexer ,
19
13
SearchIndexerDataContainer ,
22
16
SearchIndexerIndexProjection ,
23
17
SearchIndexerIndexProjectionSelector ,
24
18
SearchIndexerIndexProjectionsParameters ,
25
- SearchIndexerKnowledgeStore ,
26
- SearchIndexerKnowledgeStoreFileProjectionSelector ,
27
- SearchIndexerKnowledgeStoreProjection ,
28
19
SearchIndexerSkillset ,
29
- ShaperSkill ,
30
20
SplitSkill ,
31
- VisionVectorizeSkill ,
32
21
)
33
22
34
23
from .blobmanager import BlobManager
35
- from .embeddings import AzureOpenAIEmbeddingService , ImageEmbeddings
24
+ from .embeddings import AzureOpenAIEmbeddingService
36
25
from .listfilestrategy import ListFileStrategy
37
26
from .searchmanager import SearchManager
38
27
from .strategy import DocumentAction , SearchInfo , Strategy
@@ -53,20 +42,20 @@ def __init__(
53
42
embeddings : AzureOpenAIEmbeddingService ,
54
43
search_field_name_embedding : str ,
55
44
subscription_id : str ,
45
+ search_service_user_assigned_id : str ,
56
46
document_action : DocumentAction = DocumentAction .Add ,
57
47
search_analyzer_name : Optional [str ] = None ,
58
48
use_acls : bool = False ,
59
49
category : Optional [str ] = None ,
60
- use_multimodal : bool = False ,
61
- image_embeddings : Optional [ImageEmbeddings ] = None ,
62
50
):
51
+
63
52
self .list_file_strategy = list_file_strategy
64
53
self .blob_manager = blob_manager
65
54
self .document_action = document_action
66
55
self .embeddings = embeddings
67
- self .image_embeddings = image_embeddings
68
56
self .search_field_name_embedding = search_field_name_embedding
69
57
self .subscription_id = subscription_id
58
+ self .search_user_assigned_identity = search_service_user_assigned_id
70
59
self .search_analyzer_name = search_analyzer_name
71
60
self .use_acls = use_acls
72
61
self .category = category
@@ -75,12 +64,12 @@ def __init__(
75
64
self .skillset_name = f"{ prefix } -skillset"
76
65
self .indexer_name = f"{ prefix } -indexer"
77
66
self .data_source_name = f"{ prefix } -blob"
78
- self .use_multimodal = use_multimodal and image_embeddings is not None
79
67
80
- async def create_skillset (self , index_name : str ) -> SearchIndexerSkillset :
68
+ async def create_embedding_skill (self , index_name : str ) -> SearchIndexerSkillset :
81
69
"""
82
70
Create a skillset for the indexer to chunk documents and generate embeddings
83
71
"""
72
+
84
73
split_skill = SplitSkill (
85
74
name = "split-skill" ,
86
75
description = "Split skill to chunk documents" ,
@@ -139,152 +128,6 @@ async def create_skillset(self, index_name: str) -> SearchIndexerSkillset:
139
128
140
129
return skillset
141
130
142
- async def create_multimodal_skillset (self , index_name : str ) -> SearchIndexerSkillset :
143
- if self .image_embeddings is None :
144
- raise ValueError ("Image embeddings client must be provided for multimodal skillset creation." )
145
- if self .blob_manager .image_container is None :
146
- raise ValueError ("Blob manager must have an image container set for multimodal skillset creation." )
147
-
148
- document_layout_skill = DocumentIntelligenceLayoutSkill (
149
- description = "Layout skill to read documents" ,
150
- context = "/document" ,
151
- output_mode = "oneToMany" ,
152
- output_format = "text" ,
153
- markdown_header_depth = "" , # Necessary so that SDK doesnt send a header depth
154
- extraction_options = ["images" , "locationMetadata" ],
155
- chunking_properties = DocumentIntelligenceLayoutSkillChunkingProperties (
156
- unit = "characters" ,
157
- maximum_length = 2000 ,
158
- overlap_length = 200 ,
159
- ),
160
- inputs = [InputFieldMappingEntry (name = "file_data" , source = "/document/file_data" )],
161
- outputs = [
162
- OutputFieldMappingEntry (name = "text_sections" , target_name = "text_sections" ),
163
- OutputFieldMappingEntry (name = "normalized_images" , target_name = "normalized_images" ),
164
- ],
165
- )
166
-
167
- split_skill = SplitSkill (
168
- description = "Split skill to chunk pages of documents" ,
169
- text_split_mode = "pages" ,
170
- context = "/document/text_sections/*" ,
171
- maximum_page_length = 2000 ,
172
- page_overlap_length = 500 ,
173
- inputs = [
174
- InputFieldMappingEntry (name = "text" , source = "/document/text_sections/*/content" ),
175
- ],
176
- outputs = [OutputFieldMappingEntry (name = "textItems" , target_name = "pages" )],
177
- )
178
-
179
- embedding_skill = AzureOpenAIEmbeddingSkill (
180
- name = "embedding-skill" ,
181
- description = "Skill to generate embeddings via Azure OpenAI" ,
182
- context = "/document/text_sections/*/pages/*" ,
183
- resource_url = f"https://{ self .embeddings .open_ai_service } .openai.azure.com" ,
184
- deployment_name = self .embeddings .open_ai_deployment ,
185
- model_name = self .embeddings .open_ai_model_name ,
186
- dimensions = self .embeddings .open_ai_dimensions ,
187
- inputs = [
188
- InputFieldMappingEntry (name = "text" , source = "/document/text_sections/*/pages/*" ),
189
- ],
190
- outputs = [OutputFieldMappingEntry (name = "embedding" , target_name = "vector" )],
191
- )
192
-
193
- vision_embedding_skill = VisionVectorizeSkill (
194
- name = "vision-embedding-skill" ,
195
- description = "Skill to generate image embeddings via Azure AI Vision" ,
196
- context = "/document/normalized_images/*" ,
197
- model_version = "2023-04-15" ,
198
- inputs = [InputFieldMappingEntry (name = "image" , source = "/document/normalized_images/*" )],
199
- outputs = [OutputFieldMappingEntry (name = "vector" , target_name = "image_vector" )],
200
- )
201
-
202
- vision_embedding_shaper_skill = ShaperSkill (
203
- name = "vision-embedding-shaper-skill" ,
204
- description = "Shaper skill to ensure image embeddings are in the correct format" ,
205
- context = "/document/normalized_images/*" ,
206
- inputs = [
207
- InputFieldMappingEntry (name = "embedding" , source = "/document/normalized_images/*/image_vector" ),
208
- InputFieldMappingEntry (
209
- name = "url" ,
210
- source = f'="{ self .blob_manager .endpoint } /images/"+$(/document/normalized_images/*/imagePath)' ,
211
- ),
212
- ],
213
- outputs = [OutputFieldMappingEntry (name = "output" , target_name = "images" )],
214
- )
215
-
216
- merge_skill = MergeSkill (
217
- name = "merge-skill" ,
218
- description = "Merge skill to create source page" ,
219
- insert_post_tag = "" ,
220
- insert_pre_tag = "" ,
221
- context = "/document/text_sections/*/locationMetadata" ,
222
- inputs = [
223
- InputFieldMappingEntry (
224
- name = "itemsToInsert" ,
225
- source = '=[$(/document/metadata_storage_name), "#page=", $(/document/text_sections/*/locationMetadata/pageNumber)]' ,
226
- )
227
- ],
228
- outputs = [OutputFieldMappingEntry (name = "mergedText" , target_name = "citation" )],
229
- )
230
-
231
- indexer_skills = [
232
- document_layout_skill ,
233
- split_skill ,
234
- embedding_skill ,
235
- vision_embedding_skill ,
236
- vision_embedding_shaper_skill ,
237
- merge_skill ,
238
- ]
239
-
240
- index_projection = SearchIndexerIndexProjection (
241
- selectors = [
242
- SearchIndexerIndexProjectionSelector (
243
- target_index_name = index_name ,
244
- parent_key_field_name = "parent_id" ,
245
- source_context = "/document/text_sections/*/pages/*" ,
246
- mappings = [
247
- InputFieldMappingEntry (name = "content" , source = "/document/text_sections/*/pages/*" ),
248
- InputFieldMappingEntry (name = "sourcefile" , source = "/document/metadata_storage_name" ),
249
- InputFieldMappingEntry (
250
- name = "sourcepage" , source = "/document/text_sections/*/locationMetadata/citation"
251
- ),
252
- InputFieldMappingEntry (name = "storageUrl" , source = "/document/metadata_storage_path" ),
253
- InputFieldMappingEntry (
254
- name = self .search_field_name_embedding , source = "/document/text_sections/*/pages/*/vector"
255
- ),
256
- InputFieldMappingEntry (name = "images" , source = "/document/normalized_images/*/images" ),
257
- ],
258
- )
259
- ],
260
- parameters = SearchIndexerIndexProjectionsParameters (
261
- projection_mode = IndexProjectionMode .SKIP_INDEXING_PARENT_DOCUMENTS
262
- ),
263
- )
264
-
265
- skillset = SearchIndexerSkillset (
266
- name = self .skillset_name ,
267
- description = "Skillset to chunk documents and generate embeddings" ,
268
- skills = indexer_skills ,
269
- index_projection = index_projection ,
270
- cognitive_services_account = AIServicesAccountIdentity (subdomain_url = self .image_embeddings .endpoint ),
271
- knowledge_store = SearchIndexerKnowledgeStore (
272
- storage_connection_string = self .blob_manager .get_managedidentity_connectionstring (),
273
- projections = [
274
- SearchIndexerKnowledgeStoreProjection (
275
- files = [
276
- SearchIndexerKnowledgeStoreFileProjectionSelector (
277
- storage_container = self .blob_manager .image_container ,
278
- source = "/document/normalized_images/*" ,
279
- )
280
- ]
281
- )
282
- ],
283
- ),
284
- )
285
-
286
- return skillset
287
-
288
131
async def setup (self ):
289
132
logger .info ("Setting up search index using integrated vectorization..." )
290
133
search_manager = SearchManager (
@@ -294,7 +137,7 @@ async def setup(self):
294
137
use_int_vectorization = True ,
295
138
embeddings = self .embeddings ,
296
139
field_name_embedding = self .search_field_name_embedding ,
297
- search_images = self . use_multimodal ,
140
+ search_images = False ,
298
141
)
299
142
300
143
await search_manager .create_index ()
@@ -311,11 +154,8 @@ async def setup(self):
311
154
312
155
await ds_client .create_or_update_data_source_connection (data_source_connection )
313
156
314
- if self .use_multimodal :
315
- skillset = await self .create_multimodal_skillset (self .search_info .index_name )
316
- else :
317
- skillset = await self .create_skillset (self .search_info .index_name )
318
- await ds_client .create_or_update_skillset (skillset )
157
+ embedding_skillset = await self .create_embedding_skill (self .search_info .index_name )
158
+ await ds_client .create_or_update_skillset (embedding_skillset )
319
159
await ds_client .close ()
320
160
321
161
async def run (self ):
@@ -334,22 +174,13 @@ async def run(self):
334
174
elif self .document_action == DocumentAction .RemoveAll :
335
175
await self .blob_manager .remove_blob ()
336
176
337
- indexing_parameters = None
338
- if self .use_multimodal :
339
- indexing_parameters = IndexingParameters (
340
- configuration = IndexingParametersConfiguration (
341
- query_timeout = None , allow_skillset_to_read_file_data = True # type: ignore
342
- ),
343
- max_failed_items = - 1 ,
344
- )
345
-
177
+ # Create an indexer
346
178
indexer = SearchIndexer (
347
179
name = self .indexer_name ,
348
180
description = "Indexer to index documents and generate embeddings" ,
349
181
skillset_name = self .skillset_name ,
350
182
target_index_name = self .search_info .index_name ,
351
183
data_source_name = self .data_source_name ,
352
- parameters = indexing_parameters , # Properly pass the parameters
353
184
)
354
185
355
186
indexer_client = self .search_info .create_search_indexer_client ()
0 commit comments