6
6
)
7
7
from azure .search .documents .indexes .models import (
8
8
AzureOpenAIEmbeddingSkill ,
9
- FieldMapping ,
10
9
IndexProjectionMode ,
11
10
InputFieldMappingEntry ,
12
11
OutputFieldMappingEntry ,
@@ -63,15 +62,18 @@ def __init__(
63
62
self .use_acls = use_acls
64
63
self .category = category
65
64
self .search_info = search_info
65
+ prefix = f"{ self .search_info .index_name } -{ self .search_field_name_embedding } "
66
+ self .skillset_name = f"{ prefix } -skillset"
67
+ self .indexer_name = f"{ prefix } -indexer"
68
+ self .data_source_name = f"{ prefix } -blob"
66
69
67
70
async def create_embedding_skill (self , index_name : str ) -> SearchIndexerSkillset :
68
71
"""
69
72
Create a skillset for the indexer to chunk documents and generate embeddings
70
73
"""
71
- skillset_name = f"{ index_name } -skillset"
72
74
73
75
split_skill = SplitSkill (
74
- name = f" { index_name } - split-skill" ,
76
+ name = " split-skill" ,
75
77
description = "Split skill to chunk documents" ,
76
78
text_split_mode = "pages" ,
77
79
context = "/document" ,
@@ -84,7 +86,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
84
86
)
85
87
86
88
embedding_skill = AzureOpenAIEmbeddingSkill (
87
- name = f" { index_name } - embedding-skill" ,
89
+ name = " embedding-skill" ,
88
90
description = "Skill to generate embeddings via Azure OpenAI" ,
89
91
context = "/document/pages/*" ,
90
92
resource_url = f"https://{ self .embeddings .open_ai_service } .openai.azure.com" ,
@@ -94,7 +96,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
94
96
inputs = [
95
97
InputFieldMappingEntry (name = "text" , source = "/document/pages/*" ),
96
98
],
97
- outputs = [OutputFieldMappingEntry (name = self . search_field_name_embedding , target_name = "vector" )],
99
+ outputs = [OutputFieldMappingEntry (name = "embedding" , target_name = "vector" )],
98
100
)
99
101
100
102
index_projection = SearchIndexerIndexProjection (
@@ -106,6 +108,8 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
106
108
mappings = [
107
109
InputFieldMappingEntry (name = "content" , source = "/document/pages/*" ),
108
110
InputFieldMappingEntry (name = "sourcepage" , source = "/document/metadata_storage_name" ),
111
+ InputFieldMappingEntry (name = "sourcefile" , source = "/document/metadata_storage_name" ),
112
+ InputFieldMappingEntry (name = "storageUrl" , source = "/document/metadata_storage_path" ),
109
113
InputFieldMappingEntry (
110
114
name = self .search_field_name_embedding , source = "/document/pages/*/vector"
111
115
),
@@ -118,7 +122,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
118
122
)
119
123
120
124
skillset = SearchIndexerSkillset (
121
- name = skillset_name ,
125
+ name = self . skillset_name ,
122
126
description = "Skillset to chunk documents and generate embeddings" ,
123
127
skills = [split_skill , embedding_skill ],
124
128
index_projection = index_projection ,
@@ -144,7 +148,7 @@ async def setup(self):
144
148
ds_client = self .search_info .create_search_indexer_client ()
145
149
ds_container = SearchIndexerDataContainer (name = self .blob_manager .container )
146
150
data_source_connection = SearchIndexerDataSourceConnection (
147
- name = f" { self .search_info . index_name } -blob" ,
151
+ name = self .data_source_name ,
148
152
type = SearchIndexerDataSourceType .AZURE_BLOB ,
149
153
connection_string = self .blob_manager .get_managedidentity_connectionstring (),
150
154
container = ds_container ,
@@ -174,23 +178,19 @@ async def run(self):
174
178
await self .blob_manager .remove_blob ()
175
179
176
180
# Create an indexer
177
- indexer_name = f"{ self .search_info .index_name } -indexer"
178
-
179
181
indexer = SearchIndexer (
180
- name = indexer_name ,
182
+ name = self . indexer_name ,
181
183
description = "Indexer to index documents and generate embeddings" ,
182
- skillset_name = f" { self .search_info . index_name } -skillset" ,
184
+ skillset_name = self .skillset_name ,
183
185
target_index_name = self .search_info .index_name ,
184
- data_source_name = f"{ self .search_info .index_name } -blob" ,
185
- # Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results
186
- field_mappings = [FieldMapping (source_field_name = "metadata_storage_name" , target_field_name = "title" )],
186
+ data_source_name = self .data_source_name ,
187
187
)
188
188
189
189
indexer_client = self .search_info .create_search_indexer_client ()
190
190
indexer_result = await indexer_client .create_or_update_indexer (indexer )
191
191
192
192
# Run the indexer
193
- await indexer_client .run_indexer (indexer_name )
193
+ await indexer_client .run_indexer (self . indexer_name )
194
194
await indexer_client .close ()
195
195
196
196
logger .info (
0 commit comments