1
- from typing import Optional
1
+ from typing import Any , Optional
2
2
3
3
import glob
4
4
import csv
5
5
import json
6
+ import os
6
7
7
8
from azure .core .credentials_async import AsyncTokenCredential
8
9
from azure .search .documents .aio import SearchClient
9
10
from azure .search .documents .indexes .aio import SearchIndexClient
10
- from azure .search . documents . models import VectorizedQuery
11
+ from azure .core . exceptions import HttpResponseError
11
12
from azure .search .documents .indexes .models import (
13
+ AzureOpenAIVectorizer ,
14
+ AzureOpenAIVectorizerParameters ,
15
+ HnswAlgorithmConfiguration ,
12
16
SearchField ,
13
- SearchFieldDataType ,
14
- SimpleField ,
17
+ SearchFieldDataType ,
15
18
SearchIndex ,
16
- VectorSearch ,
17
- VectorSearchProfile ,
18
- HnswAlgorithmConfiguration )
19
- from azure .core .exceptions import ResourceNotFoundError , HttpResponseError
20
- from azure .search .documents .indexes .models import (
19
+ SearchIndexerDataUserAssignedIdentity ,
21
20
SemanticSearch ,
22
21
SemanticConfiguration ,
23
22
SemanticPrioritizedFields ,
24
23
SemanticField ,
25
- AzureOpenAIVectorizer ,
26
- AzureOpenAIVectorizerParameters
24
+ SimpleField ,
25
+ #VectorizedQuery,
26
+ VectorSearch ,
27
+ VectorSearchProfile ,
27
28
)
28
29
29
30
@@ -40,7 +41,9 @@ class SearchIndexManager:
40
41
must be the same as the one used to build the file with embeddings.
41
42
:param deployment_name: The name of the embedding deployment.
42
43
:param embeddings_endpoint: The endpoint used for embedding.
43
- :param auth_identity: the managed identity used to access the embedding deployment.
44
+ :param auth_identity: the managed identity used to access the embedding deployment.
45
+ :param embedding_client: The embedding client, used to build the embedding. Needed only
46
+ to create the embedding file. Not used at inference time.
44
47
"""
45
48
46
49
MIN_DIFF_CHARACTERS_IN_LINE = 5
@@ -55,7 +58,8 @@ def __init__(
55
58
model : str ,
56
59
deployment_name : str ,
57
60
embedding_endpoint : str ,
58
- auth_identity : str
61
+ auth_identity : str ,
62
+ embedding_client : Optional [Any ] = None
59
63
) -> None :
60
64
"""Constructor."""
61
65
self ._dimensions = dimensions
@@ -68,6 +72,7 @@ def __init__(
68
72
self ._embedding_deployment = deployment_name
69
73
self ._auth_identity = auth_identity
70
74
self ._client = None
75
+ self ._embedding_client = embedding_client
71
76
72
77
def _get_client (self ):
73
78
"""Get search client if it is absent."""
@@ -184,7 +189,9 @@ async def _index_create(self) -> SearchIndex:
184
189
parameters = AzureOpenAIVectorizerParameters (
185
190
resource_url = self ._embeddings_endpoint ,
186
191
deployment_name = self ._embedding_deployment ,
187
- auth_identity = self ._auth_identity ,
192
+ auth_identity = SearchIndexerDataUserAssignedIdentity (
193
+ resource_id = self ._auth_identity
194
+ ),
188
195
model_name = self ._embedding_model
189
196
)
190
197
)
@@ -194,9 +201,9 @@ async def _index_create(self) -> SearchIndex:
194
201
default_configuration_name = "index_search" ,
195
202
configurations = [
196
203
SemanticConfiguration (
197
- name = "search_contents " ,
204
+ name = "index_search " ,
198
205
prioritized_fields = SemanticPrioritizedFields (
199
- title_field = "embedId" ,
206
+ title_field = SemanticField ( field_name = "embedId" ) ,
200
207
content_fields = [SemanticField (field_name = "token" )]
201
208
)
202
209
)
@@ -215,7 +222,7 @@ async def build_embeddings_file(
215
222
self ,
216
223
input_directory : str ,
217
224
output_file : str ,
218
- sentences_per_embedding : int = 4
225
+ sentences_per_embedding : int = 4 ,
219
226
) -> None :
220
227
"""
221
228
In this method we do lazy loading of nltk and download the needed data set to split
@@ -230,14 +237,14 @@ async def build_embeddings_file(
230
237
:param embeddings_client: The embedding client, used to create embeddings.
231
238
Must be the same as the one used for SearchIndexManager creation.
232
239
:param sentences_per_embedding: The number of sentences used to build embedding.
233
- :param model: The embedding model to be used.
234
240
"""
235
241
import nltk
236
242
nltk .download ('punkt' )
237
243
238
244
from nltk .tokenize import sent_tokenize
239
245
# Split the data to sentence tokens.
240
246
sentence_tokens = []
247
+ references = []
241
248
globs = glob .glob (input_directory + '/*.md' , recursive = True )
242
249
index = 0
243
250
for fle in globs :
@@ -250,6 +257,7 @@ async def build_embeddings_file(
250
257
for sentence in sent_tokenize (line ):
251
258
if index % sentences_per_embedding == 0 :
252
259
sentence_tokens .append (sentence )
260
+ references .append (os .path .split (fle )[- 1 ])
253
261
else :
254
262
sentence_tokens [- 1 ] += ' '
255
263
sentence_tokens [- 1 ] += sentence
@@ -259,16 +267,19 @@ async def build_embeddings_file(
259
267
# For each token build the embedding, which will be used in the search.
260
268
batch_size = 2000
261
269
with open (output_file , 'w' ) as fp :
262
- writer = csv .DictWriter (fp , fieldnames = ['token' , 'embedding' ])
270
+ writer = csv .DictWriter (fp , fieldnames = ['token' , 'embedding' , 'document_reference' ])
263
271
writer .writeheader ()
264
272
for i in range (0 , len (sentence_tokens ), batch_size ):
265
- emedding = (await self ._embeddings_client .embed (
273
+ emedding = (await self ._embedding_client .embed (
266
274
input = sentence_tokens [i :i + min (batch_size , len (sentence_tokens ))],
267
275
dimensions = self ._dimensions ,
268
276
model = self ._embedding_model
269
277
))["data" ]
270
- for token , float_data in zip (sentence_tokens , emedding ):
271
- writer .writerow ({'token' : token , 'embedding' : json .dumps (float_data ['embedding' ])})
278
+ for token , float_data , reference in zip (sentence_tokens , emedding , references ):
279
+ writer .writerow ({
280
+ 'token' : token ,
281
+ 'embedding' : json .dumps (float_data ['embedding' ]),
282
+ 'document_reference' : reference })
272
283
273
284
async def close (self ):
274
285
"""Close the closeable resources, associated with SearchIndexManager."""
0 commit comments