30
30
)
31
31
from azure .storage .blob import BlobServiceClient
32
32
from pypdf import PdfReader , PdfWriter
33
- from tenacity import retry , stop_after_attempt , wait_random_exponential
33
+ from tenacity import (
34
+ retry ,
35
+ retry_if_exception_type ,
36
+ stop_after_attempt ,
37
+ wait_random_exponential ,
38
+ )
39
+
40
+ args = argparse .Namespace (verbose = False )
34
41
35
42
MAX_SECTION_LENGTH = 1000
36
43
SENTENCE_SEARCH_LIMIT = 100
@@ -225,7 +232,7 @@ def filename_to_id(filename):
225
232
filename_hash = base64 .b16encode (filename .encode ('utf-8' )).decode ('ascii' )
226
233
return f"file-{ filename_ascii } -{ filename_hash } "
227
234
228
- def create_sections (filename , page_map , use_vectors ):
235
+ def create_sections (filename , page_map , use_vectors , embedding_deployment : str = None ):
229
236
file_id = filename_to_id (filename )
230
237
for i , (content , pagenum ) in enumerate (split_text (page_map , filename )):
231
238
section = {
@@ -236,16 +243,16 @@ def create_sections(filename, page_map, use_vectors):
236
243
"sourcefile" : filename
237
244
}
238
245
if use_vectors :
239
- section ["embedding" ] = compute_embedding (content )
246
+ section ["embedding" ] = compute_embedding (content , embedding_deployment )
240
247
yield section
241
248
242
249
def before_retry_sleep(retry_state):
    """Tenacity `before_sleep` hook: note the rate-limit backoff when verbose."""
    if not args.verbose:
        return
    print("Rate limited on the OpenAI embeddings API, sleeping before retrying...")
@retry(
    retry=retry_if_exception_type(openai.error.RateLimitError),
    wait=wait_random_exponential(min=15, max=60),
    stop=stop_after_attempt(15),
    before_sleep=before_retry_sleep,
)
def compute_embedding(text, embedding_deployment):
    """Return the embedding vector for `text` from the given Azure OpenAI
    deployment, retrying (with jittered exponential backoff, up to 15
    attempts) only when the API responds with a rate-limit error."""
    refresh_openai_token()
    response = openai.Embedding.create(engine=embedding_deployment, input=text)
    return response["data"][0]["embedding"]
249
256
250
257
@retry (wait = wait_random_exponential (min = 15 , max = 60 ), stop = stop_after_attempt (15 ), before_sleep = before_retry_sleep )
251
258
def compute_embedding_in_batch (texts ):
@@ -314,7 +321,7 @@ def update_embeddings_in_batch(sections):
314
321
if args .verbose : print (f"Batch Completed. Batch size { len (batch_queue )} Token count { token_count } " )
315
322
for emb , item in zip (emb_responses , batch_queue ):
316
323
batch_response [item ["id" ]] = emb
317
-
324
+
318
325
for s in copy_s :
319
326
s ["embedding" ] = batch_response [s ["id" ]]
320
327
yield s
@@ -355,14 +362,18 @@ def remove_from_index(filename):
355
362
# It can take a few seconds for search results to reflect changes, so wait a bit
356
363
time .sleep (2 )
357
364
358
def refresh_openai_token():
    """
    Refresh the OpenAI API token if it was obtained via Azure AD.

    Azure AD access tokens expire, so this re-fetches one whenever the cached
    token is more than 5 minutes old, updating both ``openai.api_key`` and the
    cached creation timestamp. A no-op for static API keys (non-'azure_ad'
    cache entries) or when the cache has not been populated yet.
    """
    # Single .get() lookup replaces the membership test + second index; an
    # absent key compares unequal to 'azure_ad', giving identical behavior.
    if (
        open_ai_token_cache.get(CACHE_KEY_TOKEN_TYPE) == 'azure_ad'
        and open_ai_token_cache[CACHE_KEY_CREATED_TIME] + 300 < time.time()
    ):
        token_cred = open_ai_token_cache[CACHE_KEY_TOKEN_CRED]
        openai.api_key = token_cred.get_token("https://cognitiveservices.azure.com/.default").token
        open_ai_token_cache[CACHE_KEY_CREATED_TIME] = time.time()
365
- def read_files (path_pattern : str , use_vectors : bool , vectors_batch_support : bool ):
375
+
376
+ def read_files (path_pattern : str , use_vectors : bool , vectors_batch_support : bool , embedding_deployment : str = None ):
366
377
"""
367
378
Recursively read directory structure under `path_pattern`
368
379
and execute indexing for the individual files
@@ -380,8 +391,7 @@ def read_files(path_pattern: str, use_vectors: bool, vectors_batch_support: bool
380
391
if not args .skipblobs :
381
392
upload_blobs (filename )
382
393
page_map = get_document_text (filename )
383
- sections = create_sections (os .path .basename (filename ), page_map , use_vectors and not vectors_batch_support )
384
- print (use_vectors and vectors_batch_support )
394
+ sections = create_sections (os .path .basename (filename ), page_map , use_vectors and not vectors_batch_support , embedding_deployment )
385
395
if use_vectors and vectors_batch_support :
386
396
sections = update_embeddings_in_batch (sections )
387
397
index_sections (os .path .basename (filename ), sections )
@@ -456,4 +466,4 @@ def read_files(path_pattern: str, use_vectors: bool, vectors_batch_support: bool
456
466
create_search_index ()
457
467
458
468
print ("Processing files..." )
459
- read_files (args .files , use_vectors , compute_vectors_in_batch )
469
+ read_files (args .files , use_vectors , compute_vectors_in_batch , args . openaideployment )
0 commit comments