Skip to content

Commit e1a077d

Browse files
authored
Indexing: Improve vectors throughput (#556)
* Add batch support to prepdocs * updates * address pr build * Address comments * Revert * add model name through deployment * save * Refactor
1 parent 62c5962 commit e1a077d

File tree

5 files changed

+63
-7
lines changed

5 files changed

+63
-7
lines changed

infra/main.bicep

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,7 @@ output AZURE_OPENAI_RESOURCE_GROUP string = openAiResourceGroup.name
346346
output AZURE_OPENAI_CHATGPT_DEPLOYMENT string = chatGptDeploymentName
347347
output AZURE_OPENAI_CHATGPT_MODEL string = chatGptModelName
348348
output AZURE_OPENAI_EMB_DEPLOYMENT string = embeddingDeploymentName
349+
output AZURE_OPENAI_EMB_MODEL_NAME string = embeddingModelName
349350

350351
output AZURE_FORMRECOGNIZER_SERVICE string = formRecognizer.outputs.name
351352
output AZURE_FORMRECOGNIZER_RESOURCE_GROUP string = formRecognizerResourceGroup.name

scripts/prepdocs.ps1

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,4 @@ Start-Process -FilePath $venvPythonPath -ArgumentList "-m pip install -r ./scrip
3636

3737
Write-Host 'Running "prepdocs.py"'
3838
$cwd = (Get-Location)
39-
Start-Process -FilePath $venvPythonPath -ArgumentList "./scripts/prepdocs.py `"$cwd/data/*`" --storageaccount $env:AZURE_STORAGE_ACCOUNT --container $env:AZURE_STORAGE_CONTAINER --searchservice $env:AZURE_SEARCH_SERVICE --openaiservice $env:AZURE_OPENAI_SERVICE --openaideployment $env:AZURE_OPENAI_EMB_DEPLOYMENT --index $env:AZURE_SEARCH_INDEX --formrecognizerservice $env:AZURE_FORMRECOGNIZER_SERVICE --tenantid $env:AZURE_TENANT_ID -v" -Wait -NoNewWindow
39+
Start-Process -FilePath $venvPythonPath -ArgumentList "./scripts/prepdocs.py `"$cwd/data/*`" --storageaccount $env:AZURE_STORAGE_ACCOUNT --container $env:AZURE_STORAGE_CONTAINER --searchservice $env:AZURE_SEARCH_SERVICE --openaiservice $env:AZURE_OPENAI_SERVICE --openaideployment $env:AZURE_OPENAI_EMB_DEPLOYMENT --index $env:AZURE_SEARCH_INDEX --formrecognizerservice $env:AZURE_FORMRECOGNIZER_SERVICE --tenantid $env:AZURE_TENANT_ID --openaimodelname $env:AZURE_OPENAI_EMB_MODEL_NAME -v" -Wait -NoNewWindow

scripts/prepdocs.py

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import time
99

1010
import openai
11+
import tiktoken
1112
from azure.ai.formrecognizer import DocumentAnalysisClient
1213
from azure.core.credentials import AzureKeyCredential
1314
from azure.identity import AzureDeveloperCliCredential
@@ -40,6 +41,18 @@
4041
CACHE_KEY_CREATED_TIME = 'created_time'
4142
CACHE_KEY_TOKEN_TYPE = 'token_type'
4243

44+
# Embedding batch support: per-model token limit and maximum batch size for batched embedding requests
45+
SUPPORTED_BATCH_AOAI_MODEL = {
46+
'text-embedding-ada-002': {
47+
'token_limit' : 8100,
48+
'max_batch_size' : 16
49+
}
50+
}
51+
52+
def calculate_tokens_emb_aoai(input: str):
    """Return how many tokens `input` encodes to for the model named by `args.openaimodelname`."""
    # tiktoken selects the tokenizer from the configured model name.
    tokenizer = tiktoken.encoding_for_model(args.openaimodelname)
    token_ids = tokenizer.encode(input)
    return len(token_ids)
55+
4356
def blob_name_from_file_page(filename, page = 0):
4457
if os.path.splitext(filename)[1].lower() == ".pdf":
4558
return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".pdf"
@@ -229,11 +242,17 @@ def create_sections(filename, page_map, use_vectors):
229242
def before_retry_sleep(retry_state):
    """Print a rate-limit notice when verbose output is enabled.

    Used as the `before_sleep` callback of the `@retry` decorators in this file;
    `retry_state` is accepted to satisfy that callback signature but is not read.
    """
    if not args.verbose:
        return
    print("Rate limited on the OpenAI embeddings API, sleeping before retrying...")
231244

232-
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(15), before_sleep=before_retry_sleep)
245+
@retry(wait=wait_random_exponential(min=15, max=60), stop=stop_after_attempt(15), before_sleep=before_retry_sleep)
def compute_embedding(text):
    """Return the embedding of a single `text` from the deployment named by `args.openaideployment`.

    The token is refreshed before the request. The `@retry` decorator re-runs the
    whole call, waiting a randomized exponential interval between attempts and
    giving up after 15 attempts.
    """
    refresh_openai_token()
    response = openai.Embedding.create(engine=args.openaideployment, input=text)
    # A single input yields a single item in the response's "data" list.
    first_item = response["data"][0]
    return first_item["embedding"]
236249

250+
@retry(wait=wait_random_exponential(min=15, max=60), stop=stop_after_attempt(15), before_sleep=before_retry_sleep)
def compute_embedding_in_batch(texts):
    """Embed several texts with one request to the deployment named by `args.openaideployment`.

    Args:
        texts: list of strings to embed in a single request.

    Returns:
        A list with one embedding per entry of `texts`, in the same order as `texts`.
        An empty `texts` returns an empty list without calling the service.

    The `@retry` decorator re-runs the whole call, waiting a randomized exponential
    interval between attempts and giving up after 15 attempts.
    """
    if not texts:
        # Nothing to embed; avoid sending a request with an empty input list.
        return []
    refresh_openai_token()
    emb_response = openai.Embedding.create(engine=args.openaideployment, input=texts)
    # Each returned item carries the `index` of the input it belongs to. Sort on it so
    # the result lines up with `texts` even if the items come back in a different order.
    ordered_items = sorted(emb_response["data"], key=lambda item: item["index"])
    return [item["embedding"] for item in ordered_items]
255+
237256
def create_search_index():
238257
if args.verbose: print(f"Ensuring search index {args.index} exists")
239258
index_client = SearchIndexClient(endpoint=f"https://{args.searchservice}.search.windows.net/",
@@ -271,6 +290,35 @@ def create_search_index():
271290
else:
272291
if args.verbose: print(f"Search index {args.index} already exists")
273292

293+
def update_embeddings_in_batch(sections):
    """Attach an embedding to every section, computing the embeddings in batches.

    Sections are grouped into batches that respect the `token_limit` and
    `max_batch_size` configured in SUPPORTED_BATCH_AOAI_MODEL for
    `args.openaimodelname`; each batch is embedded with one call to
    `compute_embedding_in_batch`.

    Args:
        sections: iterable of dicts, each with at least an "id" and a "content" key.

    Yields:
        Every input section, in input order, with its "embedding" key set.
        Nothing is yielded until all batches have been embedded.
    """
    limits = SUPPORTED_BATCH_AOAI_MODEL[args.openaimodelname]
    token_limit = limits['token_limit']
    max_batch_size = limits['max_batch_size']

    batch_response = {}

    def embed_batch(batch, batch_tokens):
        # Embed one batch and record each embedding under its section id.
        emb_responses = compute_embedding_in_batch([item["content"] for item in batch])
        if args.verbose: print(f"Batch Completed. Batch size {len(batch)} Token count {batch_tokens}")
        for emb, item in zip(emb_responses, batch):
            batch_response[item["id"]] = emb

    batch_queue = []
    copy_s = []
    token_count = 0
    for s in sections:
        # Remember every section, including the one that opens a new batch, so that
        # none is dropped from the output.
        copy_s.append(s)
        section_tokens = calculate_tokens_emb_aoai(s["content"])
        batch_is_full = (
            token_count + section_tokens > token_limit
            or len(batch_queue) >= max_batch_size
        )
        # Flush only a non-empty batch: a single oversized section is sent on its own
        # instead of triggering a request with an empty input list.
        if batch_queue and batch_is_full:
            embed_batch(batch_queue, token_count)
            batch_queue = []
            token_count = 0
        batch_queue.append(s)
        token_count += section_tokens

    if batch_queue:
        embed_batch(batch_queue, token_count)

    for s in copy_s:
        s["embedding"] = batch_response[s["id"]]
        yield s
321+
274322
def index_sections(filename, sections):
275323
if args.verbose: print(f"Indexing sections from '{filename}' into search index '{args.index}'")
276324
search_client = SearchClient(endpoint=f"https://{args.searchservice}.search.windows.net/",
@@ -314,7 +362,7 @@ def refresh_openai_token():
314362
openai.api_key = token_cred.get_token("https://cognitiveservices.azure.com/.default").token
315363
open_ai_token_cache[CACHE_KEY_CREATED_TIME] = time.time()
316364

317-
def read_files(path_pattern: str, use_vectors: bool):
365+
def read_files(path_pattern: str, use_vectors: bool, vectors_batch_support: bool):
318366
"""
319367
Recursively read directory structure under `path_pattern`
320368
and execute indexing for the individual files
@@ -326,13 +374,16 @@ def read_files(path_pattern: str, use_vectors: bool):
326374
remove_from_index(filename)
327375
else:
328376
if os.path.isdir(filename):
329-
read_files(filename + "/*", use_vectors)
377+
read_files(filename + "/*", use_vectors, vectors_batch_support)
330378
continue
331379
try:
332380
if not args.skipblobs:
333381
upload_blobs(filename)
334382
page_map = get_document_text(filename)
335-
sections = create_sections(os.path.basename(filename), page_map, use_vectors)
383+
sections = create_sections(os.path.basename(filename), page_map, use_vectors and not vectors_batch_support)
384+
print (use_vectors and vectors_batch_support)
385+
if use_vectors and vectors_batch_support:
386+
sections = update_embeddings_in_batch(sections)
336387
index_sections(os.path.basename(filename), sections)
337388
except Exception as e:
338389
print(f"\tGot an error while reading {filename} -> {e} --> skipping file")
@@ -355,7 +406,9 @@ def read_files(path_pattern: str, use_vectors: bool):
355406
parser.add_argument("--searchkey", required=False, help="Optional. Use this Azure Cognitive Search account key instead of the current user identity to login (use az login to set current user for Azure)")
356407
parser.add_argument("--openaiservice", help="Name of the Azure OpenAI service used to compute embeddings")
357408
parser.add_argument("--openaideployment", help="Name of the Azure OpenAI model deployment for an embedding model ('text-embedding-ada-002' recommended)")
409+
parser.add_argument("--openaimodelname", help="Name of the Azure OpenAI embedding model ('text-embedding-ada-002' recommended)")
358410
parser.add_argument("--novectors", action="store_true", help="Don't compute embeddings for the sections (e.g. don't call the OpenAI embeddings API during indexing)")
411+
parser.add_argument("--disablebatchvectors", action="store_true", help="Don't compute embeddings in batch for the sections")
359412
parser.add_argument("--openaikey", required=False, help="Optional. Use this Azure OpenAI account key instead of the current user identity to login (use az login to set current user for Azure)")
360413
parser.add_argument("--remove", action="store_true", help="Remove references to this document from blob storage and the search index")
361414
parser.add_argument("--removeall", action="store_true", help="Remove all blobs from blob storage and documents from the search index")
@@ -370,6 +423,7 @@ def read_files(path_pattern: str, use_vectors: bool):
370423
default_creds = azd_credential if args.searchkey is None or args.storagekey is None else None
371424
search_creds = default_creds if args.searchkey is None else AzureKeyCredential(args.searchkey)
372425
use_vectors = not args.novectors
426+
compute_vectors_in_batch = not args.disablebatchvectors and args.openaimodelname in SUPPORTED_BATCH_AOAI_MODEL
373427

374428
if not args.skipblobs:
375429
storage_creds = default_creds if args.storagekey is None else args.storagekey
@@ -402,4 +456,4 @@ def read_files(path_pattern: str, use_vectors: bool):
402456
create_search_index()
403457

404458
print("Processing files...")
405-
read_files(args.files, use_vectors)
459+
read_files(args.files, use_vectors, compute_vectors_in_batch)

scripts/prepdocs.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,4 @@ echo 'Installing dependencies from "requirements.txt" into virtual environment'
1818
./scripts/.venv/bin/python -m pip install -r scripts/requirements.txt
1919

2020
echo 'Running "prepdocs.py"'
21-
./scripts/.venv/bin/python ./scripts/prepdocs.py './data/*' --storageaccount "$AZURE_STORAGE_ACCOUNT" --container "$AZURE_STORAGE_CONTAINER" --searchservice "$AZURE_SEARCH_SERVICE" --openaiservice "$AZURE_OPENAI_SERVICE" --openaideployment "$AZURE_OPENAI_EMB_DEPLOYMENT" --index "$AZURE_SEARCH_INDEX" --formrecognizerservice "$AZURE_FORMRECOGNIZER_SERVICE" --tenantid "$AZURE_TENANT_ID" -v
21+
./scripts/.venv/bin/python ./scripts/prepdocs.py './data/*' --storageaccount "$AZURE_STORAGE_ACCOUNT" --container "$AZURE_STORAGE_CONTAINER" --searchservice "$AZURE_SEARCH_SERVICE" --openaiservice "$AZURE_OPENAI_SERVICE" --openaideployment "$AZURE_OPENAI_EMB_DEPLOYMENT" --index "$AZURE_SEARCH_INDEX" --formrecognizerservice "$AZURE_FORMRECOGNIZER_SERVICE" --openaimodelname "$AZURE_OPENAI_EMB_MODEL_NAME" --tenantid "$AZURE_TENANT_ID" -v

scripts/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ azure-search-documents==11.4.0b6
44
azure-ai-formrecognizer==3.2.1
55
azure-storage-blob==12.14.1
66
openai[datalib]==0.27.8
7+
tiktoken==0.4.0
78
tenacity==8.2.2

0 commit comments

Comments
 (0)