Commit e1c4338: "edit 2"

Author: Harmanpreet Kaur
Parent: 152c927

1 file changed:

infra/scripts/index_scripts/02_process_data.py (67 additions, 24 deletions)
@@ -4,10 +4,10 @@
 from openai import AzureOpenAI
 import re
 import time
-from azure.search.documents import SearchClient
-from azure.storage.filedatalake import DataLakeServiceClient
 import pypdf
 from io import BytesIO
+from azure.search.documents import SearchClient
+from azure.storage.filedatalake import DataLakeServiceClient
 from azure.search.documents.indexes import SearchIndexClient
 
 key_vault_name = 'kv_to-be-replaced'
@@ -17,116 +17,159 @@
 
 
 def get_secrets_from_kv(kv_name, secret_name):
+    # Set the name of the Azure Key Vault
     key_vault_name = kv_name
     credential = DefaultAzureCredential(managed_identity_client_id=managed_identity_client_id)
+
+    # Create a secret client object using the credential and Key Vault name
     secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
-    return secret_client.get_secret(secret_name).value
+    return (secret_client.get_secret(secret_name).value)
 
 
 search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
 search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
 openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
 openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
 openai_api_version = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION")
-deployment = get_secrets_from_kv(key_vault_name, "AZURE-OPEN-AI-DEPLOYMENT-MODEL")
+deployment = get_secrets_from_kv(key_vault_name, "AZURE-OPEN-AI-DEPLOYMENT-MODEL") # "gpt-4o-mini"
 
 
+# Function: Get Embeddings
 def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key):
     model_id = "text-embedding-ada-002"
     client = AzureOpenAI(
         api_version=openai_api_version,
         azure_endpoint=openai_api_base,
         api_key=openai_api_key
     )
-    return client.embeddings.create(input=text, model=model_id).data[0].embedding
 
+    embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding
+
+    return embedding
 
+
+# Function: Clean Spaces with Regex -
 def clean_spaces_with_regex(text):
+    # Use a regular expression to replace multiple spaces with a single space
     cleaned_text = re.sub(r'\s+', ' ', text)
+    # Use a regular expression to replace consecutive dots with a single dot
     cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
     return cleaned_text
 
 
 def chunk_data(text):
-    tokens_per_chunk = 1024
+    tokens_per_chunk = 1024 # 500
     text = clean_spaces_with_regex(text)
-    sentences = text.split('. ')
+
+    sentences = text.split('. ') # Split text into sentences
     chunks = []
     current_chunk = ''
     current_chunk_token_count = 0
 
+    # Iterate through each sentence
     for sentence in sentences:
+        # Split sentence into tokens
         tokens = sentence.split()
+
+        # Check if adding the current sentence exceeds tokens_per_chunk
         if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
+            # Add the sentence to the current chunk
            if current_chunk:
                 current_chunk += '. ' + sentence
             else:
                 current_chunk += sentence
             current_chunk_token_count += len(tokens)
         else:
+            # Add current chunk to chunks list and start a new chunk
             chunks.append(current_chunk)
             current_chunk = sentence
             current_chunk_token_count = len(tokens)
 
+    # Add the last chunk
     if current_chunk:
         chunks.append(current_chunk)
 
     return chunks
 
 
 account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")
+
 account_url = f"https://{account_name}.dfs.core.windows.net"
+
 credential = DefaultAzureCredential()
 service_client = DataLakeServiceClient(account_url, credential=credential, api_version='2023-01-03')
+
 file_system_client = service_client.get_file_system_client(file_system_client_name)
-paths = file_system_client.get_paths(path=directory)
+
+directory_name = directory
+paths = file_system_client.get_paths(path=directory_name)
+print(paths)
+
 index_name = "pdf_index"
+
 search_credential = AzureKeyCredential(search_key)
+
 search_client = SearchClient(search_endpoint, index_name, search_credential)
 index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
 
 
 def prepare_search_doc(content, document_id):
     chunks = chunk_data(content)
-    docs = []
-    for chunk_num, chunk in enumerate(chunks, start=1):
-        chunk_id = f"{document_id}_{str(chunk_num).zfill(2)}"
+    chunk_num = 0
+    for chunk in chunks:
+        chunk_num += 1
+        chunk_id = document_id + '_' + str(chunk_num).zfill(2)
+
         try:
-            v_contentVector = get_embeddings(chunk, openai_api_base, openai_api_version, openai_api_key)
-        except Exception:
+            v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
+        except Exception as e:
+            print(f"Error occurred: {e}. Retrying after 30 seconds...")
             time.sleep(30)
             try:
-                v_contentVector = get_embeddings(chunk, openai_api_base, openai_api_version, openai_api_key)
-            except Exception:
+                v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
+            except Exception as e:
+                print(f"Retry failed: {e}. Setting v_contentVector to an empty list.")
                 v_contentVector = []
+
         result = {
             "id": chunk_id,
             "chunk_id": chunk_id,
             "content": chunk,
             "sourceurl": path.name.split('/')[-1],
             "contentVector": v_contentVector
         }
-        docs.append(result)
-    return docs
+    return result
 
 
+# conversationIds = []
 docs = []
 counter = 0
+
+
 for path in paths:
     file_client = file_system_client.get_file_client(path.name)
     pdf_file = file_client.download_file()
+
     stream = BytesIO()
     pdf_file.readinto(stream)
     pdf_reader = pypdf.PdfReader(stream)
     filename = path.name.split('/')[-1]
     document_id = filename.split('_')[1].replace('.pdf', '')
-    text = ''.join(page.extract_text() for page in pdf_reader.pages)
-    docs.extend(prepare_search_doc(text, document_id))
+
+    text = ''
+    num_pages = len(pdf_reader.pages)
+    for page_num in range(num_pages):
+
+        page = pdf_reader.pages[page_num]
+        text += page.extract_text()
+    result = prepare_search_doc(text, document_id)
+    docs.append(result)
+
     counter += 1
-    if docs and counter % 10 == 0:
-        search_client.upload_documents(documents=docs)
+    if docs != [] and counter % 10 == 0:
+        result = search_client.upload_documents(documents=docs)
         docs = []
-        print(f'{counter} uploaded')
+        print(f' {str(counter)} uploaded')
 
-if docs:
-    search_client.upload_documents(documents=docs)
+if docs != []:
+    results = search_client.upload_documents(documents=docs)
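
The text helpers touched by this commit can be exercised in isolation, with no Azure dependencies. A minimal smoke-test sketch, with the two function bodies copied from the new side of the diff; the sample string and the expected outputs in the comments are illustrative, not part of the commit:

import re

def clean_spaces_with_regex(text):
    # Collapse runs of whitespace into a single space
    cleaned_text = re.sub(r'\s+', ' ', text)
    # Collapse runs of dots into a single dot
    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
    return cleaned_text

def chunk_data(text):
    tokens_per_chunk = 1024
    text = clean_spaces_with_regex(text)
    sentences = text.split('. ')
    chunks = []
    current_chunk = ''
    current_chunk_token_count = 0
    for sentence in sentences:
        tokens = sentence.split()
        if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
            # Append the sentence to the current chunk
            current_chunk = current_chunk + '. ' + sentence if current_chunk else sentence
            current_chunk_token_count += len(tokens)
        else:
            # Close the current chunk and start a new one
            chunks.append(current_chunk)
            current_chunk = sentence
            current_chunk_token_count = len(tokens)
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

# Hypothetical sample input:
sample = "First sentence.  Second   sentence... Third sentence."
print(clean_spaces_with_regex(sample))  # First sentence. Second sentence. Third sentence.
print(chunk_data(sample))  # ['First sentence. Second sentence. Third sentence.']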

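The error handling added around get_embeddings follows a call, sleep 30 seconds, retry once, then fall back shape. The same pattern can be factored into a small helper; a hypothetical sketch (the helper name, parameters, and the commented usage are illustrative, not part of the commit):

import time

def call_with_one_retry(fn, *args, wait_seconds=30, fallback=None):
    # First attempt; on failure, wait and try exactly once more.
    try:
        return fn(*args)
    except Exception as e:
        print(f"Error occurred: {e}. Retrying after {wait_seconds} seconds...")
        time.sleep(wait_seconds)
        try:
            return fn(*args)
        except Exception as e:
            print(f"Retry failed: {e}. Using fallback value.")
            return fallback

# Hypothetical usage mirroring the commit's embedding calls:
# v_contentVector = call_with_one_retry(
#     get_embeddings, str(chunk), openai_api_base,
#     openai_api_version, openai_api_key, fallback=[])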