from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from openai import AzureOpenAI
import re
import time
import pypdf
from io import BytesIO
from azure.search.documents import SearchClient
from azure.storage.filedatalake import DataLakeServiceClient
from azure.search.documents.indexes import SearchIndexClient

key_vault_name = 'kv_to-be-replaced'
managed_identity_client_id = 'mici_to-be-replaced'
file_system_client_name = "data"
directory = 'pdf'


def get_secrets_from_kv(kv_name, secret_name):
    # Set the name of the Azure Key Vault
    key_vault_name = kv_name
    credential = DefaultAzureCredential(managed_identity_client_id=managed_identity_client_id)

    # Create a secret client object using the credential and Key Vault name
    secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
    return secret_client.get_secret(secret_name).value


search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
openai_api_version = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION")
deployment = get_secrets_from_kv(key_vault_name, "AZURE-OPEN-AI-DEPLOYMENT-MODEL")  # "gpt-4o-mini"


# Function: Get Embeddings
def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key):
    model_id = "text-embedding-ada-002"
    client = AzureOpenAI(
        api_version=openai_api_version,
        azure_endpoint=openai_api_base,
        api_key=openai_api_key
    )

    embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding

    return embedding


# Function: Clean Spaces with Regex
def clean_spaces_with_regex(text):
    # Use a regular expression to replace multiple spaces with a single space
    cleaned_text = re.sub(r'\s+', ' ', text)
    # Use a regular expression to replace consecutive dots with a single dot
    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
    return cleaned_text

def chunk_data(text):
    tokens_per_chunk = 1024  # 500
    text = clean_spaces_with_regex(text)

    sentences = text.split('. ')  # Split text into sentences
    chunks = []
    current_chunk = ''
    current_chunk_token_count = 0

    # Iterate through each sentence
    for sentence in sentences:
        # Split sentence into tokens
        tokens = sentence.split()

        # Check if adding the current sentence exceeds tokens_per_chunk
        if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
            # Add the sentence to the current chunk
            if current_chunk:
                current_chunk += '. ' + sentence
            else:
                current_chunk = sentence
            current_chunk_token_count += len(tokens)
        else:
            # Start a new chunk
            chunks.append(current_chunk)
            current_chunk = sentence
            current_chunk_token_count = len(tokens)

    # Add the last chunk
    if current_chunk:
        chunks.append(current_chunk)

    return chunks
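
# Illustrative behavior of chunk_data (assuming the greedy sentence-packing
# reconstruction above): sentences are packed until the ~1024-token budget is
# exceeded, then a new chunk starts at a sentence boundary, e.g.
#   chunk_data("First point.  Second point... Third point.")
#   -> ['First point. Second point. Third point.']  # cleaned, one chunk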

account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")

account_url = f"https://{account_name}.dfs.core.windows.net"

credential = DefaultAzureCredential()
service_client = DataLakeServiceClient(account_url, credential=credential, api_version='2023-01-03')

file_system_client = service_client.get_file_system_client(file_system_client_name)

directory_name = directory
paths = file_system_client.get_paths(path=directory_name)
print(paths)

index_name = "pdf_index"

search_credential = AzureKeyCredential(search_key)

search_client = SearchClient(search_endpoint, index_name, search_credential)
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
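
# The uploads later in this script assume the "pdf_index" index already
# exists. A minimal creation sketch under that assumption (field names mirror
# the documents built in prepare_search_doc below; "hnsw-config" and
# "hnsw-profile" are illustrative names; 1536 is the output dimension of
# text-embedding-ada-002):
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SearchIndex
)

fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SimpleField(name="chunk_id", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SimpleField(name="sourceurl", type=SearchFieldDataType.String),
    SearchField(
        name="contentVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="hnsw-profile"
    ),
]
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="hnsw-config")],
    profiles=[VectorSearchProfile(name="hnsw-profile", algorithm_configuration_name="hnsw-config")]
)
index_client.create_or_update_index(SearchIndex(name=index_name, fields=fields, vector_search=vector_search))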


def prepare_search_doc(content, document_id):
    chunks = chunk_data(content)
    chunk_num = 0
    results = []
    for chunk in chunks:
        chunk_num += 1
        chunk_id = document_id + '_' + str(chunk_num).zfill(2)

        try:
            v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
        except Exception as e:
            print(f"Error occurred: {e}. Retrying after 30 seconds...")
            time.sleep(30)
            try:
                v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
            except Exception as e:
                print(f"Retry failed: {e}. Setting v_contentVector to an empty list.")
                v_contentVector = []

        results.append({
            "id": chunk_id,
            "chunk_id": chunk_id,
            "content": chunk,
            # 'path' is the loop variable of the ingestion loop below
            "sourceurl": path.name.split('/')[-1],
            "contentVector": v_contentVector
        })

    # Return one search document per chunk so no chunk is dropped
    return results
# conversationIds = []
docs = []
counter = 0

for path in paths:
    file_client = file_system_client.get_file_client(path.name)
    pdf_file = file_client.download_file()

    stream = BytesIO()
    pdf_file.readinto(stream)
    pdf_reader = pypdf.PdfReader(stream)
    filename = path.name.split('/')[-1]
    # Filenames are assumed to look like '<prefix>_<id>.pdf'
    document_id = filename.split('_')[1].replace('.pdf', '')

    text = ''
    num_pages = len(pdf_reader.pages)
    for page_num in range(num_pages):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()

    # One search document per chunk
    docs.extend(prepare_search_doc(text, document_id))

    counter += 1
    # Upload in batches, roughly every 10 source files
    if docs != [] and counter % 10 == 0:
        result = search_client.upload_documents(documents=docs)
        docs = []
        print(f' {str(counter)} uploaded')

if docs != []:
    results = search_client.upload_documents(documents=docs)
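
# Optional sanity check of what was just indexed (an illustrative sketch):
# embed a sample question and run a vector query against the contentVector
# field. Assumes azure-search-documents >= 11.4; the question is hypothetical.
from azure.search.documents.models import VectorizedQuery

sample_question = "What does the contract say about termination?"  # hypothetical
sample_vector = get_embeddings(sample_question, openai_api_base, openai_api_version, openai_api_key)
hits = search_client.search(
    search_text=None,
    vector_queries=[VectorizedQuery(vector=sample_vector, k_nearest_neighbors=3, fields="contentVector")],
    select=["chunk_id", "content", "sourceurl"]
)
for hit in hits:
    print(hit["chunk_id"], hit["sourceurl"])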