# Ingest PDF files from Azure Data Lake Storage: extract text, chunk it, generate embeddings,
# and upload the resulting documents to an Azure AI Search index.
import re
import time
from io import BytesIO

import pypdf
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.storage.filedatalake import DataLakeServiceClient
from openai import AzureOpenAI

key_vault_name = 'kv_to-be-replaced'
managed_identity_client_id = 'mici_to-be-replaced'
file_system_client_name = "data"
directory = 'pdf'


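# Helper to read a secret from Azure Key Vault using the managed identity.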
def get_secrets_from_kv(kv_name, secret_name):
    credential = DefaultAzureCredential(managed_identity_client_id=managed_identity_client_id)
    secret_client = SecretClient(vault_url=f"https://{kv_name}.vault.azure.net/", credential=credential)
    return secret_client.get_secret(secret_name).value


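# Resolve Azure AI Search and Azure OpenAI settings from Key Vault.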
search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
openai_api_version = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION")
deployment = get_secrets_from_kv(key_vault_name, "AZURE-OPEN-AI-DEPLOYMENT-MODEL")


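# Generate an embedding for a chunk of text with the text-embedding-ada-002 deployment on Azure OpenAI.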
def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key):
    model_id = "text-embedding-ada-002"
    client = AzureOpenAI(
        api_version=openai_api_version,
        azure_endpoint=openai_api_base,
        api_key=openai_api_key
    )
    return client.embeddings.create(input=text, model=model_id).data[0].embedding


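# Collapse repeated whitespace and runs of dots left over from PDF text extraction.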
def clean_spaces_with_regex(text):
    cleaned_text = re.sub(r'\s+', ' ', text)
    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
    return cleaned_text


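# Split text into chunks of at most 1024 whitespace-delimited tokens, breaking on sentence boundaries.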
def chunk_data(text):
    tokens_per_chunk = 1024
    text = clean_spaces_with_regex(text)
    sentences = text.split('. ')
    chunks = []
    current_chunk = ''
    current_chunk_token_count = 0

    # Iterate through each sentence, accumulating tokens until the chunk limit is reached.
    for sentence in sentences:
        tokens = sentence.split()
        if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
            # The sentence fits: add it to the current chunk.
            if current_chunk:
                current_chunk += '. ' + sentence
            else:
                current_chunk += sentence
            current_chunk_token_count += len(tokens)
        else:
            # The chunk is full: store it and start a new one with this sentence.
            chunks.append(current_chunk)
            current_chunk = sentence
            current_chunk_token_count = len(tokens)

    # Add the last chunk.
    if current_chunk:
        chunks.append(current_chunk)

    return chunks


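# Connect to the ADLS Gen2 account and list the PDF files to process.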
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")
account_url = f"https://{account_name}.dfs.core.windows.net"
credential = DefaultAzureCredential()
service_client = DataLakeServiceClient(account_url, credential=credential, api_version='2023-01-03')
file_system_client = service_client.get_file_system_client(file_system_client_name)
paths = file_system_client.get_paths(path=directory)

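# Clients for uploading documents to, and managing, the Azure AI Search index.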
index_name = "pdf_index"
search_credential = AzureKeyCredential(search_key)
search_client = SearchClient(search_endpoint, index_name, search_credential)
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)


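# Chunk a document, embed each chunk (with one retry on failure), and build the documents to index.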
def prepare_search_doc(content, document_id):
    chunks = chunk_data(content)
    docs = []
    for chunk_num, chunk in enumerate(chunks, start=1):
        chunk_id = f"{document_id}_{str(chunk_num).zfill(2)}"
        try:
            v_contentVector = get_embeddings(chunk, openai_api_base, openai_api_version, openai_api_key)
        except Exception:
            # Back off briefly and retry once; fall back to an empty vector if the retry also fails.
            time.sleep(30)
            try:
                v_contentVector = get_embeddings(chunk, openai_api_base, openai_api_version, openai_api_key)
            except Exception:
                v_contentVector = []
        result = {
            "id": chunk_id,
            "chunk_id": chunk_id,
            "content": chunk,
            "sourceurl": path.name.split('/')[-1],  # name of the PDF currently being processed (from the outer loop)
            "contentVector": v_contentVector
        }
        docs.append(result)
    return docs


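# Main loop: read each PDF from ADLS, extract its text, and upload the chunked documents in batches.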
docs = []
counter = 0
for path in paths:
    file_client = file_system_client.get_file_client(path.name)
    pdf_file = file_client.download_file()
    stream = BytesIO()
    pdf_file.readinto(stream)
    pdf_reader = pypdf.PdfReader(stream)
    filename = path.name.split('/')[-1]
    # Derive the document id from the file name (second '_'-separated segment, without the .pdf extension).
    document_id = filename.split('_')[1].replace('.pdf', '')
    text = ''.join(page.extract_text() for page in pdf_reader.pages)
    docs.extend(prepare_search_doc(text, document_id))
    counter += 1
    # Upload accumulated documents after every 10 files processed.
    if docs and counter % 10 == 0:
        search_client.upload_documents(documents=docs)
        docs = []
        print(f'{counter} uploaded')

# Upload any remaining documents from the last partial batch.
if docs:
    search_client.upload_documents(documents=docs)