     DataLakeServiceClient,
     FileSystemClient,
 )
-from openai import AzureOpenAI
-from azure.storage.blob import BlobServiceClient
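+# AIProjectClient gives project-scoped access to Azure OpenAI via the AI Foundry endpoint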
+from azure.ai.projects import AIProjectClient
 
 # Get Azure Key Vault Client
 key_vault_name = "kv_to-be-replaced"  # 'nc6262-kv-2fpeafsylfd2e'
@@ -62,6 +61,7 @@
 openai_api_version = secret_client.get_secret("AZURE-OPENAI-PREVIEW-API-VERSION").value
 openai_embedding_model = secret_client.get_secret("AZURE-OPENAI-EMBEDDING-MODEL").value
 account_name = secret_client.get_secret("ADLS-ACCOUNT-NAME").value
+ai_project_endpoint = secret_client.get_secret("AZURE-AI-AGENT-ENDPOINT").value
 
 # Create a search index
 index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)
@@ -133,15 +133,22 @@
 
 
 # Function: Get Embeddings
-def get_embeddings(text: str, openai_api_base, openai_api_version, azure_token_provider):
+def get_embeddings(text: str, ai_project_endpoint, openai_api_version, credential):
     model_id = openai_embedding_model or "text-embedding-ada-002"
-    client = AzureOpenAI(
+
+    # Create AI Projects client
+    project_client = AIProjectClient(
+        endpoint=ai_project_endpoint,
+        credential=credential,
         api_version=openai_api_version,
-        azure_endpoint=openai_api_base,
-        azure_ad_token_provider=azure_token_provider,
+    )
+
+    # Get the OpenAI client from the AI Projects client
+    openai_client = project_client.get_openai_client(
+        api_version=openai_api_version
     )
 
-    embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding
+    embedding = openai_client.embeddings.create(input=text, model=model_id).data[0].embedding
 
     return embedding
 
@@ -200,13 +207,16 @@ def chunk_data(text):
 # paths = os.listdir(path_name)
 
 
-account_url = f"https://{account_name}.blob.core.windows.net"
-blob_service_client = BlobServiceClient(account_url, credential=credential)
-container_client = blob_service_client.get_container_client(file_system_client_name)
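+# ADLS Gen2 is addressed through the dfs endpoint (DataLakeServiceClient), not the blob endpoint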
+account_url = f"https://{account_name}.dfs.core.windows.net"
 
-print(f"Listing blobs under '{directory}' using BlobServiceClient...")
-paths = [blob.name for blob in container_client.list_blobs(name_starts_with=directory)]
+service_client = DataLakeServiceClient(
+    account_url, credential=credential, api_version="2023-01-03"
+)
 
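+# get_paths() lists files under the directory (recursive by default), yielding PathProperties objects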
+file_system_client = service_client.get_file_system_client(file_system_client_name)
+directory_name = directory
+paths = file_system_client.get_paths(path=directory_name)
+print(paths)
 
 search_client = SearchClient(search_endpoint, index_name, credential)
 # index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)
@@ -219,22 +229,22 @@ def chunk_data(text):
 # Read the CSV file into a Pandas DataFrame
 file_path = csv_file_name
 print(file_path)
-blob_client = container_client.get_blob_client(file_path)
-download_stream = blob_client.download_blob()
-df_metadata = pd.read_csv(download_stream, encoding="utf-8")
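+# Stream the metadata CSV down from ADLS and load it into a DataFrame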
+file_client = file_system_client.get_file_client(file_path)
+csv_file = file_client.download_file()
+df_metadata = pd.read_csv(csv_file, encoding="utf-8")
 
 docs = []
 counter = 0
-for blob_name in paths:
-    if not blob_name.endswith(".json"):
-        continue
-
-    blob_client = container_client.get_blob_client(blob_name)
-    download_stream = blob_client.download_blob()
-    data = json.loads(download_stream.readall())
-    text = data.get("Content", "")
-
-    filename = blob_name.split("/")[-1]
+for path in paths:
+    # file_path = f'Data/{foldername}/meeting_transcripts/' + path
+    # with open(file_path, "r") as file:
+    #     data = json.load(file)
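+    # Each path is a PathProperties object; path.name holds its full path within the file system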
+    file_client = file_system_client.get_file_client(path.name)
+    data_file = file_client.download_file()
+    data = json.load(data_file)
+    text = data["Content"]
+
+    filename = path.name.split("/")[-1]
     document_id = filename.replace(".json", "").replace("convo_", "")
     # print(document_id)
     df_file_metadata = df_metadata[
@@ -258,12 +268,12 @@ def chunk_data(text):
 
     try:
         v_contentVector = get_embeddings(
-            d["content"], openai_api_base, openai_api_version, token_provider
+            d["content"], ai_project_endpoint, openai_api_version, credential
         )
     except:
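+        # Assume a transient failure (e.g. throttling): back off 30 seconds and retry once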
         time.sleep(30)
         v_contentVector = get_embeddings(
-            d["content"], openai_api_base, openai_api_version, token_provider
+            d["content"], ai_project_endpoint, openai_api_version, credential
         )
 
     docs.append(
@@ -274,15 +284,15 @@ def chunk_data(text):
             "chunk_id": d["chunk_id"],
             "client_id": d["client_id"],
             "content": d["content"],
-            "sourceurl": blob_name.split("/")[-1],
+            "sourceurl": path.name.split("/")[-1],
             "contentVector": v_contentVector,
         }
     )
 
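+    # Flush accumulated documents to the search index in batches of 10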
     if counter % 10 == 0:
         result = search_client.upload_documents(documents=docs)
         docs = []
-        print(f"{counter} documents uploaded... ")
+        print(f" {str(counter)} uploaded")
 
 # upload the last batch
 if docs != []: