    FileSystemClient,
)
from openai import AzureOpenAI
+from azure.storage.blob import BlobServiceClient

# Get Azure Key Vault Client
key_vault_name = "kv_to-be-replaced"  #'nc6262-kv-2fpeafsylfd2e'
@@ -199,16 +200,13 @@ def chunk_data(text):
# paths = os.listdir(path_name)


-account_url = f"https://{account_name}.dfs.core.windows.net"
+account_url = f"https://{account_name}.blob.core.windows.net"
+blob_service_client = BlobServiceClient(account_url, credential=credential)
+container_client = blob_service_client.get_container_client(file_system_client_name)

-service_client = DataLakeServiceClient(
-    account_url, credential=credential, api_version="2023-01-03"
-)
+print(f"Listing blobs under '{directory}' using BlobServiceClient...")
+paths = [blob.name for blob in container_client.list_blobs(name_starts_with=directory)]

-file_system_client = service_client.get_file_system_client(file_system_client_name)
-directory_name = directory
-paths = file_system_client.get_paths(path=directory_name)
-print(paths)

search_client = SearchClient(search_endpoint, index_name, credential)
# index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)
@@ -221,22 +219,22 @@ def chunk_data(text):
# Read the CSV file into a Pandas DataFrame
file_path = csv_file_name
print(file_path)
-file_client = file_system_client.get_file_client(file_path)
-csv_file = file_client.download_file()
-df_metadata = pd.read_csv(csv_file, encoding="utf-8")
+blob_client = container_client.get_blob_client(file_path)
+download_stream = blob_client.download_blob()
+df_metadata = pd.read_csv(download_stream, encoding="utf-8")

docs = []
counter = 0
-for path in paths:
-    # file_path = f'Data/{foldername}/meeting_transcripts/' + path
-    # with open(file_path, "r") as file:
-    #     data = json.load(file)
-    file_client = file_system_client.get_file_client(path.name)
-    data_file = file_client.download_file()
-    data = json.load(data_file)
-    text = data["Content"]
-
-    filename = path.name.split("/")[-1]
+for blob_name in paths:
+    if not blob_name.endswith(".json"):
+        continue
+
+    blob_client = container_client.get_blob_client(blob_name)
+    download_stream = blob_client.download_blob()
+    data = json.loads(download_stream.readall())
+    text = data.get("Content", "")
+
+    filename = blob_name.split("/")[-1]
    document_id = filename.replace(".json", "").replace("convo_", "")
    # print(document_id)
    df_file_metadata = df_metadata[
@@ -276,15 +274,15 @@ def chunk_data(text):
276274 "chunk_id" : d ["chunk_id" ],
277275 "client_id" : d ["client_id" ],
278276 "content" : d ["content" ],
279- "sourceurl" : path . name .split ("/" )[- 1 ],
277+ "sourceurl" : blob_name .split ("/" )[- 1 ],
280278 "contentVector" : v_contentVector ,
281279 }
282280 )
283281
284282 if counter % 10 == 0 :
285283 result = search_client .upload_documents (documents = docs )
286284 docs = []
287- print (f" { str ( counter ) } uploaded" )
285+ print (f"{ counter } documents uploaded... " )
288286
289287# upload the last batch
290288if docs != []:
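For context, a minimal standalone sketch of the blob listing and JSON download pattern this change switches to (azure-storage-blob SDK). The account name, container name, prefix, and DefaultAzureCredential below are illustrative placeholders, not values taken from this repository:

```python
# Illustrative sketch only: placeholder names, assuming azure-identity and
# azure-storage-blob are installed and the identity can read the container.
import json

from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient

account_name = "<storage-account>"   # placeholder
container_name = "<container>"       # placeholder
directory = "transcripts/"           # placeholder prefix

credential = DefaultAzureCredential()
account_url = f"https://{account_name}.blob.core.windows.net"
blob_service_client = BlobServiceClient(account_url, credential=credential)
container_client = blob_service_client.get_container_client(container_name)

# list_blobs(name_starts_with=...) is a prefix filter, so it returns every blob
# under the virtual directory, including nested "subfolders".
for blob in container_client.list_blobs(name_starts_with=directory):
    if not blob.name.endswith(".json"):
        continue
    # download_blob() returns a StorageStreamDownloader; readall() buffers the
    # whole blob in memory, which is fine for small transcript files.
    payload = container_client.get_blob_client(blob.name).download_blob().readall()
    data = json.loads(payload)
    print(blob.name, len(data.get("Content", "")))
```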