Skip to content

Commit 20b125b

Browse files
fetch upload/download data from blob storage instead of data lake
1 parent c1325e8 commit 20b125b

File tree

2 files changed

+49
-51
lines changed

2 files changed

+49
-51
lines changed

infra/scripts/index_scripts/create_search_index.py

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
FileSystemClient,
3333
)
3434
from openai import AzureOpenAI
35+
from azure.storage.blob import BlobServiceClient
3536

3637
# Get Azure Key Vault Client
3738
key_vault_name = "kv_to-be-replaced" #'nc6262-kv-2fpeafsylfd2e'
@@ -199,16 +200,13 @@ def chunk_data(text):
199200
# paths = os.listdir(path_name)
200201

201202

202-
account_url = f"https://{account_name}.dfs.core.windows.net"
203+
account_url = f"https://{account_name}.blob.core.windows.net"
204+
blob_service_client = BlobServiceClient(account_url, credential=credential)
205+
container_client = blob_service_client.get_container_client(file_system_client_name)
203206

204-
service_client = DataLakeServiceClient(
205-
account_url, credential=credential, api_version="2023-01-03"
206-
)
207+
print(f"Listing blobs under '{directory}' using BlobServiceClient...")
208+
paths = [blob.name for blob in container_client.list_blobs(name_starts_with=directory)]
207209

208-
file_system_client = service_client.get_file_system_client(file_system_client_name)
209-
directory_name = directory
210-
paths = file_system_client.get_paths(path=directory_name)
211-
print(paths)
212210

213211
search_client = SearchClient(search_endpoint, index_name, credential)
214212
# index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)
@@ -221,22 +219,22 @@ def chunk_data(text):
221219
# Read the CSV file into a Pandas DataFrame
222220
file_path = csv_file_name
223221
print(file_path)
224-
file_client = file_system_client.get_file_client(file_path)
225-
csv_file = file_client.download_file()
226-
df_metadata = pd.read_csv(csv_file, encoding="utf-8")
222+
blob_client = container_client.get_blob_client(file_path)
223+
download_stream = blob_client.download_blob()
224+
df_metadata = pd.read_csv(download_stream, encoding="utf-8")
227225

228226
docs = []
229227
counter = 0
230-
for path in paths:
231-
# file_path = f'Data/{foldername}/meeting_transcripts/' + path
232-
# with open(file_path, "r") as file:
233-
# data = json.load(file)
234-
file_client = file_system_client.get_file_client(path.name)
235-
data_file = file_client.download_file()
236-
data = json.load(data_file)
237-
text = data["Content"]
238-
239-
filename = path.name.split("/")[-1]
228+
for blob_name in paths:
229+
if not blob_name.endswith(".json"):
230+
continue
231+
232+
blob_client = container_client.get_blob_client(blob_name)
233+
download_stream = blob_client.download_blob()
234+
data = json.loads(download_stream.readall())
235+
text = data.get("Content", "")
236+
237+
filename = blob_name.split("/")[-1]
240238
document_id = filename.replace(".json", "").replace("convo_", "")
241239
# print(document_id)
242240
df_file_metadata = df_metadata[
@@ -276,15 +274,15 @@ def chunk_data(text):
276274
"chunk_id": d["chunk_id"],
277275
"client_id": d["client_id"],
278276
"content": d["content"],
279-
"sourceurl": path.name.split("/")[-1],
277+
"sourceurl": blob_name.split("/")[-1],
280278
"contentVector": v_contentVector,
281279
}
282280
)
283281

284282
if counter % 10 == 0:
285283
result = search_client.upload_documents(documents=docs)
286284
docs = []
287-
print(f" {str(counter)} uploaded")
285+
print(f"{counter} documents uploaded...")
288286

289287
# upload the last batch
290288
if docs != []:

infra/scripts/process_sample_data.sh

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -332,41 +332,41 @@ get_values_from_azd_env() {
332332
}
333333

334334
get_values_from_az_deployment() {
335-
echo "Getting values from Azure deployment outputs..."
336-
337-
deploymentName=$(az group show --name "$resourceGroupName" --query "tags.DeploymentName" -o tsv)
338-
echo "Deployment Name (from tag): $deploymentName"
339-
335+
echo "Getting values from Azure deployment outputs..."
336+
337+
deploymentName=$(az group show --name "$resourceGroupName" --query "tags.DeploymentName" -o tsv)
338+
echo "Deployment Name (from tag): $deploymentName"
339+
340340
echo "Fetching deployment outputs..."
341-
341+
342342
# Get all outputs
343343
deploymentOutputs=$(az deployment group show \
344344
--name "$deploymentName" \
345345
--resource-group "$resourceGroupName" \
346346
--query "properties.outputs" -o json)
347-
347+
348348
# Extract each value
349-
cosmosDbAccountName=$(echo "$deploymentOutputs" | grep -A 3 '"cosmosdB_ACCOUNT_NAME"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
350-
storageAccount=$(echo "$deploymentOutputs" | grep -A 3 '"storagE_ACCOUNT_NAME"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
351-
fileSystem=$(echo "$deploymentOutputs" | grep -A 3 '"storagE_CONTAINER_NAME"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
352-
keyvaultName=$(echo "$deploymentOutputs" | grep -A 3 '"keY_VAULT_NAME"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
353-
sqlServerName=$(echo "$deploymentOutputs" | grep -A 3 '"sqldB_SERVER_NAME"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
354-
webAppManagedIdentityDisplayName=$(echo "$deploymentOutputs" | grep -A 3 '"managedidentitY_WEBAPP_NAME"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
355-
webAppManagedIdentityClientId=$(echo "$deploymentOutputs" | grep -A 3 '"managedidentitY_WEBAPP_CLIENTID"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
356-
SqlDatabaseName=$(echo "$deploymentOutputs" | grep -A 3 '"sqldB_DATABASE"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
357-
sqlManagedIdentityClientId=$(echo "$deploymentOutputs" | grep -A 3 '"managedidentitY_SQL_CLIENTID"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
358-
sqlManagedIdentityDisplayName=$(echo "$deploymentOutputs" | grep -A 3 '"managedidentitY_SQL_NAME"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
359-
aiSearchName=$(echo "$deploymentOutputs" | grep -A 3 '"aI_SEARCH_SERVICE_NAME"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
360-
aif_resource_id=$(echo "$deploymentOutputs" | grep -A 3 '"aI_FOUNDRY_RESOURCE_ID"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
361-
362-
# Validate that we extracted all required values
363-
if [ -z "$cosmosDbAccountName" ] || [ -z "$storageAccount" ] || [ -z "$fileSystem" ] || [ -z "$keyvaultName" ] || [ -z "$sqlServerName" ] || [ -z "$SqlDatabaseName" ] || [ -z "$sqlManagedIdentityClientId" ] || [ -z "$sqlManagedIdentityDisplayName" ] || [ -z "$aiSearchName" ] || [ -z "$aif_resource_id" ]; then
364-
echo "Error: One or more required values could not be retrieved from deployment outputs."
365-
return 1
366-
else
367-
echo "All values retrieved successfully from deployment outputs."
368-
return 0
369-
fi
349+
cosmosDbAccountName=$(echo "$deploymentOutputs" | grep -A 3 '"cosmosDbAccountName"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
350+
storageAccount=$(echo "$deploymentOutputs" | grep -A 3 '"storageAccountName"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
351+
fileSystem=$(echo "$deploymentOutputs" | grep -A 3 '"storageContainerName"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
352+
keyvaultName=$(echo "$deploymentOutputs" | grep -A 3 '"keyVaultName"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
353+
sqlServerName=$(echo "$deploymentOutputs" | grep -A 3 '"sqlDbServerName"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
354+
webAppManagedIdentityDisplayName=$(echo "$deploymentOutputs" | grep -A 3 '"managedIdentityWebAppName"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
355+
webAppManagedIdentityClientId=$(echo "$deploymentOutputs" | grep -A 3 '"managedIdentityWebAppClientId"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
356+
SqlDatabaseName=$(echo "$deploymentOutputs" | grep -A 3 '"sqlDbDatabase"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
357+
sqlManagedIdentityClientId=$(echo "$deploymentOutputs" | grep -A 3 '"managedIdentitySqlClientId"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
358+
sqlManagedIdentityDisplayName=$(echo "$deploymentOutputs" | grep -A 3 '"managedIdentitySqlName"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
359+
aiSearchName=$(echo "$deploymentOutputs" | grep -A 3 '"aiSearchServiceName"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
360+
aif_resource_id=$(echo "$deploymentOutputs" | grep -A 3 '"aiFoundryResourceId"' | grep '"value"' | sed 's/.*"value": *"\([^"]*\)".*/\1/')
361+
362+
# Validate that we extracted all required values
363+
if [ -z "$cosmosDbAccountName" ] || [ -z "$storageAccount" ] || [ -z "$fileSystem" ] || [ -z "$keyvaultName" ] || [ -z "$sqlServerName" ] || [ -z "$SqlDatabaseName" ] || [ -z "$sqlManagedIdentityClientId" ] || [ -z "$sqlManagedIdentityDisplayName" ] || [ -z "$aiSearchName" ] || [ -z "$aif_resource_id" ]; then
364+
echo "Error: One or more required values could not be retrieved from deployment outputs."
365+
return 1
366+
else
367+
echo "All values retrieved successfully from deployment outputs."
368+
return 0
369+
fi
370370
}
371371

372372
get_values_from_user() {

0 commit comments

Comments
 (0)