Skip to content

Commit 5497cce

Browse files
Add scripts for processing sample data and indexing in Azure Search
1 parent ce1d242 commit 5497cce

File tree

4 files changed

+412
-0
lines changed

4 files changed

+412
-0
lines changed
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
#Requires -Version 7.0

# Script parameters. Each one is optional on the command line; anything left
# empty is looked up in the azd environment below.
param(
    [string]$StorageAccount,
    [string]$BlobContainer,
    [string]$AiSearch
)

# Get parameters from azd env, if not provided
if ([string]::IsNullOrEmpty($StorageAccount)) {
    $StorageAccount = azd env get-value AZURE_STORAGE_ACCOUNT_NAME
}

if ([string]::IsNullOrEmpty($BlobContainer)) {
    $BlobContainer = azd env get-value AZURE_STORAGE_CONTAINER_NAME
}

if ([string]::IsNullOrEmpty($AiSearch)) {
    $AiSearch = azd env get-value AZURE_SEARCH_ENDPOINT
}

# Subscription recorded in the azd environment; compared against the active
# Azure CLI subscription later in the script.
$AzSubscriptionId = azd env get-value AZURE_SUBSCRIPTION_ID

# Check if all required arguments are provided
$haveAllArguments = $StorageAccount -and $BlobContainer -and $AiSearch
if (-not $haveAllArguments) {
    Write-Host "Usage: .\infra\scripts\Process-Sample-Data.ps1 -StorageAccount <StorageAccount> -BlobContainer <StorageContainerName> -AiSearch <AISearchName/AISearchEndpoint>"
    exit 1
}
29+
# Authenticate with Azure
#
# Leaves the active account (as returned by `az account show`) in
# $currentAzContext. A native command that fails does not throw in PowerShell,
# and an empty pipeline never invokes ConvertFrom-Json, so a try/catch around
# `az account show | ConvertFrom-Json` cannot detect "not logged in". The exit
# code is checked explicitly instead.
if (-not (Get-Command az -ErrorAction SilentlyContinue)) {
    Write-Host "Error: Azure CLI (az) is not installed or it is not added in the PATH."
    exit 1
}

$currentAzContext = $null
# --output json guards against a user-configured default output format
# (e.g. table) that ConvertFrom-Json could not parse.
$accountJson = az account show --output json
if ($LASTEXITCODE -eq 0 -and $accountJson) {
    $currentAzContext = $accountJson | ConvertFrom-Json
    Write-Host "Already authenticated with Azure."
}
else {
    Write-Host "Not authenticated with Azure. Attempting to authenticate..."
    Write-Host "Authenticating with Azure CLI..."
    az login
    if ($LASTEXITCODE -ne 0) {
        Write-Host "Authentication failed."
        exit 1
    }
    $currentAzContext = az account show --output json | ConvertFrom-Json
}
45+
# Check if user has selected the correct subscription
#
# Compares the active Azure CLI subscription with the one recorded in the azd
# environment ($AzSubscriptionId). On a mismatch the user may keep the current
# subscription or pick another enabled one from a numbered list.
$currentSubscriptionId = $currentAzContext.id
$currentSubscriptionName = $currentAzContext.name

if ($currentSubscriptionId -ne $AzSubscriptionId) {
    Write-Host "Current selected subscription is $currentSubscriptionName ( $currentSubscriptionId )."
    $confirmation = Read-Host "Do you want to continue with this subscription? (y/n)"

    if ("$confirmation".Trim().ToLower() -ne "y") {
        Write-Host "Fetching available subscriptions..."

        # @(...) keeps the result an array even when zero or one subscription
        # comes back, so .Count and indexing behave predictably.
        $subscriptionArray = @(
            az account list --query "[?state=='Enabled']" --output json |
                ConvertFrom-Json -AsHashtable |
                ForEach-Object {
                    [PSCustomObject]@{
                        Name = $_.name
                        Id   = $_.id
                    }
                }
        )

        # Without this guard the selection loop below could never terminate.
        if ($subscriptionArray.Count -eq 0) {
            Write-Host "Error: No enabled subscriptions are available for the signed-in account."
            exit 1
        }

        do {
            Write-Host ""
            Write-Host "Available Subscriptions:"
            Write-Host "========================"
            for ($i = 0; $i -lt $subscriptionArray.Count; $i++) {
                Write-Host "$($i+1). $($subscriptionArray[$i].Name) ( $($subscriptionArray[$i].Id) )"
            }
            Write-Host "========================"
            Write-Host ""

            $selection = Read-Host "Enter the number of the subscription (1-$($subscriptionArray.Count)) to use"

            # TryParse instead of an [int] cast: a non-numeric answer must be
            # treated as an invalid selection, not raise a conversion error
            # and leave the previous iteration's index in place.
            $subscriptionIndex = 0
            $isNumber = [int]::TryParse($selection, [ref]$subscriptionIndex)

            if ($isNumber -and $subscriptionIndex -ge 1 -and $subscriptionIndex -le $subscriptionArray.Count) {
                $selectedSubscription = $subscriptionArray[$subscriptionIndex - 1]
                $selectedSubscriptionName = $selectedSubscription.Name
                $selectedSubscriptionId = $selectedSubscription.Id

                # Set the selected subscription
                az account set --subscription $selectedSubscriptionId
                if ($LASTEXITCODE -eq 0) {
                    Write-Host "Switched to subscription: $selectedSubscriptionName ( $selectedSubscriptionId )"
                    break
                }
                else {
                    Write-Host "Failed to switch to subscription: $selectedSubscriptionName ( $selectedSubscriptionId )."
                }
            }
            else {
                Write-Host "Invalid selection. Please try again."
            }
        } while ($true)
    }
    else {
        Write-Host "Proceeding with the current subscription: $currentSubscriptionName ( $currentSubscriptionId )"
        az account set --subscription $currentSubscriptionId
    }
}
else {
    Write-Host "Proceeding with the subscription: $currentSubscriptionName ( $currentSubscriptionId )"
    az account set --subscription $currentSubscriptionId
}
107+
# Upload sample files to blob storage
#
# Pushes everything under data/datasets (relative to the current directory)
# into the target container, overwriting existing blobs, using the signed-in
# Azure CLI identity (--auth-mode login).
Write-Host "Uploading sample files to blob storage..."
$uploadOutput = az storage blob upload-batch `
    --account-name $StorageAccount `
    --destination $BlobContainer `
    --source "data/datasets" `
    --auth-mode login `
    --pattern "*" `
    --overwrite `
    --output none

$uploadSucceeded = ($LASTEXITCODE -eq 0)
if (-not $uploadSucceeded) {
    Write-Host "Error: Failed to upload files to blob storage."
    exit 1
}
Write-Host "Files uploaded successfully to blob storage."
117+
# Determine the correct Python command
#
# Tries "python" first, then "python3". A candidate is accepted when its
# --version output looks like "Python <digit>". The winner is stored in
# $pythonCmd; if neither works the script stops.
$pythonCmd = $null

foreach ($candidate in @("python", "python3")) {
    try {
        $pythonVersion = (& $candidate --version) 2>&1
        if ($pythonVersion -match "Python \d") {
            $pythonCmd = $candidate
            break
        }
    }
    catch {
        # Candidate could not be started; fall through to the next one.
    }
}

if (-not $pythonCmd) {
    Write-Host "Python is not installed on this system or it is not added in the PATH."
    exit 1
}
148+
# Create virtual environment
$venvPath = "infra/scripts/scriptenv"
if (-not (Test-Path $venvPath)) {
    Write-Host "Creating virtual environment"
    & $pythonCmd -m venv $venvPath
}
else {
    Write-Host "Virtual environment already exists. Skipping creation."
}

# Activate the virtual environment
# The activation script is looked up under bin/ first, then under Scripts/.
$activateScript = $null
foreach ($relativePath in @("bin/Activate.ps1", "Scripts/Activate.ps1")) {
    $candidateScript = Join-Path -Path $venvPath -ChildPath $relativePath
    if (Test-Path $candidateScript) {
        $activateScript = $candidateScript
        break
    }
}

if (-not $activateScript) {
    # Best effort: keep going even without an activated environment.
    Write-Host "Error activating virtual environment. Requirements may be installed globally."
}
else {
    Write-Host "Activating virtual environment"
    . $activateScript # Use dot sourcing to run in the current scope
}
175+
# Install the requirements
#
# pip is invoked through $pythonCmd so the packages are installed for the same
# interpreter that runs the indexing script below. The exit code is checked so
# a failed install is reported instead of printing "Requirements installed".
Write-Host "Installing requirements"
& $pythonCmd -m pip install --quiet -r infra/scripts/requirements.txt
if ($LASTEXITCODE -ne 0) {
    Write-Host "Error: Failed to install requirements."
    exit 1
}
Write-Host "Requirements installed"

# Run the Python script to index data
#
# The call operator passes each value as its own argument (quoted when
# needed), which Start-Process -ArgumentList does not do for values
# containing spaces.
Write-Host "Running the python script to index data"
& $pythonCmd "infra/scripts/index_datasets.py" $StorageAccount $BlobContainer $AiSearch

if ($LASTEXITCODE -ne 0) {
    Write-Host "Error: Indexing python script execution failed."
    exit 1
}

Write-Host "Script executed successfully. Sample Data Processed Successfully."

infra/scripts/index_datasets.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
from azure.identity import AzureCliCredential
2+
from azure.search.documents import SearchClient
3+
from azure.search.documents.indexes import SearchIndexClient
4+
from azure.search.documents.indexes.models import SearchIndex, SimpleField, SearchField, SearchFieldDataType
5+
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
6+
import csv
7+
import sys
8+
import io
9+
# Command-line arguments:
#   1. storage account name
#   2. blob container name
#   3. Azure AI Search service name or endpoint
# All three are required; checking only for "more than one" element would let
# the indexing below raise IndexError when one or two arguments are given.
if len(sys.argv) >= 4:
    storage_account_name, blob_container_name, ai_search_endpoint = sys.argv[1:4]
    if "search.windows.net" not in ai_search_endpoint:
        # A bare service name was given: expand it to the full endpoint URL.
        ai_search_endpoint = f"https://{ai_search_endpoint}.search.windows.net"
    elif not ai_search_endpoint.lower().startswith(("https://", "http://")):
        # Host name without a scheme, e.g. "name.search.windows.net".
        ai_search_endpoint = f"https://{ai_search_endpoint}"
else:
    print("Usage: python index_datasets.py <storage_account_name> <blob_container_name> <ai_search_endpoint>")
    sys.exit(1)
19+
# Authenticate with the identity currently signed in to the Azure CLI.
credential = AzureCliCredential()

blob_service_client = BlobServiceClient(account_url=f"https://{storage_account_name}.blob.core.windows.net", credential=credential)
container_client = blob_service_client.get_container_client(blob_container_name)

try:
    print("Fetching files in container...")
    # list_blobs() returns a lazy pager: no request is sent until it is
    # iterated. Materialise it here so that authentication or network errors
    # are raised inside this try block rather than later in the processing loop.
    blob_list = list(container_client.list_blobs())
except Exception as e:
    print(f"Error fetching files: {e}")
    sys.exit(1)
31+
# For every "*.csv" blob in the container: read it, create (or update) a
# search index named after the blob, and upload each CSV row as a document.
# success_count / fail_count count blobs, not individual documents.
success_count = 0
fail_count = 0

CSV_SUFFIX = ".csv"
# Azure AI Search accepts at most 1000 documents in a single indexing request.
MAX_UPLOAD_BATCH = 1000

for blob in blob_list:
    if not blob.name.endswith(CSV_SUFFIX):
        continue

    # Strip only the trailing extension; str.replace would also remove any
    # ".csv" occurring elsewhere in the blob name.
    index_name = blob.name[: -len(CSV_SUFFIX)].lower()

    data_list = []
    try:
        print(f"Reading data from blob: {blob.name}...")
        # The download is inside the try block so one unreadable blob is
        # counted as a failure instead of aborting the whole run.
        csv_data = container_client.download_blob(blob.name).readall()
        # "utf-8-sig" drops a leading byte-order mark, which would otherwise
        # become part of the first column name.
        csv_text = csv_data.decode("utf-8-sig")
        data_list = list(csv.DictReader(io.StringIO(csv_text)))
        print(f"Loaded {len(data_list)} records from CSV file - {blob.name}.")
    except Exception as e:
        print(f"Error reading CSV file - {blob.name}: {e}")
        fail_count += 1
        continue

    if not data_list:
        print(f"No data found in CSV file - {blob.name}. Skipping.")
        fail_count += 1
        continue

    headers = list(data_list[0].keys())

    # One string key field ("Id") plus one searchable string field per column.
    index_fields = [SimpleField(name="Id", type=SearchFieldDataType.String, key=True)]
    index_fields.extend(
        SearchField(name=header, type=SearchFieldDataType.String, searchable=True)
        for header in headers
    )

    index = SearchIndex(name=index_name, fields=index_fields)

    try:
        print("Creating or updating Azure Search index...")
        search_index_client = SearchIndexClient(endpoint=ai_search_endpoint, credential=credential)
        search_index_client.create_or_update_index(index=index)
        print(f"Index '{index_name}' created or updated successfully.")
    except Exception as e:
        print(f"Error creating/updating index: {e}")
        fail_count += 1
        continue

    # The document key is the 1-based row number within the CSV file.
    for idx, item in enumerate(data_list, start=1):
        item["Id"] = str(idx)

    try:
        print("Uploading documents to the index...")
        search_client = SearchClient(endpoint=ai_search_endpoint, index_name=index_name, credential=credential)
        rejected_keys = []
        for start in range(0, len(data_list), MAX_UPLOAD_BATCH):
            batch = data_list[start:start + MAX_UPLOAD_BATCH]
            results = search_client.upload_documents(documents=batch)
            # A request can succeed overall while individual documents are
            # rejected; each result carries its own `succeeded` flag.
            rejected_keys.extend(r.key for r in results if not r.succeeded)
        if rejected_keys:
            raise RuntimeError(
                f"{len(rejected_keys)} of {len(data_list)} documents were rejected by the service"
            )
        print(f"Uploaded {len(data_list)} documents.")
        success_count += 1
    except Exception as e:
        print(f"Error uploading documents: {e}")
        fail_count += 1

print(f"Processing complete. Success: {success_count}, Failed: {fail_count}")

# Report failures through the exit code so a calling script can detect them.
if fail_count:
    sys.exit(1)

0 commit comments

Comments
 (0)