2 changes: 1 addition & 1 deletion documents/CustomizeData.md
@@ -30,7 +30,7 @@ If you would like to update the solution to leverage your own data please follow
<AI-Search-Name> <Search-Endpoint> \
<AI-Foundry-Resource-ID> <CU-Foundry-Resource-ID> \
<OpenAI-Endpoint> <Embedding-Model> <Deployment-Model> \
<CU-Endpoint> <AI-Agent-Endpoint> <CU-API-Version>
<CU-Endpoint> <CU-API-Version> <AI-Agent-Endpoint>
```

## How to Login to VM Using Azure Bastion
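The swap matters because the script consumes these values positionally. A minimal sketch of the unpacking this order implies, assuming a plain `sys.argv` read (variable names are illustrative, not the script's actual ones):

```python
import sys

# Illustrative positional unpacking matching the documented order above;
# <CU-API-Version> now precedes <AI-Agent-Endpoint>, so docs written for the
# old order would silently feed the API version into the agent-endpoint slot.
(
    ai_search_name, search_endpoint,
    ai_foundry_resource_id, cu_foundry_resource_id,
    openai_endpoint, embedding_model, deployment_model,
    cu_endpoint, cu_api_version, ai_agent_endpoint,
) = sys.argv[1:11]

print(f"CU API version: {cu_api_version}")
print(f"AI agent endpoint: {ai_agent_endpoint}")
```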
2 changes: 1 addition & 1 deletion documents/DeploymentGuide.md
@@ -309,7 +309,7 @@ Once you've opened the project in [Codespaces](#github-codespaces), [Dev Contain
<AI-Search-Name> <Search-Endpoint> \
<AI-Foundry-Resource-ID> <CU-Foundry-Resource-ID> \
<OpenAI-Endpoint> <Embedding-Model> <Deployment-Model> \
<CU-Endpoint> <AI-Agent-Endpoint> <CU-API-Version> <Use-Case>
<CU-Endpoint> <CU-API-Version> <AI-Agent-Endpoint> <Use-Case>
```

10. Once the script has run successfully, open the [Azure Portal](https://portal.azure.com/), go to the deployed resource group, find the App Service, and get the app URL from `Default domain`.
113 changes: 54 additions & 59 deletions infra/scripts/index_scripts/00_create_sample_data_files.py
@@ -1,18 +1,20 @@
import pyodbc
import struct
import csv
import json
import os
from datetime import datetime
from azure.identity import AzureCliCredential, get_bearer_token_provider
import struct

import pyodbc
from azure.identity import AzureCliCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
# SQL Server setup

SQL_SERVER = '<YOUR-SQL-SERVER-NAME>.database.windows.net'
SQL_DATABASE = '<YOUR-DATABASE-NAME>'

credential = AzureCliCredential(process_timeout=30)

try:
try:
    driver = "{ODBC Driver 18 for SQL Server}"
    token_bytes = credential.get_token("https://database.windows.net/.default").token.encode("utf-16-LE")
    token_struct = struct.pack(f"<I{len(token_bytes)}s", len(token_bytes), token_bytes)
@@ -21,7 +23,7 @@
    conn = pyodbc.connect(connection_string, attrs_before={SQL_COPT_SS_ACCESS_TOKEN: token_struct})
    cursor = conn.cursor()
    print("SQL Server connection established.")
except:
except Exception:
    driver = "{ODBC Driver 17 for SQL Server}"
    token_bytes = credential.get_token("https://database.windows.net/.default").token.encode("utf-16-LE")
    token_struct = struct.pack(f"<I{len(token_bytes)}s", len(token_bytes), token_bytes)
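The two branches differ only in the driver string, so the retry could be expressed as a loop. A sketch of that alternative under the same token handshake, assuming `SQL_COPT_SS_ACCESS_TOKEN` is the usual msodbcsql pre-connect attribute (1256), presumably defined in the collapsed lines above:

```python
import struct

import pyodbc
from azure.identity import AzureCliCredential

SQL_COPT_SS_ACCESS_TOKEN = 1256  # pyodbc pre-connect attribute for AAD access tokens


def connect_with_fallback(server, database, credential):
    """Try ODBC Driver 18 first, then fall back to 17 (sketch; connection-string details simplified)."""
    token_bytes = credential.get_token(
        "https://database.windows.net/.default"
    ).token.encode("utf-16-LE")
    token_struct = struct.pack(f"<I{len(token_bytes)}s", len(token_bytes), token_bytes)

    last_error = None
    for driver in ("{ODBC Driver 18 for SQL Server}", "{ODBC Driver 17 for SQL Server}"):
        connection_string = f"DRIVER={driver};SERVER={server};DATABASE={database}"
        try:
            return pyodbc.connect(connection_string, attrs_before={SQL_COPT_SS_ACCESS_TOKEN: token_struct})
        except pyodbc.Error as exc:
            last_error = exc
    raise last_error


# conn = connect_with_fallback(SQL_SERVER, SQL_DATABASE, AzureCliCredential(process_timeout=30))
```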
@@ -31,62 +33,62 @@
    cursor = conn.cursor()
    print("SQL Server connection established.")


def export_table_to_csv(table_name, output_dir=".", cursor=cursor):
    """
    Export a SQL table to CSV file.

    Args:
        table_name: Name of the table to export
        output_dir: Directory to save the CSV file (default: current directory)
        cursor: Database cursor to use (default: uses the global cursor)

    Returns:
        Path to the created CSV file
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate output filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Generate output filename
    csv_filename = f"sample_{table_name}.csv"
    # csv_filename = f"{table_name}_{timestamp}.csv"
    csv_path = os.path.join(output_dir, csv_filename)

    try:
        # Query all data from the table
        query = f"SELECT * FROM {table_name}"
        print(f"Executing query: {query}")
        cursor.execute(query)

        # Get column names
        columns = [column[0] for column in cursor.description]

        # Write to CSV
        print(f"Writing data to '{csv_path}'...")
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)

            # Write header
            writer.writerow(columns)

            # Write data rows
            row_count = 0
            while True:
                rows = cursor.fetchmany(1000)  # Fetch in batches for better performance
                if not rows:
                    break

                for row in rows:
                    # Convert each value to string, handling None values
                    writer.writerow([str(val) if val is not None else '' for val in row])
                    row_count += 1

                    if row_count % 10000 == 0:
                        print(f" Exported {row_count} rows...")

        print(f"✓ Successfully exported {row_count} rows to '{csv_path}'")
        return csv_path

    except Exception as e:
        print(f"Error exporting table '{table_name}': {e}")
        raise
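For context, calling the helper is a one-liner. A hypothetical invocation (the table name here is illustrative; the script's real calls sit in a collapsed region near the end of the file):

```python
# Hypothetical usage; "processed_data" is an illustrative table name.
csv_path = export_table_to_csv("processed_data", output_dir="./exported_data")
print(f"CSV written to: {csv_path}")
```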
@@ -95,46 +97,45 @@ def export_table_to_csv(table_name, output_dir=".", cursor=cursor)
def export_table_to_json(table_name, output_dir=".", cursor=cursor, format="json"):
    """
    Export a SQL table to JSON or JSON Lines file.

    Args:
        table_name: Name of the table to export
        output_dir: Directory to save the file (default: current directory)
        cursor: Database cursor to use (default: uses the global cursor)
        format: Output format - "json" for JSON array or "jsonl" for JSON Lines (default: "json")

    Returns:
        Path to the created file
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate output filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_extension = "json" if format == "json" else "jsonl"
    filename = f"sample_{table_name}.{file_extension}"
    # filename = f"{table_name}_{timestamp}.{file_extension}"
    file_path = os.path.join(output_dir, filename)

    try:
        # Query all data from the table
        query = f"SELECT * FROM {table_name}"
        print(f"Executing query: {query}")
        cursor.execute(query)

        # Get column names
        columns = [column[0] for column in cursor.description]

        if format == "json":
            # Collect all rows for JSON array format
            print(f"Collecting data from table...")
            print("Collecting data from table...")
            rows_data = []
            row_count = 0

            while True:
                rows = cursor.fetchmany(1000)  # Fetch in batches for better performance
                if not rows:
                    break

                for row in rows:
                    # Convert row to dictionary
                    row_dict = {}
@@ -147,26 +148,26 @@ def export_table_to_json(table_name, output_dir=".", cursor=cursor, format="json
                        row_dict[col] = value
                    rows_data.append(row_dict)
                    row_count += 1

                    if row_count % 10000 == 0:
                        print(f" Collected {row_count} rows...")

            # Write as JSON array
            print(f"Writing {row_count} rows to '{file_path}'...")
            with open(file_path, 'w', encoding='utf-8') as json_file:
                json.dump(rows_data, json_file, ensure_ascii=False, indent=2)

        else:  # jsonl format
            # Write to JSON Lines format (one JSON object per line)
            print(f"Writing data to '{file_path}'...")
            row_count = 0

            with open(file_path, 'w', encoding='utf-8') as jsonl_file:
                while True:
                    rows = cursor.fetchmany(1000)  # Fetch in batches for better performance
                    if not rows:
                        break

                    for row in rows:
                        # Convert row to dictionary
                        row_dict = {}
@@ -179,13 +180,13 @@ def export_table_to_json(table_name, output_dir=".", cursor=cursor, format="json
                            row_dict[col] = value
                        jsonl_file.write(json.dumps(row_dict, ensure_ascii=False) + '\n')
                        row_count += 1

                        if row_count % 10000 == 0:
                            print(f" Exported {row_count} rows...")

        print(f"✓ Successfully exported {row_count} rows to '{file_path}'")
        return file_path

    except Exception as e:
        print(f"Error exporting table '{table_name}': {e}")
        raise
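As with the CSV helper, usage is a single call per output format. A hypothetical example (table name illustrative):

```python
# Hypothetical usage; one call per output format.
export_table_to_json("processed_data", output_dir="./exported_data", format="json")
export_table_to_json("processed_data", output_dir="./exported_data", format="jsonl")
```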
@@ -202,10 +203,6 @@ def export_table_to_json(table_name, output_dir=".", cursor=cursor, format="json
cursor.close()
conn.close()
print("Database connection closed.")


from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
SEARCH_ENDPOINT = "https://<YOUR-SEARCH-SERVICE-NAME>.search.windows.net"
INDEX_NAME = "call_transcripts_index"
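The `search_client` default used by the next function is built in the collapsed lines; a sketch of the typical `azure-search-documents` setup, reusing the `credential` created at the top of the script (an assumption, not necessarily the script's exact code):

```python
from azure.search.documents import SearchClient

# Sketch of the collapsed client setup; constants come from just above.
search_client = SearchClient(
    endpoint=SEARCH_ENDPOINT,
    index_name=INDEX_NAME,
    credential=credential,
)
```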

@@ -219,26 +216,25 @@ def export_table_to_json(table_name, output_dir=".", cursor=cursor, format="json
def export_search_index_to_json(index_name, output_dir=".", search_client=search_client, format="json"):
    """
    Export all documents from an Azure AI Search index to JSON or JSON Lines file.

    Args:
        index_name: Name of the search index
        output_dir: Directory to save the file (default: current directory)
        search_client: Azure Search client to use (default: uses the global search_client)
        format: Output format - "json" for JSON array or "jsonl" for JSON Lines (default: "json")

    Returns:
        Path to the created file
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate output filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_extension = "json" if format == "json" else "jsonl"
    filename = f"sample_{index_name}.{file_extension}"
    # filename = f"{index_name}_{timestamp}.{file_extension}"
    file_path = os.path.join(output_dir, filename)

    try:
        # Search for all documents (empty search returns everything)
        print(f"Retrieving documents from search index '{index_name}'...")
@@ -247,31 +243,31 @@ def export_search_index_to_json(index_name, output_dir=".", search_client=search
            include_total_count=True,
            top=1000  # Adjust batch size as needed
        )

        # Collect documents
        documents = []
        doc_count = 0

        if format == "json":
            # Collect all documents for JSON array format
            print(f"Collecting documents...")
            print("Collecting documents...")
            for result in results:
                doc = dict(result)
                documents.append(doc)
                doc_count += 1

                if doc_count % 1000 == 0:
                    print(f" Collected {doc_count} documents...")

            if doc_count == 0:
                print(f"No documents found in index '{index_name}'")
                return None

            # Write as JSON array
            print(f"Writing {doc_count} documents to '{file_path}'...")
            with open(file_path, 'w', encoding='utf-8') as json_file:
                json.dump(documents, json_file, ensure_ascii=False, indent=2)

        else:  # jsonl format
            # Write to JSON Lines format (one JSON object per line)
            print(f"Writing documents to '{file_path}'...")
@@ -280,17 +276,17 @@ def export_search_index_to_json(index_name, output_dir=".", search_client=search
                doc = dict(result)
                jsonl_file.write(json.dumps(doc, ensure_ascii=False) + '\n')
                doc_count += 1

                if doc_count % 1000 == 0:
                    print(f" Exported {doc_count} documents...")

        if doc_count == 0:
            print(f"No documents found in index '{index_name}'")
            return None

        print(f"✓ Successfully exported {doc_count} documents to '{file_path}'")
        return file_path

    except Exception as e:
        print(f"Error exporting search index '{index_name}': {e}")
        raise
@@ -299,4 +295,3 @@ def export_search_index_to_json(index_name, output_dir=".", search_client=search
# Export search index to JSON and JSON Lines
export_search_index_to_json(INDEX_NAME, output_dir="./exported_data", format="json")
# export_search_index_to_json(INDEX_NAME, output_dir="./exported_data", format="jsonl")

3 changes: 2 additions & 1 deletion infra/scripts/index_scripts/01_create_search_index.py
@@ -30,6 +30,7 @@

INDEX_NAME = "call_transcripts_index"


def create_search_index():
    """
    Creates or updates an Azure Cognitive Search index configured for:
@@ -105,4 +106,4 @@ def create_search_index():
print(f"✓ Search index '{result.name}' created")


create_search_index()
create_search_index()