
Commit 673ff45

Refactor data processing scripts and enhance agent integration
- Updated 00_create_sample_data_files.py to improve the CSV and JSON export functions, with better error handling and code readability.
- Modified 01_create_search_index.py to add whitespace for consistency.
- Enhanced 03_cu_process_data_text.py by implementing asynchronous processing for embeddings and agent creation, improving performance and scalability.
- Updated 04_cu_process_custom_data.py to streamline the search index creation process and improve error handling.
- Adjusted requirements.txt to include the new agent framework dependencies and ensure compatibility.
- Enhanced process_sample_data.sh and run_create_index_scripts.sh to support a new solution_name parameter for better configuration management.
1 parent 167e1a7 commit 673ff45
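
The 03_cu_process_data_text.py diff is not shown on this page. As a rough illustration of the asynchronous embedding fan-out the message describes — every name here (embed_chunk, the AsyncAzureOpenAI-style client, the deployment name) is an assumption, not the commit's code:

```python
# Hypothetical sketch of the async embedding pattern from the commit message;
# client setup, function names, and the deployment name are all assumed.
import asyncio

async def embed_chunk(openai_client, chunk: str) -> list[float]:
    # One awaitable embedding request per text chunk
    response = await openai_client.embeddings.create(
        model="text-embedding-ada-002",  # assumed deployment name
        input=chunk,
    )
    return response.data[0].embedding

async def embed_all(openai_client, chunks: list[str]) -> list[list[float]]:
    # Issue all embedding requests concurrently instead of one blocking call each
    return await asyncio.gather(*(embed_chunk(openai_client, c) for c in chunks))
```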

File tree — 7 files changed: 385 additions, 255 deletions


infra/scripts/index_scripts/00_create_sample_data_files.py

Lines changed: 51 additions & 53 deletions
```diff
@@ -1,18 +1,21 @@
-import pyodbc
-import struct
 import csv
 import json
 import os
+import struct
 from datetime import datetime
-from azure.identity import AzureCliCredential, get_bearer_token_provider
+
+import pyodbc
+from azure.identity import AzureCliCredential
+from azure.search.documents import SearchClient
+from azure.search.documents.indexes import SearchIndexClient
 # SQL Server setup
 
 SQL_SERVER = '<YOUR-SQL-SERVER-NAME>.database.windows.net'
 SQL_DATABASE = '<YOUR-DATABASE-NAME>'
 
 credential = AzureCliCredential(process_timeout=30)
 
-try:
+try:
     driver = "{ODBC Driver 18 for SQL Server}"
     token_bytes = credential.get_token("https://database.windows.net/.default").token.encode("utf-16-LE")
     token_struct = struct.pack(f"<I{len(token_bytes)}s", len(token_bytes), token_bytes)
```
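
The hunk cuts off just before the connection is opened. The token_struct built here is the documented way to hand a Microsoft Entra ID access token to the ODBC driver; the elided connect call presumably resembles this sketch (the connection-string options are assumptions):

```python
# 1256 is the documented SQL_COPT_SS_ACCESS_TOKEN pre-connect attribute
SQL_COPT_SS_ACCESS_TOKEN = 1256

connection_string = (
    f"DRIVER={driver};SERVER={SQL_SERVER};DATABASE={SQL_DATABASE};"
    "Encrypt=yes;TrustServerCertificate=no"
)
conn = pyodbc.connect(connection_string, attrs_before={SQL_COPT_SS_ACCESS_TOKEN: token_struct})
```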
```diff
@@ -31,15 +34,16 @@
     cursor = conn.cursor()
     print("SQL Server connection established.")
 
+
 def export_table_to_csv(table_name, output_dir=".", cursor=cursor):
     """
     Export a SQL table to CSV file.
-
+
     Args:
         table_name: Name of the table to export
         output_dir: Directory to save the CSV file (default: current directory)
         cursor: Database cursor to use (default: uses the global cursor)
-
+
     Returns:
         Path to the created CSV file
     """
```
```diff
@@ -51,42 +55,42 @@ def export_table_to_csv(table_name, output_dir=".", cursor=cursor):
     csv_filename = f"sample_{table_name}.csv"
     # csv_filename = f"{table_name}_{timestamp}.csv"
     csv_path = os.path.join(output_dir, csv_filename)
-
+
     try:
         # Query all data from the table
         query = f"SELECT * FROM {table_name}"
         print(f"Executing query: {query}")
         cursor.execute(query)
-
+
         # Get column names
        columns = [column[0] for column in cursor.description]
-
+
         # Write to CSV
         print(f"Writing data to '{csv_path}'...")
         with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
             writer = csv.writer(csvfile)
-
+
             # Write header
             writer.writerow(columns)
-
+
             # Write data rows
             row_count = 0
             while True:
                 rows = cursor.fetchmany(1000) # Fetch in batches for better performance
                 if not rows:
                     break
-
+
                 for row in rows:
                     # Convert each value to string, handling None values
                     writer.writerow([str(val) if val is not None else '' for val in row])
                     row_count += 1
-
+
                     if row_count % 10000 == 0:
                         print(f" Exported {row_count} rows...")
-
+
         print(f"✓ Successfully exported {row_count} rows to '{csv_path}'")
         return csv_path
-
+
     except Exception as e:
         print(f"Error exporting table '{table_name}': {e}")
         raise
```
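
A minimal usage sketch for the function above; the table name and output directory are assumed names for illustration:

```python
# Hypothetical call; "processed_data" and "sample_data" are assumed names
csv_path = export_table_to_csv("processed_data", output_dir="sample_data")
print(f"CSV written to {csv_path}")
```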
```diff
@@ -95,46 +99,45 @@ def export_table_to_csv(table_name, output_dir=".", cursor=cursor):
 def export_table_to_json(table_name, output_dir=".", cursor=cursor, format="json"):
     """
     Export a SQL table to JSON or JSON Lines file.
-
+
     Args:
         table_name: Name of the table to export
         output_dir: Directory to save the file (default: current directory)
         cursor: Database cursor to use (default: uses the global cursor)
         format: Output format - "json" for JSON array or "jsonl" for JSON Lines (default: "json")
-
+
     Returns:
         Path to the created file
     """
     # Create output directory if it doesn't exist
     os.makedirs(output_dir, exist_ok=True)
-
+
     # Generate output filename
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     file_extension = "json" if format == "json" else "jsonl"
     filename = f"sample_{table_name}.{file_extension}"
     # filename = f"{table_name}_{timestamp}.{file_extension}"
     file_path = os.path.join(output_dir, filename)
-
+
     try:
         # Query all data from the table
         query = f"SELECT * FROM {table_name}"
         print(f"Executing query: {query}")
         cursor.execute(query)
-
+
         # Get column names
         columns = [column[0] for column in cursor.description]
-
+
         if format == "json":
             # Collect all rows for JSON array format
-            print(f"Collecting data from table...")
+            print("Collecting data from table...")
             rows_data = []
             row_count = 0
-
+
             while True:
                 rows = cursor.fetchmany(1000) # Fetch in batches for better performance
                 if not rows:
                     break
-
+
                 for row in rows:
                     # Convert row to dictionary
                     row_dict = {}
```
```diff
@@ -147,26 +150,26 @@ def export_table_to_json(table_name, output_dir=".", cursor=cursor, format="json
                         row_dict[col] = value
                     rows_data.append(row_dict)
                     row_count += 1
-
+
                     if row_count % 10000 == 0:
                         print(f" Collected {row_count} rows...")
-
+
             # Write as JSON array
             print(f"Writing {row_count} rows to '{file_path}'...")
             with open(file_path, 'w', encoding='utf-8') as json_file:
                 json.dump(rows_data, json_file, ensure_ascii=False, indent=2)
-
+
         else: # jsonl format
             # Write to JSON Lines format (one JSON object per line)
             print(f"Writing data to '{file_path}'...")
             row_count = 0
-
+
             with open(file_path, 'w', encoding='utf-8') as jsonl_file:
                 while True:
                     rows = cursor.fetchmany(1000) # Fetch in batches for better performance
                     if not rows:
                         break
-
+
                     for row in rows:
                         # Convert row to dictionary
                         row_dict = {}
```
```diff
@@ -179,13 +182,13 @@ def export_table_to_json(table_name, output_dir=".", cursor=cursor, format="json
                             row_dict[col] = value
                         jsonl_file.write(json.dumps(row_dict, ensure_ascii=False) + '\n')
                         row_count += 1
-
+
                         if row_count % 10000 == 0:
                             print(f" Exported {row_count} rows...")
-
+
         print(f"✓ Successfully exported {row_count} rows to '{file_path}'")
         return file_path
-
+
     except Exception as e:
         print(f"Error exporting table '{table_name}': {e}")
         raise
```
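
The payoff of the jsonl format is that an export can be streamed back one record per line instead of parsed as a single array. A minimal read-back sketch; the filename is an assumption matching the sample_{table_name}.jsonl pattern above:

```python
import json

# Stream records from a JSON Lines export without loading the whole file
with open("sample_processed_data.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # ... process one record at a time
```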
```diff
@@ -202,10 +205,6 @@ def export_table_to_json(table_name, output_dir=".", cursor=cursor, format="json
     cursor.close()
     conn.close()
     print("Database connection closed.")
-
-
-from azure.search.documents import SearchClient
-from azure.search.documents.indexes import SearchIndexClient
 SEARCH_ENDPOINT = "https://<YOUR-SEARCH-SERVICE-NAME>.search.windows.net"
 INDEX_NAME = "call_transcripts_index"
 
```
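
The lines between this hunk and the next (original lines 212–218), where search_client is constructed, are elided on this page. A typical setup consistent with the imports moved to the top of the file — an assumption, not the commit's code:

```python
# Assumed construction of the module-level search_client used below;
# the actual elided code may differ.
search_client = SearchClient(
    endpoint=SEARCH_ENDPOINT,
    index_name=INDEX_NAME,
    credential=credential,  # the AzureCliCredential created earlier
)
```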

```diff
@@ -219,26 +218,25 @@ def export_search_index_to_json(index_name, output_dir=".", search_client=search
 def export_search_index_to_json(index_name, output_dir=".", search_client=search_client, format="json"):
     """
     Export all documents from an Azure AI Search index to JSON or JSON Lines file.
-
+
     Args:
         index_name: Name of the search index
         output_dir: Directory to save the file (default: current directory)
         search_client: Azure Search client to use (default: uses the global search_client)
         format: Output format - "json" for JSON array or "jsonl" for JSON Lines (default: "json")
-
+
     Returns:
         Path to the created file
     """
     # Create output directory if it doesn't exist
     os.makedirs(output_dir, exist_ok=True)
-
+
     # Generate output filename
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     file_extension = "json" if format == "json" else "jsonl"
     filename = f"sample_{index_name}.{file_extension}"
     # filename = f"{index_name}_{timestamp}.{file_extension}"
     file_path = os.path.join(output_dir, filename)
-
+
     try:
         # Search for all documents (empty search returns everything)
         print(f"Retrieving documents from search index '{index_name}'...")
```
```diff
@@ -247,31 +245,31 @@ def export_search_index_to_json(index_name, output_dir=".", search_client=search
             include_total_count=True,
             top=1000 # Adjust batch size as needed
         )
-
+
         # Collect documents
         documents = []
         doc_count = 0
-
+
         if format == "json":
             # Collect all documents for JSON array format
-            print(f"Collecting documents...")
+            print("Collecting documents...")
             for result in results:
                 doc = dict(result)
                 documents.append(doc)
                 doc_count += 1
-
+
                 if doc_count % 1000 == 0:
                     print(f" Collected {doc_count} documents...")
-
+
             if doc_count == 0:
                 print(f"No documents found in index '{index_name}'")
                 return None
-
+
             # Write as JSON array
             print(f"Writing {doc_count} documents to '{file_path}'...")
             with open(file_path, 'w', encoding='utf-8') as json_file:
                 json.dump(documents, json_file, ensure_ascii=False, indent=2)
-
+
         else: # jsonl format
             # Write to JSON Lines format (one JSON object per line)
             print(f"Writing documents to '{file_path}'...")
```
```diff
@@ -280,17 +278,17 @@ def export_search_index_to_json(index_name, output_dir=".", search_client=search
                 doc = dict(result)
                 jsonl_file.write(json.dumps(doc, ensure_ascii=False) + '\n')
                 doc_count += 1
-
+
                 if doc_count % 1000 == 0:
                     print(f" Exported {doc_count} documents...")
-
+
             if doc_count == 0:
                 print(f"No documents found in index '{index_name}'")
                 return None
-
+
         print(f"✓ Successfully exported {doc_count} documents to '{file_path}'")
         return file_path
-
+
     except Exception as e:
         print(f"Error exporting search index '{index_name}': {e}")
         raise
```
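
A usage sketch for the index exporter above; the output directory and format choice are illustrative:

```python
# Hypothetical invocation of the exporter defined above
jsonl_path = export_search_index_to_json(INDEX_NAME, output_dir="sample_data", format="jsonl")
```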

infra/scripts/index_scripts/01_create_search_index.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -30,6 +30,7 @@
 
 INDEX_NAME = "call_transcripts_index"
 
+
 def create_search_index():
     """
     Creates or updates an Azure Cognitive Search index configured for:
@@ -105,4 +106,4 @@ def create_search_index():
     print(f"✓ Search index '{result.name}' created")
 
 
-create_search_index()
+create_search_index()
```
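
The body of create_search_index is elided in this diff. A minimal sketch of what such a function can look like with the azure-search-documents SDK — the field schema here is an assumption, not the repository's actual index definition, and SEARCH_ENDPOINT is assumed to be a module constant like the one in 00_create_sample_data_files.py:

```python
from azure.identity import AzureCliCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchFieldDataType,
    SearchIndex,
    SearchableField,
    SimpleField,
)

def create_search_index():
    # Assumed schema: a key field plus one searchable text field
    index = SearchIndex(
        name=INDEX_NAME,
        fields=[
            SimpleField(name="id", type=SearchFieldDataType.String, key=True),
            SearchableField(name="content", type=SearchFieldDataType.String),
        ],
    )
    client = SearchIndexClient(endpoint=SEARCH_ENDPOINT, credential=AzureCliCredential())
    result = client.create_or_update_index(index)  # idempotent create-or-update
    print(f"✓ Search index '{result.name}' created")
```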
