diff --git a/documents/CustomizeData.md b/documents/CustomizeData.md index 12946833a..90d734955 100644 --- a/documents/CustomizeData.md +++ b/documents/CustomizeData.md @@ -30,7 +30,7 @@ If you would like to update the solution to leverage your own data please follow \ \ \ - + ``` ## How to Login to VM Using Azure Bastion diff --git a/documents/DeploymentGuide.md b/documents/DeploymentGuide.md index 609b3cca7..53c4dae53 100644 --- a/documents/DeploymentGuide.md +++ b/documents/DeploymentGuide.md @@ -309,7 +309,7 @@ Once you've opened the project in [Codespaces](#github-codespaces), [Dev Contain \ \ \ - + ``` 10. Once the script has run successfully, open the [Azure Portal](https://portal.azure.com/), go to the deployed resource group, find the App Service, and get the app URL from `Default domain`. diff --git a/infra/scripts/index_scripts/00_create_sample_data_files.py b/infra/scripts/index_scripts/00_create_sample_data_files.py index a792b827f..a8b862e42 100644 --- a/infra/scripts/index_scripts/00_create_sample_data_files.py +++ b/infra/scripts/index_scripts/00_create_sample_data_files.py @@ -1,10 +1,12 @@ -import pyodbc -import struct import csv import json import os -from datetime import datetime -from azure.identity import AzureCliCredential, get_bearer_token_provider +import struct + +import pyodbc +from azure.identity import AzureCliCredential +from azure.search.documents import SearchClient +from azure.search.documents.indexes import SearchIndexClient # SQL Server setup SQL_SERVER = '.database.windows.net' @@ -12,7 +14,7 @@ credential = AzureCliCredential(process_timeout=30) -try: +try: driver = "{ODBC Driver 18 for SQL Server}" token_bytes = credential.get_token("https://database.windows.net/.default").token.encode("utf-16-LE") token_struct = struct.pack(f" 0: - cursor.execute("UPDATE [dbo].[processed_data] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss'), EndTime = FORMAT(DATEADD(DAY, ?, EndTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference, days_difference)) - cursor.execute("UPDATE [dbo].[km_processed_data] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss'), EndTime = FORMAT(DATEADD(DAY, ?, EndTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference, days_difference)) - cursor.execute("UPDATE [dbo].[processed_data_key_phrases] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference,)) + # Update processed data for RAG + cursor.execute('DROP TABLE IF EXISTS km_processed_data') + cursor.execute("""CREATE TABLE km_processed_data ( + ConversationId varchar(255) NOT NULL PRIMARY KEY, + StartTime varchar(255), + EndTime varchar(255), + Content varchar(max), + summary varchar(max), + satisfied varchar(255), + sentiment varchar(255), + keyphrases nvarchar(max), + complaint varchar(255), + topic varchar(255) + );""") conn.commit() + cursor.execute('''select ConversationId, StartTime, EndTime, Content, summary, satisfied, sentiment, + key_phrases as keyphrases, complaint, mined_topic as topic from processed_data''') + rows = cursor.fetchall() + columns = ["ConversationId", "StartTime", "EndTime", "Content", "summary", "satisfied", "sentiment", + "keyphrases", "complaint", "topic"] + + df_km = pd.DataFrame([list(row) for row in rows], columns=columns) + generate_sql_insert_script(df_km, 'km_processed_data', columns, 'processed_km_data_with_mined_topics.sql') + + # Update processed_data_key_phrases table + cursor.execute('''select ConversationId, key_phrases, sentiment, mined_topic as topic, StartTime from 
processed_data''') + rows = [tuple(row) for row in cursor.fetchall()] + column_names = [i[0] for i in cursor.description] + df = pd.DataFrame(rows, columns=column_names) + df = df[df['ConversationId'].isin(conversationIds)] + + # Collect all key phrase records for batch insert + key_phrase_records = [] + for _, row in df.iterrows(): + key_phrases = row['key_phrases'].split(',') + for key_phrase in key_phrases: + key_phrase = key_phrase.strip() + key_phrase_records.append({ + 'ConversationId': row['ConversationId'], + 'key_phrase': key_phrase, + 'sentiment': row['sentiment'], + 'topic': row['topic'], + 'StartTime': row['StartTime'] + }) + + # Batch insert using optimized SQL script + if key_phrase_records: + df_key_phrases = pd.DataFrame(key_phrase_records) + columns = ['ConversationId', 'key_phrase', 'sentiment', 'topic', 'StartTime'] + generate_sql_insert_script(df_key_phrases, 'processed_data_key_phrases', columns, 'processed_new_key_phrases.sql') + + # Adjust dates to current date + today = datetime.today() + cursor.execute("SELECT MAX(CAST(StartTime AS DATETIME)) FROM [dbo].[processed_data]") + max_start_time = cursor.fetchone()[0] + days_difference = (today.date() - max_start_time.date()).days - 1 if max_start_time else 0 + if days_difference > 0: + cursor.execute("UPDATE [dbo].[processed_data] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss'), EndTime = FORMAT(DATEADD(DAY, ?, EndTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference, days_difference)) + cursor.execute("UPDATE [dbo].[km_processed_data] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss'), EndTime = FORMAT(DATEADD(DAY, ?, EndTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference, days_difference)) + cursor.execute("UPDATE [dbo].[processed_data_key_phrases] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference,)) + conn.commit() -cursor.close() -conn.close() -print("✓ Data processing completed") + cursor.close() + conn.close() + print("✓ Data processing completed") + +finally: + # Delete the agents after processing is complete + print("Deleting topic mining and mapping agents...") + try: + async def delete_agents(): + """Delete topic mining and mapping agents asynchronously.""" + async with ( + AsyncAzureCliCredential(process_timeout=30) as async_cred, + AIProjectClient(endpoint=AI_PROJECT_ENDPOINT, credential=async_cred) as project_client, + ): + await project_client.agents.delete_version(topic_mining_agent.name, topic_mining_agent.version) + await project_client.agents.delete_version(topic_mapping_agent.name, topic_mapping_agent.version) + + asyncio.run(delete_agents()) + print(f"✓ Deleted agents: {topic_mining_agent.name}, {topic_mapping_agent.name}") + except Exception as e: + print(f"Warning: Could not delete agents: {e}") diff --git a/infra/scripts/index_scripts/04_cu_process_custom_data.py b/infra/scripts/index_scripts/04_cu_process_custom_data.py index ad44f7b0b..a45d3533a 100644 --- a/infra/scripts/index_scripts/04_cu_process_custom_data.py +++ b/infra/scripts/index_scripts/04_cu_process_custom_data.py @@ -1,16 +1,24 @@ +""" +Custom data processing script for conversation knowledge mining. + +This module processes custom call transcripts using Azure Content Understanding, +generates embeddings, and stores processed data in SQL Server and Azure Search. 
+""" import argparse +import asyncio import json import os import re import struct -import time from datetime import datetime, timedelta from urllib.parse import urlparse import pandas as pd import pyodbc -from azure.ai.inference import ChatCompletionsClient, EmbeddingsClient -from azure.ai.inference.models import SystemMessage, UserMessage +from azure.ai.inference.aio import EmbeddingsClient +from azure.ai.projects.aio import AIProjectClient +from azure.ai.projects.models import PromptAgentDefinition +from azure.identity.aio import AzureCliCredential as AsyncAzureCliCredential from azure.identity import AzureCliCredential, get_bearer_token_provider from azure.search.documents import SearchClient from azure.search.documents.indexes import SearchIndexClient @@ -30,6 +38,9 @@ ) from azure.storage.filedatalake import DataLakeServiceClient +from agent_framework import ChatAgent +from agent_framework.azure import AzureAIClient + from content_understanding_client import AzureContentUnderstandingClient # Constants and configuration @@ -50,6 +61,7 @@ parser.add_argument('--sql_database', required=True, help='Azure SQL Database name') parser.add_argument('--cu_endpoint', required=True, help='Azure Content Understanding endpoint') parser.add_argument('--cu_api_version', required=True, help='Azure Content Understanding API version') +parser.add_argument('--solution_name', required=True, help='Solution name for agent naming') args = parser.parse_args() @@ -64,39 +76,32 @@ SQL_DATABASE = args.sql_database CU_ENDPOINT = args.cu_endpoint CU_API_VERSION = args.cu_api_version +SOLUTION_NAME = args.solution_name + +# Construct agent names from solution name (matching 01_create_agents.py pattern) +TOPIC_MINING_AGENT_NAME = f"KM-TopicMiningAgent-{SOLUTION_NAME}" +TOPIC_MAPPING_AGENT_NAME = f"KM-TopicMappingAgent-{SOLUTION_NAME}" + +# Azure AI Foundry (Inference) endpoint +inference_endpoint = f"https://{urlparse(AI_PROJECT_ENDPOINT).netloc}/models" # Azure DataLake setup account_url = f"https://{STORAGE_ACCOUNT_NAME}.dfs.core.windows.net" credential = AzureCliCredential(process_timeout=30) service_client = DataLakeServiceClient(account_url, credential=credential, api_version='2023-01-03') file_system_client = service_client.get_file_system_client(FILE_SYSTEM_CLIENT_NAME) -directory_name = DIRECTORY -paths = list(file_system_client.get_paths(path=directory_name)) +paths = list(file_system_client.get_paths(path=DIRECTORY)) # Azure Search setup search_credential = AzureCliCredential(process_timeout=30) search_client = SearchClient(SEARCH_ENDPOINT, INDEX_NAME, search_credential) index_client = SearchIndexClient(endpoint=SEARCH_ENDPOINT, credential=search_credential) -# Azure AI Foundry (Inference) clients (Managed Identity) -inference_endpoint = f"https://{urlparse(AI_PROJECT_ENDPOINT).netloc}/models" - -chat_client = ChatCompletionsClient( - endpoint=inference_endpoint, - credential=credential, - credential_scopes=["https://ai.azure.com/.default"], -) - -embeddings_client = EmbeddingsClient( - endpoint=inference_endpoint, - credential=credential, - credential_scopes=["https://ai.azure.com/.default"], -) - # Delete the search index search_index_client = SearchIndexClient(SEARCH_ENDPOINT, search_credential) search_index_client.delete_index(INDEX_NAME) + # Create the search index def create_search_index(): """ @@ -168,8 +173,10 @@ def create_search_index(): result = index_client.create_or_update_index(index) print(f"✓ Search index '{result.name}' created") + create_search_index() + # SQL Server setup DRIVER = 
"{ODBC Driver 18 for SQL Server}" token_bytes = credential.get_token("https://database.windows.net/.default").token.encode("utf-16-LE") @@ -188,15 +195,17 @@ def create_search_index(): token_provider=cu_token_provider ) + # Utility functions -def get_embeddings(text: str): +async def get_embeddings_async(text: str, embeddings_client): + """Get embeddings using async EmbeddingsClient.""" try: - resp = embeddings_client.embed(model=EMBEDDING_MODEL, input=[text]) + resp = await embeddings_client.embed(model=EMBEDDING_MODEL, input=[text]) return resp.data[0].embedding except Exception as e: print(f"Error getting embeddings: {e}") raise -# -------------------------------------------------------------------------- + def generate_sql_insert_script(df, table_name, columns, sql_file_name): """ @@ -269,11 +278,13 @@ def generate_sql_insert_script(df, table_name, columns, sql_file_name): record_count = len(df) return record_count + def clean_spaces_with_regex(text): cleaned_text = re.sub(r'\s+', ' ', text) cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text) return cleaned_text + def chunk_data(text, tokens_per_chunk=1024): text = clean_spaces_with_regex(text) sentences = text.split('. ') @@ -290,20 +301,19 @@ def chunk_data(text, tokens_per_chunk=1024): chunks.append(current_chunk) return chunks -def prepare_search_doc(content, document_id, path_name): + +async def prepare_search_doc(content, document_id, path_name, embeddings_client): chunks = chunk_data(content) docs = [] for idx, chunk in enumerate(chunks, 1): chunk_id = f"{document_id}_{str(idx).zfill(2)}" try: - v_contentVector = get_embeddings(str(chunk)) - except Exception as e: - print(f"Error getting embeddings on first try: {e}") - time.sleep(30) - try: - v_contentVector = get_embeddings(str(chunk)) - except Exception as e: - print(f"Error getting embeddings: {e}") + v_contentVector = await get_embeddings_async(str(chunk), embeddings_client) + except Exception: + await asyncio.sleep(30) + try: + v_contentVector = await get_embeddings_async(str(chunk), embeddings_client) + except Exception: v_contentVector = [] docs.append({ "id": chunk_id, @@ -314,6 +324,7 @@ def prepare_search_doc(content, document_id, path_name): }) return docs + # Database table creation def create_tables(): cursor.execute('DROP TABLE IF EXISTS processed_data') @@ -327,123 +338,167 @@ def create_tables(): sentiment varchar(255), topic varchar(255), key_phrases nvarchar(max), - complaint varchar(255), + complaint varchar(255), mined_topic varchar(255) );""") cursor.execute('DROP TABLE IF EXISTS processed_data_key_phrases') cursor.execute("""CREATE TABLE processed_data_key_phrases ( ConversationId varchar(255), - key_phrase varchar(500), + key_phrase varchar(500), sentiment varchar(255), - topic varchar(255), + topic varchar(255), StartTime varchar(255) );""") conn.commit() + create_tables() -ANALYZER_ID = "ckm-json" -# Process files and insert into DB and Search - transcripts -conversationIds, docs, counter = [], [], 0 -for path in paths: - file_client = file_system_client.get_file_client(path.name) - data_file = file_client.download_file() - data = data_file.readall() - try: - response = cu_client.begin_analyze(ANALYZER_ID, file_location="", file_data=data) - result = cu_client.poll_result(response) - file_name = path.name.split('/')[-1].replace("%3A", "_") - start_time = file_name.replace(".json", "")[-19:] - timestamp_format = "%Y-%m-%d %H_%M_%S" - start_timestamp = datetime.strptime(start_time, timestamp_format) - conversation_id = file_name.split('convo_', 
1)[1].split('_')[0] - conversationIds.append(conversation_id) - duration = int(result['result']['contents'][0]['fields']['Duration']['valueString']) - end_timestamp = str(start_timestamp + timedelta(seconds=duration)).split(".")[0] - start_timestamp = str(start_timestamp).split(".")[0] - fields = result['result']['contents'][0]['fields'] - summary = fields['summary']['valueString'] - satisfied = fields['satisfied']['valueString'] - sentiment = fields['sentiment']['valueString'] - topic = fields['topic']['valueString'] - key_phrases = fields['keyPhrases']['valueString'] - complaint = fields['complaint']['valueString'] - content = fields['content']['valueString'] - cursor.execute( - "INSERT INTO processed_data (ConversationId, EndTime, StartTime, Content, summary, satisfied, sentiment, topic, key_phrases, complaint) VALUES (?,?,?,?,?,?,?,?,?,?)", - (conversation_id, end_timestamp, start_timestamp, content, summary, satisfied, sentiment, topic, key_phrases, complaint) - ) - conn.commit() - docs.extend(prepare_search_doc(content, conversation_id, path.name)) - counter += 1 - except: - pass - if docs != [] and counter % 10 == 0: - result = search_client.upload_documents(documents=docs) - docs = [] -if docs: - search_client.upload_documents(documents=docs) - -print(f"✓ Processed {counter} transcript files") - -# Process files for audio data -ANALYZER_ID = "ckm-audio" - -directory_name = AUDIO_DIRECTORY -paths = list(file_system_client.get_paths(path=directory_name)) -docs = [] -counter = 0 -# process and upload audio files to search index - audio data -for path in paths: - file_client = file_system_client.get_file_client(path.name) - data_file = file_client.download_file() - data = data_file.readall() - try: - # # Analyzer file - response = cu_client.begin_analyze(ANALYZER_ID, file_location="", file_data=data) - result = cu_client.poll_result(response) - - file_name = path.name.split('/')[-1] - start_time = file_name.replace(".wav", "")[-19:] - - timestamp_format = "%Y-%m-%d %H_%M_%S" # Adjust format if necessary - start_timestamp = datetime.strptime(start_time, timestamp_format) - - conversation_id = file_name.split('convo_', 1)[1].split('_')[0] - conversationIds.append(conversation_id) - - duration = int(result['result']['contents'][0]['fields']['Duration']['valueString']) - end_timestamp = str(start_timestamp + timedelta(seconds=duration)) - end_timestamp = end_timestamp.split(".")[0] - start_timestamp = str(start_timestamp).split(".")[0] - - summary = result['result']['contents'][0]['fields']['summary']['valueString'] - satisfied = result['result']['contents'][0]['fields']['satisfied']['valueString'] - sentiment = result['result']['contents'][0]['fields']['sentiment']['valueString'] - topic = result['result']['contents'][0]['fields']['topic']['valueString'] - key_phrases = result['result']['contents'][0]['fields']['keyPhrases']['valueString'] - complaint = result['result']['contents'][0]['fields']['complaint']['valueString'] - content = result['result']['contents'][0]['fields']['content']['valueString'] - # print(topic) - cursor.execute(f"INSERT INTO processed_data (ConversationId, EndTime, StartTime, Content, summary, satisfied, sentiment, topic, key_phrases, complaint) VALUES (?,?,?,?,?,?,?,?,?,?)", (conversation_id, end_timestamp, start_timestamp, content, summary, satisfied, sentiment, topic, key_phrases, complaint)) - conn.commit() - - document_id = conversation_id - docs.extend(prepare_search_doc(content, document_id, path.name)) - counter += 1 - except Exception as e: - pass - - if 
docs != [] and counter % 10 == 0: - result = search_client.upload_documents(documents=docs) +# Process files and insert into DB and Search +async def process_files(): + """Process all files with async embeddings client.""" + conversationIds, docs, counter = [], [], 0 + processed_records = [] # Collect all records for batch insert + + # Create embeddings client for entire processing session + async with ( + AsyncAzureCliCredential(process_timeout=30) as async_cred, + EmbeddingsClient( + endpoint=inference_endpoint, + credential=async_cred, + credential_scopes=["https://ai.azure.com/.default"], + ) as embeddings_client + ): + ANALYZER_ID = "ckm-json" + # Process files and insert into DB and Search - transcripts + for path in paths: + file_client = file_system_client.get_file_client(path.name) + data_file = file_client.download_file() + data = data_file.readall() + try: + response = cu_client.begin_analyze(ANALYZER_ID, file_location="", file_data=data) + result = cu_client.poll_result(response) + file_name = path.name.split('/')[-1].replace("%3A", "_") + start_time = file_name.replace(".json", "")[-19:] + timestamp_format = "%Y-%m-%d %H_%M_%S" + start_timestamp = datetime.strptime(start_time, timestamp_format) + conversation_id = file_name.split('convo_', 1)[1].split('_')[0] + conversationIds.append(conversation_id) + duration = int(result['result']['contents'][0]['fields']['Duration']['valueString']) + end_timestamp = str(start_timestamp + timedelta(seconds=duration)).split(".")[0] + start_timestamp = str(start_timestamp).split(".")[0] + fields = result['result']['contents'][0]['fields'] + summary = fields['summary']['valueString'] + satisfied = fields['satisfied']['valueString'] + sentiment = fields['sentiment']['valueString'] + topic = fields['topic']['valueString'] + key_phrases = fields['keyPhrases']['valueString'] + complaint = fields['complaint']['valueString'] + content = fields['content']['valueString'] + + # Collect record for batch insert + processed_records.append({ + 'ConversationId': conversation_id, + 'EndTime': end_timestamp, + 'StartTime': start_timestamp, + 'Content': content, + 'summary': summary, + 'satisfied': satisfied, + 'sentiment': sentiment, + 'topic': topic, + 'key_phrases': key_phrases, + 'complaint': complaint + }) + + docs.extend(await prepare_search_doc(content, conversation_id, path.name, embeddings_client)) + counter += 1 + except Exception: + pass + if docs != [] and counter % 10 == 0: + result = search_client.upload_documents(documents=docs) + docs = [] + if docs: + search_client.upload_documents(documents=docs) + + print(f"✓ Processed {counter} transcript files") + + # Process files for audio data + ANALYZER_ID = "ckm-audio" + audio_paths = list(file_system_client.get_paths(path=AUDIO_DIRECTORY)) docs = [] - -# upload the last batch -if docs != []: - search_client.upload_documents(documents=docs) - -print(f"✓ Processed {counter} audio files") + counter = 0 + # process and upload audio files to search index - audio data + for path in audio_paths: + file_client = file_system_client.get_file_client(path.name) + data_file = file_client.download_file() + data = data_file.readall() + try: + # Analyzer file + response = cu_client.begin_analyze(ANALYZER_ID, file_location="", file_data=data) + result = cu_client.poll_result(response) + + file_name = path.name.split('/')[-1] + start_time = file_name.replace(".wav", "")[-19:] + + timestamp_format = "%Y-%m-%d %H_%M_%S" + start_timestamp = datetime.strptime(start_time, timestamp_format) + + conversation_id = 
file_name.split('convo_', 1)[1].split('_')[0] + conversationIds.append(conversation_id) + + duration = int(result['result']['contents'][0]['fields']['Duration']['valueString']) + end_timestamp = str(start_timestamp + timedelta(seconds=duration)) + end_timestamp = end_timestamp.split(".")[0] + start_timestamp = str(start_timestamp).split(".")[0] + + summary = result['result']['contents'][0]['fields']['summary']['valueString'] + satisfied = result['result']['contents'][0]['fields']['satisfied']['valueString'] + sentiment = result['result']['contents'][0]['fields']['sentiment']['valueString'] + topic = result['result']['contents'][0]['fields']['topic']['valueString'] + key_phrases = result['result']['contents'][0]['fields']['keyPhrases']['valueString'] + complaint = result['result']['contents'][0]['fields']['complaint']['valueString'] + content = result['result']['contents'][0]['fields']['content']['valueString'] + + # Collect record for batch insert + processed_records.append({ + 'ConversationId': conversation_id, + 'EndTime': end_timestamp, + 'StartTime': start_timestamp, + 'Content': content, + 'summary': summary, + 'satisfied': satisfied, + 'sentiment': sentiment, + 'topic': topic, + 'key_phrases': key_phrases, + 'complaint': complaint + }) + + document_id = conversation_id + docs.extend(await prepare_search_doc(content, document_id, path.name, embeddings_client)) + counter += 1 + except Exception: + pass + if docs != [] and counter % 10 == 0: + result = search_client.upload_documents(documents=docs) + docs = [] + + # upload the last batch + if docs != []: + search_client.upload_documents(documents=docs) + + print(f"✓ Processed {counter} audio files") + + # Batch insert all processed records using optimized SQL script + if processed_records: + df_processed = pd.DataFrame(processed_records) + columns = ['ConversationId', 'EndTime', 'StartTime', 'Content', 'summary', 'satisfied', 'sentiment', 'topic', 'key_phrases', 'complaint'] + generate_sql_insert_script(df_processed, 'processed_data', columns, 'custom_processed_data_batch_insert.sql') + + return conversationIds + +# Run the async file processing +conversationIds = asyncio.run(process_files()) # Topic mining and mapping cursor.execute('SELECT distinct topic FROM processed_data') @@ -458,121 +513,223 @@ def create_tables(): conn.commit() topics_str = ', '.join(df['topic'].tolist()) -def call_gpt4(topics_str1, client): - topic_prompt = f""" - You are a data analysis assistant specialized in natural language processing and topic modeling. - Your task is to analyze the given text corpus and identify distinct topics present within the data. - {topics_str1} - 1. Identify the key topics in the text using topic modeling techniques. - 2. Choose the right number of topics based on data. Try to keep it up to 8 topics. - 3. Assign a clear and concise label to each topic based on its content. - 4. Provide a brief description of each topic along with its label. - 5. Add parental controls, billing issues like topics to the list of topics if the data includes calls related to them. - If the input data is insufficient for reliable topic modeling, indicate that more data is needed rather than making assumptions. - Ensure that the topics and labels are accurate, relevant, and easy to understand. - Return the topics and their labels in JSON format.Always add 'topics' node and 'label', 'description' attributes in json. - Do not return anything else. 
- """ - response = client.complete( - model=DEPLOYMENT_MODEL, - messages=[ - SystemMessage(content="You are a helpful assistant."), - UserMessage(content=topic_prompt), - ], - temperature=0, - ) - res = response.choices[0].message.content - return json.loads(res.replace("```json", '').replace("```", '')) +# Create agents for topic mining and mapping +print("Creating topic mining and mapping agents...") + +# Topic Mining Agent instruction +TOPIC_MINING_AGENT_INSTRUCTION = '''You are a data analysis assistant specialized in natural language processing and topic modeling. +Your task is to analyze conversation topics and identify distinct categories. + +Rules: +1. Identify key topics using topic modeling techniques +2. Choose the right number of topics based on data (try to keep it up to 8 topics) +3. Assign clear and concise labels to each topic +4. Provide brief descriptions for each topic +5. Include common topics like parental controls, billing issues if relevant +6. If data is insufficient, indicate more data is needed +7. Return topics in JSON format with 'topics' array containing objects with 'label' and 'description' fields +8. Return ONLY the JSON, no other text or markdown formatting +''' + +# Topic Mapping Agent instruction +TOPIC_MAPPING_AGENT_INSTRUCTION = '''You are a data analysis assistant that maps conversation topics to the closest matching category. +Return ONLY the matching topic EXACTLY as written in the list (case-sensitive) +Do not add any explanatory text, punctuation, quotes, or formatting +Do not create, rephrase, abbreviate, or pluralize topics +If no topic is a perfect match, choose the closest one from the list ONLY +''' + + +# Create async project client and agents +async def create_agents(): + """Create topic mining and mapping agents asynchronously.""" + async with ( + AsyncAzureCliCredential(process_timeout=30) as async_cred, + AIProjectClient(endpoint=AI_PROJECT_ENDPOINT, credential=async_cred) as project_client, + ): + topic_mining_agent = await project_client.agents.create_version( + agent_name=TOPIC_MINING_AGENT_NAME, + definition=PromptAgentDefinition( + model=DEPLOYMENT_MODEL, + instructions=TOPIC_MINING_AGENT_INSTRUCTION, + ), + ) + topic_mapping_agent = await project_client.agents.create_version( + agent_name=TOPIC_MAPPING_AGENT_NAME, + definition=PromptAgentDefinition( + model=DEPLOYMENT_MODEL, + instructions=TOPIC_MAPPING_AGENT_INSTRUCTION, + ), + ) -max_tokens = 3096 -res = call_gpt4(", ".join([]), chat_client) -for object1 in res['topics']: - cursor.execute("INSERT INTO km_mined_topics (label, description) VALUES (?,?)", (object1['label'], object1['description'])) -conn.commit() + return topic_mining_agent, topic_mapping_agent -cursor.execute('SELECT label FROM km_mined_topics') -rows = [tuple(row) for row in cursor.fetchall()] -column_names = [i[0] for i in cursor.description] -df_topics = pd.DataFrame(rows, columns=column_names) -mined_topics_list = df_topics['label'].tolist() -mined_topics = ", ".join(mined_topics_list) - - -def get_mined_topic_mapping(input_text, list_of_topics): - prompt = f'''You are a data analysis assistant to help find the closest topic for a given text {input_text} - from a list of topics - {list_of_topics}. - ALWAYS only return a topic from list - {list_of_topics}. 
Do not add any other text.''' - response = chat_client.complete( - model=DEPLOYMENT_MODEL, - messages=[ - SystemMessage(content="You are a helpful assistant."), - UserMessage(content=prompt), - ], - temperature=0, - ) - return response.choices[0].message.content +topic_mining_agent, topic_mapping_agent = asyncio.run(create_agents()) +print(f"✓ Created agents: {topic_mining_agent.name}, {topic_mapping_agent.name}") -cursor.execute('SELECT * FROM processed_data') -rows = [tuple(row) for row in cursor.fetchall()] -column_names = [i[0] for i in cursor.description] -df_processed_data = pd.DataFrame(rows, columns=column_names) -df_processed_data = df_processed_data[df_processed_data['ConversationId'].isin(conversationIds)] -for _, row in df_processed_data.iterrows(): - mined_topic_str = get_mined_topic_mapping(row['topic'], str(mined_topics_list)) - cursor.execute("UPDATE processed_data SET mined_topic = ? WHERE ConversationId = ?", (mined_topic_str, row['ConversationId'])) -conn.commit() +try: + async def call_topic_mining_agent(topics_str1): + """Use Topic Mining Agent with Agent Framework to analyze and categorize topics.""" + async with ( + AsyncAzureCliCredential(process_timeout=30) as async_cred, + AIProjectClient(endpoint=AI_PROJECT_ENDPOINT, credential=async_cred) as project_client, + ): + # Create chat client with topic mining agent + chat_client = AzureAIClient( + project_client=project_client, + agent_name=TOPIC_MINING_AGENT_NAME, + use_latest_version=True, + ) -# Update processed data for RAG -cursor.execute('DROP TABLE IF EXISTS km_processed_data') -cursor.execute("""CREATE TABLE km_processed_data ( - ConversationId varchar(255) NOT NULL PRIMARY KEY, - StartTime varchar(255), - EndTime varchar(255), - Content varchar(max), - summary varchar(max), - satisfied varchar(255), - sentiment varchar(255), - keyphrases nvarchar(max), - complaint varchar(255), - topic varchar(255) -);""") -conn.commit() -cursor.execute('''select ConversationId, StartTime, EndTime, Content, summary, satisfied, sentiment, -key_phrases as keyphrases, complaint, mined_topic as topic from processed_data''') -rows = cursor.fetchall() -columns = ["ConversationId", "StartTime", "EndTime", "Content", "summary", "satisfied", "sentiment", - "keyphrases", "complaint", "topic"] - -df_km = pd.DataFrame([list(row) for row in rows], columns=columns) -record_count = generate_sql_insert_script(df_km, 'km_processed_data', columns, 'km_processed_data_insert.sql') -print(f"✓ Loaded {record_count} sample records") - -# Update processed_data_key_phrases table -cursor.execute('''select ConversationId, key_phrases, sentiment, mined_topic as topic, StartTime from processed_data''') -rows = [tuple(row) for row in cursor.fetchall()] -column_names = [i[0] for i in cursor.description] -df = pd.DataFrame(rows, columns=column_names) -df = df[df['ConversationId'].isin(conversationIds)] -for _, row in df.iterrows(): - key_phrases = row['key_phrases'].split(',') - for key_phrase in key_phrases: - key_phrase = key_phrase.strip() - cursor.execute("INSERT INTO processed_data_key_phrases (ConversationId, key_phrase, sentiment, topic, StartTime) VALUES (?,?,?,?,?)", - (row['ConversationId'], key_phrase, row['sentiment'], row['topic'], row['StartTime'])) -conn.commit() + async with ChatAgent( + chat_client=chat_client, + store=False, # No need to store conversation history + ) as chat_agent: + # Query with the topics string + query = f"Analyze these conversation topics and identify distinct categories: {topics_str1}" -# Adjust dates to current date 
-today = datetime.today() -cursor.execute("SELECT MAX(CAST(StartTime AS DATETIME)) FROM [dbo].[processed_data]") -max_start_time = cursor.fetchone()[0] -days_difference = (today - max_start_time).days - 1 if max_start_time else 0 -cursor.execute("UPDATE [dbo].[processed_data] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss'), EndTime = FORMAT(DATEADD(DAY, ?, EndTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference, days_difference)) -cursor.execute("UPDATE [dbo].[km_processed_data] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss'), EndTime = FORMAT(DATEADD(DAY, ?, EndTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference, days_difference)) -cursor.execute("UPDATE [dbo].[processed_data_key_phrases] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference,)) -conn.commit() + result = await chat_agent.run(messages=query) + res = result.text + # Clean up markdown formatting if present + res = res.replace("```json", '').replace("```", '').strip() + return json.loads(res) -cursor.close() -conn.close() -print("✓ Data processing completed") + MAX_TOKENS = 3096 + + res = asyncio.run(call_topic_mining_agent(topics_str)) + for object1 in res['topics']: + cursor.execute("INSERT INTO km_mined_topics (label, description) VALUES (?,?)", (object1['label'], object1['description'])) + conn.commit() + + cursor.execute('SELECT label FROM km_mined_topics') + rows = [tuple(row) for row in cursor.fetchall()] + column_names = [i[0] for i in cursor.description] + df_topics = pd.DataFrame(rows, columns=column_names) + mined_topics_list = df_topics['label'].tolist() + mined_topics = ", ".join(mined_topics_list) + print(f"✓ Mined {len(mined_topics_list)} topics") + + async def call_topic_mapping_agent(input_text, list_of_topics): + """Use Topic Mapping Agent with Agent Framework to map topic to category.""" + async with ( + AsyncAzureCliCredential(process_timeout=30) as async_cred, + AIProjectClient(endpoint=AI_PROJECT_ENDPOINT, credential=async_cred) as project_client, + ): + # Create chat client with topic mapping agent + chat_client = AzureAIClient( + project_client=project_client, + agent_name=TOPIC_MAPPING_AGENT_NAME, + use_latest_version=True, + ) + + async with ChatAgent( + chat_client=chat_client, + store=False, + ) as chat_agent: + query = f"""Find the closest topic for this text: '{input_text}' from this list of topics: {list_of_topics}""" + + result = await chat_agent.run(messages=query) + return result.text.strip() + + cursor.execute('SELECT * FROM processed_data') + rows = [tuple(row) for row in cursor.fetchall()] + column_names = [i[0] for i in cursor.description] + df_processed_data = pd.DataFrame(rows, columns=column_names) + df_processed_data = df_processed_data[df_processed_data['ConversationId'].isin(conversationIds)] + + # Map topics using agent asynchronously + async def map_all_topics(): + """Map all topics to categories using agent.""" + for _, row in df_processed_data.iterrows(): + mined_topic_str = await call_topic_mapping_agent(row['topic'], str(mined_topics_list)) + cursor.execute("UPDATE processed_data SET mined_topic = ? 
WHERE ConversationId = ?", (mined_topic_str, row['ConversationId'])) + conn.commit() + + asyncio.run(map_all_topics()) + + # Update processed data for RAG + cursor.execute('DROP TABLE IF EXISTS km_processed_data') + cursor.execute("""CREATE TABLE km_processed_data ( + ConversationId varchar(255) NOT NULL PRIMARY KEY, + StartTime varchar(255), + EndTime varchar(255), + Content varchar(max), + summary varchar(max), + satisfied varchar(255), + sentiment varchar(255), + keyphrases nvarchar(max), + complaint varchar(255), + topic varchar(255) + );""") + conn.commit() + cursor.execute('''select ConversationId, StartTime, EndTime, Content, summary, satisfied, sentiment, + key_phrases as keyphrases, complaint, mined_topic as topic from processed_data''') + rows = cursor.fetchall() + columns = ["ConversationId", "StartTime", "EndTime", "Content", "summary", "satisfied", "sentiment", + "keyphrases", "complaint", "topic"] + + df_km = pd.DataFrame([list(row) for row in rows], columns=columns) + record_count = generate_sql_insert_script(df_km, 'km_processed_data', columns, 'custom_km_data_with_mined_topics.sql') + print(f"✓ Loaded {record_count} sample records") + + # Update processed_data_key_phrases table + cursor.execute('''select ConversationId, key_phrases, sentiment, mined_topic as topic, StartTime from processed_data''') + rows = [tuple(row) for row in cursor.fetchall()] + column_names = [i[0] for i in cursor.description] + df = pd.DataFrame(rows, columns=column_names) + df = df[df['ConversationId'].isin(conversationIds)] + + # Collect all key phrase records for batch insert + key_phrase_records = [] + for _, row in df.iterrows(): + key_phrases = row['key_phrases'].split(',') + for key_phrase in key_phrases: + key_phrase = key_phrase.strip() + key_phrase_records.append({ + 'ConversationId': row['ConversationId'], + 'key_phrase': key_phrase, + 'sentiment': row['sentiment'], + 'topic': row['topic'], + 'StartTime': row['StartTime'] + }) + + # Batch insert using optimized SQL script + if key_phrase_records: + df_key_phrases = pd.DataFrame(key_phrase_records) + columns = ['ConversationId', 'key_phrase', 'sentiment', 'topic', 'StartTime'] + generate_sql_insert_script(df_key_phrases, 'processed_data_key_phrases', columns, 'custom_new_key_phrases.sql') + + # Adjust dates to current date + today = datetime.today() + cursor.execute("SELECT MAX(CAST(StartTime AS DATETIME)) FROM [dbo].[processed_data]") + max_start_time = cursor.fetchone()[0] + days_difference = (today.date() - max_start_time.date()).days - 1 if max_start_time else 0 + if days_difference > 0: + cursor.execute("UPDATE [dbo].[processed_data] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss'), EndTime = FORMAT(DATEADD(DAY, ?, EndTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference, days_difference)) + cursor.execute("UPDATE [dbo].[km_processed_data] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss'), EndTime = FORMAT(DATEADD(DAY, ?, EndTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference, days_difference)) + cursor.execute("UPDATE [dbo].[processed_data_key_phrases] SET StartTime = FORMAT(DATEADD(DAY, ?, StartTime), 'yyyy-MM-dd HH:mm:ss')", (days_difference,)) + conn.commit() + + cursor.close() + conn.close() + print("✓ Data processing completed") + +finally: + # Delete the agents after processing is complete + print("Deleting topic mining and mapping agents...") + try: + async def delete_agents(): + """Delete topic mining and mapping agents asynchronously.""" + async with ( + 
+                AsyncAzureCliCredential(process_timeout=30) as async_cred,
+                AIProjectClient(endpoint=AI_PROJECT_ENDPOINT, credential=async_cred) as project_client,
+            ):
+                await project_client.agents.delete_version(topic_mining_agent.name, topic_mining_agent.version)
+                await project_client.agents.delete_version(topic_mapping_agent.name, topic_mapping_agent.version)
+
+        asyncio.run(delete_agents())
+        print(f"✓ Deleted agents: {topic_mining_agent.name}, {topic_mapping_agent.name}")
+    except Exception as e:
+        print(f"Warning: Could not delete agents: {e}")
diff --git a/infra/scripts/index_scripts/requirements.txt b/infra/scripts/index_scripts/requirements.txt
index 3fa8d865d..e48546cb0 100644
--- a/infra/scripts/index_scripts/requirements.txt
+++ b/infra/scripts/index_scripts/requirements.txt
@@ -1,15 +1,12 @@
 azure-storage-file-datalake==12.20.0
-# langchain
-openai==1.84.0
-azure-ai-projects==1.0.0b5
 azure-ai-inference==1.0.0b9
+agent-framework-azure-ai==1.0.0b251120
+agent-framework-core==1.0.0b251120
 pypdf==5.6.0
-# pyodbc
 tiktoken==0.9.0
 azure-identity==1.23.0
 azure-ai-textanalytics==5.3.0
 azure-search-documents==11.5.2
 azure-keyvault-secrets==4.9.0
 pandas==2.3.0
-pyodbc==5.2.0
-# graphrag==0.3.6
\ No newline at end of file
+pyodbc==5.2.0
\ No newline at end of file
diff --git a/infra/scripts/process_custom_data.sh b/infra/scripts/process_custom_data.sh
index 3cbaf4cba..1b788ece4 100644
--- a/infra/scripts/process_custom_data.sh
+++ b/infra/scripts/process_custom_data.sh
@@ -33,8 +33,8 @@ deploymentModel="${15}"
 
 # Content Understanding & AI Agent
 cuEndpoint="${16}"
-aiAgentEndpoint="${17}"
-cuApiVersion="${18}"
+cuApiVersion="${17}"
+aiAgentEndpoint="${18}"
 
 # Global variables to track original network access states
 original_storage_public_access=""
@@ -336,12 +336,13 @@ get_values_from_azd_env() {
     aiAgentEndpoint=$(azd env get-value AZURE_AI_AGENT_ENDPOINT 2>&1 | grep -E '^https?://[a-zA-Z0-9._/:/-]+$')
     cuApiVersion=$(azd env get-value AZURE_CONTENT_UNDERSTANDING_API_VERSION 2>&1 | grep -E '^[0-9]{4}-[0-9]{2}-[0-9]{2}(-preview)?$')
     deploymentModel=$(azd env get-value AZURE_OPENAI_DEPLOYMENT_MODEL 2>&1 | grep -E '^[a-zA-Z0-9._-]+$')
+    solutionName=$(azd env get-value SOLUTION_NAME 2>&1 | grep -E '^[a-zA-Z0-9._-]+$')
 
     # Strip FQDN suffix from SQL server name if present (Azure CLI needs just the server name)
     sqlServerName="${sqlServerName%.database.windows.net}"
 
     # Validate that we extracted all required values
-    if [ -z "$resourceGroupName" ] || [ -z "$storageAccountName" ] || [ -z "$fileSystem" ] || [ -z "$sqlServerName" ] || [ -z "$SqlDatabaseName" ] || [ -z "$backendUserMidClientId" ] || [ -z "$backendUserMidDisplayName" ] || [ -z "$aiSearchName" ] || [ -z "$aif_resource_id" ]; then
+    if [ -z "$resourceGroupName" ] || [ -z "$storageAccountName" ] || [ -z "$fileSystem" ] || [ -z "$sqlServerName" ] || [ -z "$SqlDatabaseName" ] || [ -z "$backendUserMidClientId" ] || [ -z "$backendUserMidDisplayName" ] || [ -z "$aiSearchName" ] || [ -z "$aif_resource_id" ] || [ -z "$solutionName" ]; then
         echo "Error: One or more required values could not be retrieved from azd environment."
         return 1
     fi
@@ -392,7 +393,7 @@ get_values_from_az_deployment() {
     aiAgentEndpoint=$(extract_value "azureAiAgentEndpoint" "azurE_AI_AGENT_ENDPOINT")
     cuApiVersion=$(extract_value "azureContentUnderstandingApiVersion" "azurE_CONTENT_UNDERSTANDING_API_VERSION")
     deploymentModel=$(extract_value "azureOpenAIDeploymentModel" "azurE_OPENAI_DEPLOYMENT_MODEL")
-    usecase=$(extract_value "useCase" "usE_CASE")
+    solutionName=$(extract_value "solutionName" "solutioN_NAME")
 
     # Strip FQDN suffix from SQL server name if present (Azure CLI needs just the server name)
     sqlServerName="${sqlServerName%.database.windows.net}"
@@ -415,7 +416,7 @@ get_values_from_az_deployment() {
         ["aiAgentEndpoint"]="AZURE_AI_AGENT_ENDPOINT"
         ["cuApiVersion"]="AZURE_CONTENT_UNDERSTANDING_API_VERSION"
         ["deploymentModel"]="AZURE_OPENAI_DEPLOYMENT_MODEL"
-        ["usecase"]="USE_CASE"
+        ["solutionName"]="SOLUTION_NAME"
     )
 
     # Validate and collect missing values
@@ -549,6 +550,7 @@ echo "CU Endpoint: $cuEndpoint"
 echo "CU API Version: $cuApiVersion"
 echo "AI Agent Endpoint: $aiAgentEndpoint"
 echo "Deployment Model: $deploymentModel"
+echo "Solution Name: $solutionName"
 echo "==============================================="
 echo ""
 
@@ -562,6 +564,7 @@ fi
 pythonScriptPath="$SCRIPT_DIR/index_scripts/"
 
 # Install the requirements
+echo "Installing requirements"
 pip install --quiet -r ${pythonScriptPath}requirements.txt
 if [ $? -ne 0 ]; then
     echo "Error: Failed to install Python requirements."
@@ -595,9 +598,12 @@ python "${pythonScriptPath}04_cu_process_custom_data.py" \
     --sql_server "$sql_server_fqdn" \
     --sql_database "$SqlDatabaseName" \
     --cu_endpoint "$cuEndpoint" \
-    --cu_api_version "$cuApiVersion"
+    --cu_api_version "$cuApiVersion" \
+    --solution_name "$solutionName"
 
 if [ $? -ne 0 ]; then
     echo "Error: 04_cu_process_custom_data.py failed."
     exit 1
 fi
+
+echo "All scripts executed successfully."
diff --git a/infra/scripts/process_sample_data.sh b/infra/scripts/process_sample_data.sh
index c39a5a6ee..9a68fbc25 100644
--- a/infra/scripts/process_sample_data.sh
+++ b/infra/scripts/process_sample_data.sh
@@ -348,12 +348,13 @@ get_values_from_azd_env() {
     cuApiVersion=$(azd env get-value AZURE_CONTENT_UNDERSTANDING_API_VERSION 2>&1 | grep -E '^[0-9]{4}-[0-9]{2}-[0-9]{2}(-preview)?$')
     deploymentModel=$(azd env get-value AZURE_OPENAI_DEPLOYMENT_MODEL 2>&1 | grep -E '^[a-zA-Z0-9._-]+$')
     usecase=$(azd env get-value USE_CASE 2>&1 | grep -E '^[a-zA-Z0-9._-]+$')
+    solutionName=$(azd env get-value SOLUTION_NAME 2>&1 | grep -E '^[a-zA-Z0-9._-]+$')
 
     # Strip FQDN suffix from SQL server name if present (Azure CLI needs just the server name)
     sqlServerName="${sqlServerName%.database.windows.net}"
 
     # Validate that we extracted all required values
-    if [ -z "$resourceGroupName" ] || [ -z "$storageAccountName" ] || [ -z "$fileSystem" ] || [ -z "$sqlServerName" ] || [ -z "$SqlDatabaseName" ] || [ -z "$backendUserMidClientId" ] || [ -z "$backendUserMidDisplayName" ] || [ -z "$aiSearchName" ] || [ -z "$aif_resource_id" ] || [ -z "$usecase" ]; then
+    if [ -z "$resourceGroupName" ] || [ -z "$storageAccountName" ] || [ -z "$fileSystem" ] || [ -z "$sqlServerName" ] || [ -z "$SqlDatabaseName" ] || [ -z "$backendUserMidClientId" ] || [ -z "$backendUserMidDisplayName" ] || [ -z "$aiSearchName" ] || [ -z "$aif_resource_id" ] || [ -z "$usecase" ] || [ -z "$solutionName" ]; then
         echo "Error: One or more required values could not be retrieved from azd environment."
         return 1
     fi
@@ -405,6 +406,7 @@ get_values_from_az_deployment() {
     cuApiVersion=$(extract_value "azureContentUnderstandingApiVersion" "azurE_CONTENT_UNDERSTANDING_API_VERSION")
     deploymentModel=$(extract_value "azureOpenAIDeploymentModel" "azurE_OPENAI_DEPLOYMENT_MODEL")
     usecase=$(extract_value "useCase" "usE_CASE")
+    solutionName=$(extract_value "solutionName" "solutioN_NAME")
 
     # Strip FQDN suffix from SQL server name if present (Azure CLI needs just the server name)
     sqlServerName="${sqlServerName%.database.windows.net}"
@@ -428,6 +430,7 @@ get_values_from_az_deployment() {
         ["cuApiVersion"]="AZURE_CONTENT_UNDERSTANDING_API_VERSION"
         ["deploymentModel"]="AZURE_OPENAI_DEPLOYMENT_MODEL"
         ["usecase"]="USE_CASE"
+        ["solutionName"]="SOLUTION_NAME"
     )
 
     # Validate and collect missing values
@@ -574,6 +577,7 @@ echo "CU Endpoint: $cuEndpoint"
 echo "CU API Version: $cuApiVersion"
 echo "AI Agent Endpoint: $aiAgentEndpoint"
 echo "Deployment Model: $deploymentModel"
+echo "Solution Name: $solutionName"
 echo "==============================================="
 echo ""
 
@@ -596,7 +600,7 @@ echo "copy_kb_files.sh completed successfully."
 # Call run_create_index_scripts.sh
 echo "Running run_create_index_scripts.sh"
 # Pass all required environment variables and backend managed identity info for role assignment
-bash "$SCRIPT_DIR/run_create_index_scripts.sh" "$resourceGroupName" "$aiSearchName" "$searchEndpoint" "$sqlServerName" "$SqlDatabaseName" "$backendUserMidDisplayName" "$backendUserMidClientId" "$storageAccountName" "$openaiEndpoint" "$deploymentModel" "$embeddingModel" "$cuEndpoint" "$cuApiVersion" "$aif_resource_id" "$cu_foundry_resource_id" "$aiAgentEndpoint" "$usecase"
+bash "$SCRIPT_DIR/run_create_index_scripts.sh" "$resourceGroupName" "$aiSearchName" "$searchEndpoint" "$sqlServerName" "$SqlDatabaseName" "$backendUserMidDisplayName" "$backendUserMidClientId" "$storageAccountName" "$openaiEndpoint" "$deploymentModel" "$embeddingModel" "$cuEndpoint" "$cuApiVersion" "$aif_resource_id" "$cu_foundry_resource_id" "$aiAgentEndpoint" "$usecase" "$solutionName"
 if [ $? -ne 0 ]; then
     echo "Error: run_create_index_scripts.sh failed."
     exit 1
diff --git a/infra/scripts/run_create_index_scripts.sh b/infra/scripts/run_create_index_scripts.sh
index 882c48d66..bf1ac121d 100644
--- a/infra/scripts/run_create_index_scripts.sh
+++ b/infra/scripts/run_create_index_scripts.sh
@@ -21,6 +21,7 @@ aif_resource_id="${14}"
 cu_foundry_resource_id="${15}"
 ai_agent_endpoint="${16}"
 usecase="${17}"
+solution_name="${18}"
 
 pythonScriptPath="$SCRIPT_DIR/index_scripts/"
 
@@ -134,7 +135,7 @@ fi
 
 echo "✓ Processing data with CU"
 sql_server_fqdn="$sqlServerName.database.windows.net"
-python ${pythonScriptPath}03_cu_process_data_text.py --search_endpoint="$search_endpoint" --ai_project_endpoint="$ai_agent_endpoint" --deployment_model="$deployment_model" --embedding_model="$embedding_model" --storage_account_name="$storageAccountName" --sql_server="$sql_server_fqdn" --sql_database="$sqlDatabaseName" --cu_endpoint="$cu_endpoint" --cu_api_version="$cu_api_version" --usecase="$usecase"
+python ${pythonScriptPath}03_cu_process_data_text.py --search_endpoint="$search_endpoint" --ai_project_endpoint="$ai_agent_endpoint" --deployment_model="$deployment_model" --embedding_model="$embedding_model" --storage_account_name="$storageAccountName" --sql_server="$sql_server_fqdn" --sql_database="$sqlDatabaseName" --cu_endpoint="$cu_endpoint" --cu_api_version="$cu_api_version" --usecase="$usecase" --solution_name="$solution_name"
 if [ $? -ne 0 ]; then
     echo "Error: 03_cu_process_data_text.py failed."
     error_flag=true