# Incorporate evaluation script and GitHub workflow #103
base: `main` · Changes from 3 commits
**New GitHub Actions workflow** (in `.github/workflows/`, +33 lines):

```yaml
name: "AI Agent Evaluation"

on:
  workflow_dispatch:
  push:
    branches:
      - main

permissions:
  id-token: write
  contents: read

jobs:
  run-action:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Azure login using Federated Credentials
        uses: azure/login@v2
        with:
          client-id: ${{ vars.AZURE_CLIENT_ID }}
          tenant-id: ${{ vars.AZURE_TENANT_ID }}
          subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }}

      - name: Run Evaluation
        uses: microsoft/ai-agent-evals@v1-beta
        with:
          azure-aiproject-connection-string: ${{ vars.AZURE_EXISTING_AIPROJECT_CONNECTION_STRING || vars.AZURE_AIPROJECT_CONNECTION_STRING }}
          deployment-name: ${{ vars.AZURE_AI_AGENT_DEPLOYMENT_NAME }}
          agent-ids: ${{ vars.AZURE_EXISTING_AGENT_ID || vars.AZURE_AI_AGENT_ID }}
          data-path: ${{ github.workspace }}/evals/test-data.json
```
**New evaluation script** (in `evals/`, +167 lines):

```python
from azure.ai.projects import AIProjectClient
from azure.ai.projects.models import Agent, ConnectionType, MessageRole, RunStatus
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import AIAgentConverter, evaluate, FluencyEvaluator, ToolCallAccuracyEvaluator, IntentResolutionEvaluator, TaskAdherenceEvaluator

import os
import time
import json
from pathlib import Path
from dotenv import load_dotenv


def run_evaluation():
    """Demonstrate how to evaluate an AI agent using the Azure AI Project SDK"""
    current_dir = Path(__file__).parent
    eval_input_path = current_dir / "eval-input.jsonl"
    eval_output_path = current_dir / "eval-output.json"

    env_path = current_dir / "../src/.env"
    load_dotenv(dotenv_path=env_path)

    # Get AI project parameters from environment variables
    AZURE_AIPROJECT_CONNECTION_STRING = (
        os.environ.get("AZURE_EXISTING_AIPROJECT_CONNECTION_STRING") or
        os.environ.get("AZURE_AIPROJECT_CONNECTION_STRING")
    )
    AZURE_AI_AGENT_DEPLOYMENT_NAME = os.getenv("AZURE_AI_AGENT_DEPLOYMENT_NAME")
    API_VERSION = os.getenv("API_VERSION") or ""
    AGENT_ID = (
        os.environ.get("AZURE_EXISTING_AGENT_ID") or
        os.environ.get("AZURE_AI_AGENT_ID")
    )

    # Initialize the AIProjectClient and related entities
    project_client = AIProjectClient.from_connection_string(
        AZURE_AIPROJECT_CONNECTION_STRING,
        credential=DefaultAzureCredential()
    )
    default_connection = project_client.connections.get_default(
        connection_type=ConnectionType.AZURE_OPEN_AI, include_credentials=True
    )
    model_config = default_connection.to_evaluator_model_config(
        deployment_name=AZURE_AI_AGENT_DEPLOYMENT_NAME,
        api_version=API_VERSION,
        include_credentials=True,
    )
    agent = project_client.agents.get_agent(AGENT_ID)
    thread_data_converter = AIAgentConverter(project_client)

    # Read data input file
    with open(current_dir / "test-data.json", "r", encoding="utf-8") as f:
        test_data = json.load(f)

    # Execute the test data against the agent and prepare the evaluation input
    with open(eval_input_path, "w", encoding="utf-8") as f:
        for row in test_data:
            # Create a new thread for each query to isolate conversations
            thread = project_client.agents.create_thread()

            # Send the user query
            project_client.agents.create_message(
                thread.id, role=MessageRole.USER, content=row.get("query")
            )

            # Run the agent and measure performance
            start_time = time.time()
            run = project_client.agents.create_and_process_run(
                thread_id=thread.id, agent_id=agent.id
            )
            end_time = time.time()

            if run.status != RunStatus.COMPLETED:
                raise ValueError(run.last_error or "Run failed to complete")

            metrics = {
                "server-run-duration-in-seconds": (
                    run.completed_at - run.created_at
                ).total_seconds(),
                "client-run-duration-in-seconds": end_time - start_time,
                "completion-tokens": run.usage.completion_tokens,
                "prompt-tokens": run.usage.prompt_tokens,
                "ground-truth": row.get("ground-truth", "")
            }

            # Add thread data + operational metrics to the evaluation input
            evaluation_data = thread_data_converter.prepare_evaluation_data(thread_ids=thread.id)
            eval_item = evaluation_data[0]
            eval_item["metrics"] = metrics
            f.write(json.dumps(eval_item) + "\n")

    # Now, run a sample set of evaluators using the evaluation input
    # See https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/agent-evaluate-sdk
    # for the full list of evaluators available
    tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)
    intent_resolution = IntentResolutionEvaluator(model_config=model_config)
    task_adherence = TaskAdherenceEvaluator(model_config=model_config)
    results = evaluate(
        data=eval_input_path,
        evaluators={
            "tool_call_accuracy": tool_call_accuracy,
            "intent_resolution": intent_resolution,
            "task_adherence": task_adherence,
            "operational_metrics": OperationalMetricsEvaluator(),
        },
        # azure_ai_project=project_client.scope,  # uncomment to upload results to AI Foundry
        output_path=eval_output_path
    )

    # Print the evaluation results
    print_eval_results(results, eval_input_path, eval_output_path)

    return results


class OperationalMetricsEvaluator:
    """Propagate operational metrics to the final evaluation results"""

    def __init__(self):
        pass

    def __call__(self, *, metrics: dict, **kwargs):
        return metrics


def print_eval_results(results, input_path, output_path):
    """Print the evaluation results in a formatted table"""
    metrics = results.get("metrics", {})

    # Get the maximum length for formatting
    key_len = max(len(key) for key in metrics.keys()) + 5
    value_len = 20
    full_len = key_len + value_len + 5

    # Format the header
    print("\n" + "=" * full_len)
    print("Evaluation Results".center(full_len))
    print("=" * full_len)

    # Print each metric
    print(f"{'Metric':<{key_len}} | {'Value'}")
    print("-" * key_len + "-+-" + "-" * value_len)

    for key, value in metrics.items():
        if isinstance(value, float):
            formatted_value = f"{value:.2f}"
        else:
            formatted_value = str(value)

        print(f"{key:<{key_len}} | {formatted_value}")

    print("=" * full_len + "\n")

    # Print additional information
    print(f"Evaluation input: {input_path}")
    print(f"Evaluation output: {output_path}")
    if "studio_url" in results:
        print(f"AI Foundry URL: {results['studio_url']}")

    print("\n" + "=" * full_len + "\n")


if __name__ == "__main__":
    try:
        run_evaluation()
    except Exception as e:
        print(f"Error during evaluation: {e}")
```

> **Review comment** (on `results = evaluate(...)`): Need to make sure both the user (whoever did `azd up`) and the workflow identity have access to storage, otherwise uploading to AI Foundry won't work out of the box.
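On the storage comment above: if provisioning doesn't already grant it, one way to unblock the AI Foundry upload is a role assignment on the project's storage account. Below is a minimal sketch, assuming the `azure-mgmt-authorization` package and the built-in Storage Blob Data Contributor role; every resource ID and principal ID is a placeholder, not something defined in this PR:

```python
# Hypothetical sketch, not part of this PR: grant a principal
# "Storage Blob Data Contributor" on the project's storage account so
# evaluate(..., azure_ai_project=...) can upload results.
import uuid

from azure.identity import DefaultAzureCredential
from azure.mgmt.authorization import AuthorizationManagementClient
from azure.mgmt.authorization.models import RoleAssignmentCreateParameters

SUBSCRIPTION_ID = "<subscription-id>"  # placeholder
STORAGE_SCOPE = (  # placeholder resource ID of the project's storage account
    f"/subscriptions/{SUBSCRIPTION_ID}/resourceGroups/<resource-group>"
    "/providers/Microsoft.Storage/storageAccounts/<storage-account>"
)
# GUID of the built-in "Storage Blob Data Contributor" role
ROLE_DEFINITION_ID = (
    f"/subscriptions/{SUBSCRIPTION_ID}/providers/Microsoft.Authorization"
    "/roleDefinitions/ba92f5b4-2d11-453d-a403-e96b0029c9fe"
)

client = AuthorizationManagementClient(DefaultAzureCredential(), SUBSCRIPTION_ID)
client.role_assignments.create(
    scope=STORAGE_SCOPE,
    role_assignment_name=str(uuid.uuid4()),  # assignment names must be GUIDs
    parameters=RoleAssignmentCreateParameters(
        role_definition_id=ROLE_DEFINITION_ID,
        principal_id="<user-or-workflow-object-id>",  # placeholder
    ),
)
```

In an azd template the same assignment would more typically live in the Bicep files, but the effect is identical.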
**New test data file** (`evals/test-data.json`, +10 lines):

```json
[
  {
    "query": "What is the capital of France?",
    "ground-truth": "The capital of France is Paris."
  },
  {
    "query": "What is the capital of Japan?",
    "ground-truth": "The capital of Japan is Tokyo."
  }
]
```

> **Review comment**: TODO: Update to queries more relevant for this app.
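One gap worth noting: the script stores each row's `ground-truth` in the operational metrics, but none of the sample evaluators consume it. A hypothetical custom evaluator following the same callable pattern as `OperationalMetricsEvaluator` could close that loop; the assumed shape of the `response` argument below is based on the converter's message format and should be verified:

```python
class GroundTruthMatchEvaluator:
    """Hypothetical sketch: score 1.0 when the agent's response text
    exactly matches the ground truth captured in the operational metrics."""

    def __call__(self, *, response, metrics: dict, **kwargs):
        ground_truth = metrics.get("ground-truth", "")
        # Assumption: `response` is a list of assistant messages whose
        # "content" is a list of {"type": "text", "text": ...} parts, as
        # produced by AIAgentConverter; fall back to str() otherwise.
        if isinstance(response, list):
            text = " ".join(
                part.get("text", "")
                for message in response
                for part in message.get("content", [])
                if isinstance(part, dict)
            )
        else:
            text = str(response)
        return {"exact_match": 1.0 if text.strip() == ground_truth.strip() else 0.0}
```

Registering it would be a one-line addition to the `evaluators` dict, e.g. `"ground_truth_match": GroundTruthMatchEvaluator()`.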
> **Review comment**: Can we make sure `AZURE_EXISTING_AGENT_ID` gets written to `.env` during startup?
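For reference, a startup hook could do something like the following. The function name and the hook wiring (e.g., an azd `postprovision` script) are assumptions; the `src/.env` path matches what the evaluation script loads:

```python
# Hypothetical sketch: persist the agent ID to src/.env so local runs of the
# evaluation script pick it up via load_dotenv().
from pathlib import Path


def write_agent_id_to_env(agent_id: str, env_path: Path = Path("src/.env")) -> None:
    """Append or update AZURE_EXISTING_AGENT_ID in the given .env file."""
    lines = []
    if env_path.exists():
        # Drop any stale value before appending the fresh one.
        lines = [
            line for line in env_path.read_text(encoding="utf-8").splitlines()
            if not line.startswith("AZURE_EXISTING_AGENT_ID=")
        ]
    lines.append(f"AZURE_EXISTING_AGENT_ID={agent_id}")
    env_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
```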