Incorporate evaluation script and GitHub workflow #103

Draft: wants to merge 4 commits into main
33 changes: 33 additions & 0 deletions .github/workflows/ai-evaluation.yml
@@ -0,0 +1,33 @@
name: "AI Agent Evaluation"

on:
workflow_dispatch:
push:
branches:
- main

permissions:
id-token: write
contents: read

jobs:
run-action:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Azure login using Federated Credentials
uses: azure/login@v2
with:
client-id: ${{ vars.AZURE_CLIENT_ID }}
tenant-id: ${{ vars.AZURE_TENANT_ID }}
subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }}

- name: Run Evaluation
uses: microsoft/ai-agent-evals@v1-beta
with:
azure-aiproject-connection-string: ${{ vars.AZURE_EXISTING_AIPROJECT_CONNECTION_STRING || vars.AZURE_AIPROJECT_CONNECTION_STRING }}
deployment-name: ${{ vars.AZURE_AI_AGENT_DEPLOYMENT_NAME }}
agent-ids: ${{ vars.AZURE_EXISTING_AGENT_ID || vars.AZURE_AI_AGENT_ID }}
Review comment (Contributor Author): Can we make sure AZURE_EXISTING_AGENT_ID gets written to .env during startup? (A possible sketch follows the workflow file below.)

          data-path: ${{ github.workspace }}/evals/test-data.json
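In response to the review comment above about persisting AZURE_EXISTING_AGENT_ID, here is a minimal sketch of a startup or post-provision helper that could write the agent ID into src/.env so this workflow and evals/evaluate.py can pick it up. The function name and call site are illustrative assumptions, not part of this PR:

# Hypothetical helper (not in this PR): persist the created agent's ID to src/.env
from pathlib import Path

def write_agent_id_to_dotenv(agent_id: str, env_path: Path = Path("src/.env")) -> None:
    """Add or replace the AZURE_EXISTING_AGENT_ID entry in the given .env file."""
    lines = []
    if env_path.exists():
        # Keep every existing entry except a stale AZURE_EXISTING_AGENT_ID line.
        lines = [
            line
            for line in env_path.read_text(encoding="utf-8").splitlines()
            if not line.startswith("AZURE_EXISTING_AGENT_ID=")
        ]
    lines.append(f"AZURE_EXISTING_AGENT_ID={agent_id}")
    env_path.write_text("\n".join(lines) + "\n", encoding="utf-8")

# Example call from the app's startup code, right after the agent is created:
# write_agent_id_to_dotenv(agent.id)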
3 changes: 2 additions & 1 deletion azure.yaml
@@ -50,4 +50,5 @@ pipeline:
- AZURE_AI_EMBED_MODEL_VERSION
- AZURE_AI_EMBED_DIMENSIONS
- AZURE_AI_SEARCH_INDEX_NAME
- AZURE_EXISTING_AIPROJECT_CONNECTION_STRING
- AZURE_EXISTING_AIPROJECT_CONNECTION_STRING
- AZURE_EXISTING_AGENT_ID
167 changes: 167 additions & 0 deletions evals/evaluate.py
@@ -0,0 +1,167 @@
from azure.ai.projects import AIProjectClient
from azure.ai.projects.models import ConnectionType, MessageRole, RunStatus
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import AIAgentConverter, evaluate, ToolCallAccuracyEvaluator, IntentResolutionEvaluator, TaskAdherenceEvaluator

import os
import time
import json
from pathlib import Path
from dotenv import load_dotenv

def run_evaluation():
    """Demonstrate how to evaluate an AI agent using the Azure AI Project SDK"""
    current_dir = Path(__file__).parent
    eval_input_path = current_dir / "eval-input.jsonl"
    eval_output_path = current_dir / "eval-output.json"

    env_path = current_dir / "../src/.env"
    load_dotenv(dotenv_path=env_path)

    # Get AI project parameters from environment variables
    AZURE_AIPROJECT_CONNECTION_STRING = (
        os.environ.get("AZURE_EXISTING_AIPROJECT_CONNECTION_STRING") or
        os.environ.get("AZURE_AIPROJECT_CONNECTION_STRING")
    )
    AZURE_AI_AGENT_DEPLOYMENT_NAME = os.getenv("AZURE_AI_AGENT_DEPLOYMENT_NAME")
    API_VERSION = os.getenv("API_VERSION") or ""
    AGENT_ID = (
        os.environ.get("AZURE_EXISTING_AGENT_ID") or
        os.environ.get("AZURE_AI_AGENT_ID")
    )

    # Initialize the AIProjectClient and related entities
    project_client = AIProjectClient.from_connection_string(
        AZURE_AIPROJECT_CONNECTION_STRING,
        credential=DefaultAzureCredential()
    )
    default_connection = project_client.connections.get_default(
        connection_type=ConnectionType.AZURE_OPEN_AI, include_credentials=True
    )
    model_config = default_connection.to_evaluator_model_config(
        deployment_name=AZURE_AI_AGENT_DEPLOYMENT_NAME,
        api_version=API_VERSION,
        include_credentials=True,
    )
    agent = project_client.agents.get_agent(AGENT_ID)
    thread_data_converter = AIAgentConverter(project_client)

    # Read data input file
    with open(current_dir / "test-data.json", "r", encoding="utf-8") as f:
        test_data = json.load(f)

    # Execute the test data against the agent and prepare the evaluation input
    with open(eval_input_path, "w", encoding="utf-8") as f:

        for row in test_data:
            # Create a new thread for each query to isolate conversations
            thread = project_client.agents.create_thread()

            # Send the user query
            project_client.agents.create_message(
                thread.id, role=MessageRole.USER, content=row.get("query")
            )

            # Run the agent and measure performance
            start_time = time.time()
            run = project_client.agents.create_and_process_run(
                thread_id=thread.id, agent_id=agent.id
            )
            end_time = time.time()

            if run.status != RunStatus.COMPLETED:
                raise ValueError(run.last_error or "Run failed to complete")

            metrics = {
                "server-run-duration-in-seconds": (
                    run.completed_at - run.created_at
                ).total_seconds(),
                "client-run-duration-in-seconds": end_time - start_time,
                "completion-tokens": run.usage.completion_tokens,
                "prompt-tokens": run.usage.prompt_tokens,
                "ground-truth": row.get("ground-truth", "")
            }

            # Add thread data + operational metrics to the evaluation input
            evaluation_data = thread_data_converter.prepare_evaluation_data(thread_ids=thread.id)
            eval_item = evaluation_data[0]
            eval_item["metrics"] = metrics
            f.write(json.dumps(eval_item) + "\n")


    # Now, run a sample set of evaluators using the evaluation input
    # See https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/agent-evaluate-sdk
    # for the full list of evaluators available
    tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)
    intent_resolution = IntentResolutionEvaluator(model_config=model_config)
    task_adherence = TaskAdherenceEvaluator(model_config=model_config)
    results = evaluate(
Review comment (Contributor Author): Need to make sure the user (whoever did azd up) has access to storage, otherwise uploading results to AI Foundry won't work out of the box. (See the role-assignment sketch after this file's diff.)

        data=eval_input_path,
        evaluators={
            "tool_call_accuracy": tool_call_accuracy,
            "intent_resolution": intent_resolution,
            "task_adherence": task_adherence,
            "operational_metrics": OperationalMetricsEvaluator(),
        },
        # azure_ai_project=project_client.scope,  # uncomment to upload result to AI Foundry
        output_path=eval_output_path
    )

    # Print the evaluation results
    print_eval_results(results, eval_input_path, eval_output_path)

    return results


class OperationalMetricsEvaluator:
    """Propagate operational metrics to the final evaluation results"""

    def __init__(self):
        pass

    def __call__(self, *, metrics: dict, **kwargs):
        return metrics


def print_eval_results(results, input_path, output_path):
    """Print the evaluation results in a formatted table"""
    metrics = results.get("metrics", {})

    # Get the maximum length for formatting
    key_len = max(len(key) for key in metrics.keys()) + 5
    value_len = 20
    full_len = key_len + value_len + 5

    # Format the header
    print("\n" + "=" * full_len)
    print("Evaluation Results".center(full_len))
    print("=" * full_len)

    # Print each metric
    print(f"{'Metric':<{key_len}} | {'Value'}")
    print("-" * (key_len) + "-+-" + "-" * value_len)

    for key, value in metrics.items():
        if isinstance(value, float):
            formatted_value = f"{value:.2f}"
        else:
            formatted_value = str(value)

        print(f"{key:<{key_len}} | {formatted_value}")

    print("=" * full_len + "\n")

    # Print additional information
    print(f"Evaluation input: {input_path}")
    print(f"Evaluation output: {output_path}")
    if "studio_url" in results:
        print(f"AI Foundry URL: {results['studio_url']}")

    print("\n" + "=" * full_len + "\n")


if __name__ == "__main__":
    try:
        run_evaluation()
    except Exception as e:
        print(f"Error during evaluation: {e}")


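Regarding the review comment above about storage access: one possible way to unblock uploads to AI Foundry is to grant whoever runs the evaluation the "Storage Blob Data Contributor" role on the project's storage account. The helper below is an illustrative sketch only (it shells out to the Azure CLI and assumes az is installed and logged in); the same assignment could instead live in the Bicep templates or an azd postprovision hook:

# Hypothetical helper (not in this PR): grant blob access so evaluate() can
# upload results to AI Foundry when azure_ai_project is passed in.
import subprocess

def grant_storage_access(principal_object_id: str, storage_account_resource_id: str) -> None:
    """Assign Storage Blob Data Contributor on the storage account via the Azure CLI."""
    subprocess.run(
        [
            "az", "role", "assignment", "create",
            "--assignee-object-id", principal_object_id,
            "--assignee-principal-type", "User",
            "--role", "Storage Blob Data Contributor",
            "--scope", storage_account_resource_id,
        ],
        check=True,
    )

Once src/.env is populated, the evaluation itself can be run locally with python evals/evaluate.py.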
10 changes: 10 additions & 0 deletions evals/test-data.json
@@ -0,0 +1,10 @@
[
  {
Review comment (Contributor Author):
TODO: Update to queries more relevant for this app

"query": "What is the capital of France?",
"ground-truth": "The capital of France is Paris."
},
{
"query": "What is the capital of Japan?",
"ground-truth": "The capital of Japan is Tokyo."
}
]