diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_coherence.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_coherence.py new file mode 100644 index 000000000000..ce910cb5c9f7 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_coherence.py @@ -0,0 +1,188 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + using inline dataset content. + +USAGE: + python sample_coherence.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. +""" + +from dotenv import load_dotenv +import json +import os +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "type": "string" + }, + "response": { + "type": "string" + } + }, + "required": [] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "coherence", + "evaluator_name": "builtin.coherence", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Coherence Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Sample inline data + success_query = "What is the capital of France?" + success_response = "The capital of France is Paris." + + # Failure example - incoherent response + failure_query = "What is the capital of France?" + failure_response = "France capital is... well, the city where government sits is Paris but no wait, Lyon is bigger actually maybe Rome? 
The French people live in many cities but the main one, I think it's definitely Paris or maybe not, depends on what you mean by capital." + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + # Success example - coherent response + SourceFileContentContent( + item= { + "query": success_query, + "response": success_response + } + ), + # Failure example - incoherent response + SourceFileContentContent( + item= { + "query": failure_query, + "response": failure_response + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_fluency.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_fluency.py new file mode 100644 index 000000000000..3fa939bdfe11 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_fluency.py @@ -0,0 +1,176 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + using inline dataset content. + +USAGE: + python sample_fluency.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. 
+    2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation.
+"""
+
+from dotenv import load_dotenv
+import os
+import json
+import time
+
+from azure.identity import DefaultAzureCredential
+from azure.ai.projects import AIProjectClient
+from openai.types.evals.create_eval_jsonl_run_data_source_param import (
+    CreateEvalJSONLRunDataSourceParam,
+    SourceFileContent,
+    SourceFileContentContent
+)
+
+
+load_dotenv()
+
+
+def main() -> None:
+    endpoint = os.environ[
+        "AZURE_AI_PROJECT_ENDPOINT"
+    ]  # Sample : https://<account-name>.services.ai.azure.com/api/projects/<project-name>
+    model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "")  # Sample : gpt-4o-mini
+
+    with DefaultAzureCredential() as credential:
+        with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client:
+            print("Creating an OpenAI client from the AI Project client")
+
+            client = project_client.get_openai_client()
+            client._custom_query = {"api-version": "2025-11-15-preview"}
+
+            data_source_config = {
+                "type": "custom",
+                "item_schema": {
+                    "type": "object",
+                    "properties": {
+                        "query": {
+                            "type": "string"
+                        },
+                        "response": {
+                            "type": "string"
+                        }
+                    },
+                    "required": []
+                },
+                "include_sample_schema": True
+            }
+
+            testing_criteria = [
+                {
+                    "type": "azure_ai_evaluator",
+                    "name": "fluency",
+                    "evaluator_name": "builtin.fluency",
+                    "initialization_parameters": {
+                        "deployment_name": f"{model_deployment_name}"
+                    },
+                    "data_mapping": {
+                        "query": "{{item.query}}",
+                        "response": "{{item.response}}"
+                    }
+                }
+            ]
+
+            print("Creating Eval Group")
+            eval_object = client.evals.create(
+                name="Test Fluency Evaluator with inline data",
+                data_source_config=data_source_config,
+                testing_criteria=testing_criteria,
+            )
+            print("Eval Group created")
+
+            print("Get Eval Group by Id")
+            eval_object_response = client.evals.retrieve(eval_object.id)
+            print("Eval Group Response:")
+            pprint(eval_object_response)
+
+            # Sample inline data
+            query = "What is the capital of France?"
+            response = "The capital of France is Paris."
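+            # Each SourceFileContentContent entry passed below becomes one row of the
+            # inline JSONL dataset; the data_mapping above maps {{item.query}} and
+            # {{item.response}} from each row into the fluency evaluator's inputs.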
+ + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + SourceFileContentContent( + item= { + "query": query, + "response": response + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_generic_agentic_evaluator/sample_generic_agentic_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_generic_agentic_evaluator/sample_generic_agentic_evaluator.py new file mode 100644 index 000000000000..f794ba0a8eba --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_generic_agentic_evaluator/sample_generic_agentic_evaluator.py @@ -0,0 +1,76 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Any agentic evaluator using inline dataset content. + +USAGE: + python sample_generic_agentic_evaluator.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. 
+""" + +from dotenv import load_dotenv +import os +from utils import run_evaluator +from schema_mappings import evaluator_to_data_source_config, evaluator_to_data_mapping +from openai.types.evals.create_eval_jsonl_run_data_source_param import SourceFileContentContent + + +load_dotenv() + + +def _get_evaluator_initialization_parameters(evaluator_name: str) -> dict[str, str]: + if evaluator_name == "task_navigation_efficiency": + return { + "matching_mode": "exact_match" # Can be "exact_match", "in_order_match", or "any_order_match" + } + else: + model_deployment_name = os.environ.get( + "AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + return { + "deployment_name": model_deployment_name + } + + +def _get_evaluation_contents() -> list[SourceFileContentContent]: + # Sample inline data + # Change this to add more examples for evaluation + # Use the appropriate schema based on the evaluator being used + success_query = "What is the capital of France?" + success_response = "The capital of France is Paris." + + evaluation_contents = [SourceFileContentContent( + item={ + "query": success_query, + "response": success_response + } + )] + + return evaluation_contents + +def main() -> None: + evaluator_name = "coherence" # Change to any agentic evaluator name like "relevance", "response_completeness", "task_navigation_efficiency" + data_source_config = evaluator_to_data_source_config[evaluator_name] + initialization_parameters = _get_evaluator_initialization_parameters(evaluator_name) + data_mapping = evaluator_to_data_mapping[evaluator_name] + evaluation_contents = _get_evaluation_contents() + + run_evaluator(evaluator_name, evaluation_contents, data_source_config, initialization_parameters, data_mapping) + + +if __name__ == "__main__": + main() diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_generic_agentic_evaluator/schema_mappings.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_generic_agentic_evaluator/schema_mappings.py new file mode 100644 index 000000000000..fb40df066694 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_generic_agentic_evaluator/schema_mappings.py @@ -0,0 +1,540 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# ------------------------------------ + +evaluator_to_data_source_config = { + "coherence": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "type": "string" + }, + "response": { + "type": "string" + } + }, + "required": [] + }, + "include_sample_schema": True + }, + "fluency": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "type": "string" + }, + "response": { + "type": "string" + } + }, + "required": [] + }, + "include_sample_schema": True + }, + "groundedness": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "context": { + "type": "string" + }, + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "string"}, + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["response"] + }, + "include_sample_schema": True + }, + "intent_resolution": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response"] + }, + "include_sample_schema": True + }, + "relevance": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "type": "string" + }, + "response": { + "type": "string" + } + }, + "required": ["query", "response"] + }, + "include_sample_schema": True + }, + "response_completeness": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "ground_truth": { + "type": "string" + }, + "response": { + "type": "string" + } + }, + "required": ["ground_truth", "response"] + }, + "include_sample_schema": True + }, + "task_adherence": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response"] + }, + "include_sample_schema": True + }, + "task_completion": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response"] + }, + "include_sample_schema": True + }, + "tool_call_accuracy": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": 
"object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_calls": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "tool_definitions"] + }, + "include_sample_schema": True + }, + "tool_input_accuracy": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response", "tool_definitions"] + }, + "include_sample_schema": True + }, + "tool_output_utilization": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response"] + }, + "include_sample_schema": True + }, + "tool_selection": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_calls": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response", "tool_definitions"] + }, + "include_sample_schema": True + }, + "tool_success": { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["response"] + }, + "include_sample_schema": True + } +} + +evaluator_to_data_mapping = { + "coherence": { + "query": "{{item.query}}", + "response": "{{item.response}}" + }, + "fluency": { + "query": "{{item.query}}", + "response": "{{item.response}}" + }, + "groundedness": { + "context": "{{item.context}}", + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + }, + "intent_resolution": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + }, + "relevance": { + "query": "{{item.query}}", + "response": "{{item.response}}" + }, + "response_completeness": { + "ground_truth": "{{item.ground_truth}}", + "response": "{{item.response}}" + }, + "task_adherence": { + "query": "{{item.query}}", + "response": 
"{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + }, + "task_completion": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + }, + "tool_call_accuracy": { + "query": "{{item.query}}", + "tool_definitions": "{{item.tool_definitions}}", + "tool_calls": "{{item.tool_calls}}", + "response": "{{item.response}}" + }, + "tool_input_accuracy": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + }, + "tool_output_utilization": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + }, + "tool_selection": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_calls": "{{item.tool_calls}}", + "tool_definitions": "{{item.tool_definitions}}" + }, + "tool_success": { + "tool_definitions": "{{item.tool_definitions}}", + "response": "{{item.response}}" + } +} diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_generic_agentic_evaluator/utils.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_generic_agentic_evaluator/utils.py new file mode 100644 index 000000000000..6d55afd55fb4 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_generic_agentic_evaluator/utils.py @@ -0,0 +1,123 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +from dotenv import load_dotenv +import json +import os +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def run_evaluator(evaluator_name: str, evaluation_contents: list[SourceFileContentContent], data_source_config: dict, initialization_parameters: dict[str, str], data_mapping: dict[str, str]) -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": f"{evaluator_name}", + "evaluator_name": f"builtin.{evaluator_name}", + "initialization_parameters": initialization_parameters, + "data_mapping": data_mapping + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name=f"Test {evaluator_name} Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + 
data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content=evaluation_contents + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve( + run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve( + run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_groundedness.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_groundedness.py new file mode 100644 index 000000000000..a0d866d52a30 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_groundedness.py @@ -0,0 +1,355 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Groundedness evaluator using inline dataset content. + +USAGE: + python sample_groundedness.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. 
+""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "context": { + "type": "string" + }, + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "string"}, + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["response"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "groundedness", + "evaluator_name": "builtin.groundedness", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "context": "{{item.context}}", + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Groundedness Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Success example - response grounded in context + success_context = ( + "France, a country in Western Europe, is known for its rich history and cultural heritage. " + "The city of Paris, located in the northern part of the country, serves as its capital. " + "Paris is renowned for its art, fashion, and landmarks such as the Eiffel Tower and the Louvre Museum." + ) + success_response = "Paris is the capital of France." + + # Failure example - response not grounded in context + failure_context = ( + "France, a country in Western Europe, is known for its rich history and cultural heritage. " + "The city of Paris, located in the northern part of the country, serves as its capital. " + "Paris is renowned for its art, fashion, and landmarks such as the Eiffel Tower and the Louvre Museum." + ) + failure_response = "London is the capital of France and has a population of over 10 million people." + + # Simple example with query + simple_query = "What is the population of Tokyo?" + simple_context = "Tokyo, the capital of Japan, has a population of approximately 14 million people in the city proper and 38 million in the greater metropolitan area." 
+ simple_response = "According to the information provided, Tokyo has approximately 14 million people in the city proper and 38 million in the greater metropolitan area." + + # Complex example - conversation format with grounded response + complex_context = "Weather service provides current weather information for any location." + complex_response = [ + { + "createdAt": "2025-03-26T17:27:35Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_CUdbkBfvVBla2YP3p24uhElJ", + "name": "fetch_weather", + "arguments": { + "location": "Seattle" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:37Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "tool_call_id": "call_CUdbkBfvVBla2YP3p24uhElJ", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "weather": "Rainy, 14°C" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:42Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The current weather in Seattle is rainy with a temperature of 14°C." + } + ] + } + ] + + complex_tool_definitions = [ + { + "name": "fetch_weather", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for." + } + } + } + } + ] + + # Another complex example - conversation format with query but no tool calls + query_conversation_context = "The company's employee handbook states that vacation days must be requested at least 2 weeks in advance and approved by your direct supervisor." + query_conversation_query = [ + { + "createdAt": "2025-03-26T17:30:00Z", + "run_id": "run_ABC123DEF456", + "role": "user", + "content": [ + { + "type": "text", + "text": "What's the policy for requesting vacation days?" + } + ] + } + ] + query_conversation_response = [ + { + "createdAt": "2025-03-26T17:30:05Z", + "run_id": "run_ABC123DEF456", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "According to the employee handbook, vacation days must be requested at least 2 weeks in advance and need approval from your direct supervisor." 
+ } + ] + } + ] + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + # Success example - grounded response + SourceFileContentContent( + item= { + "context": success_context, + "response": success_response, + "query": None, + "tool_definitions": None + } + ), + # Failure example - ungrounded response + SourceFileContentContent( + item= { + "context": failure_context, + "response": failure_response, + "query": None, + "tool_definitions": None + } + ), + # Simple example with query + SourceFileContentContent( + item= { + "context": simple_context, + "query": simple_query, + "response": simple_response, + "tool_definitions": None + } + ), + # Complex example - conversation format with grounded response + SourceFileContentContent( + item= { + "context": complex_context, + "response": complex_response, + "query": None, + "tool_definitions": complex_tool_definitions + } + ), + # Another complex example - conversation format with query but no tool calls + SourceFileContentContent( + item= { + "context": query_conversation_context, + "query": query_conversation_query, + "response": query_conversation_response, + "tool_definitions": None + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_intent_resolution.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_intent_resolution.py new file mode 100644 index 000000000000..4d2b90b0b214 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_intent_resolution.py @@ -0,0 +1,347 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. 
+# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Intent Resolution evaluator using inline dataset content. + +USAGE: + python sample_intent_resolution.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. +""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "intent_resolution", + "evaluator_name": "builtin.intent_resolution", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Intent Resolution Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Success example - Intent is identified and understood and the response correctly resolves user intent + success_query = "What are the opening hours of the Eiffel Tower?" + success_response = "Opening hours of the Eiffel Tower are 9:00 AM to 11:00 PM." + + # Failure example - Even though intent is correctly identified, the response does not resolve the user intent + failure_query = "What is the opening hours of the Eiffel Tower?" 
+ failure_response = "Please check the official website for the up-to-date information on Eiffel Tower opening hours." + + # Complex conversation example with tool calls + complex_query = [ + {"role": "system", "content": "You are a friendly and helpful customer service agent."}, + { + "createdAt": "2025-03-14T06:14:20Z", + "role": "user", + "content": [ + { + "type": "text", + "text": "Hi, I need help with my order #123 status?", + } + ], + }, + ] + + complex_response = [ + { + "createdAt": "2025-03-14T06:14:30Z", + "run_id": "0", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "tool_call_001", + "name": "get_order", + "arguments": {"order_id": "123"}, + } + ], + }, + { + "createdAt": "2025-03-14T06:14:35Z", + "run_id": "0", + "tool_call_id": "tool_call_001", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }', + } + ], + }, + { + "createdAt": "2025-03-14T06:14:40Z", + "run_id": "0", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "tool_call_002", + "name": "get_tracking", + "arguments": {"order_id": "123"}, + } + ], + }, + { + "createdAt": "2025-03-14T06:14:45Z", + "run_id": "0", + "tool_call_id": "tool_call_002", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": '{ "tracking_number": "ABC123", "carrier": "UPS" }', + } + ], + }, + { + "createdAt": "2025-03-14T06:14:50Z", + "run_id": "0", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "Your order #123 has been shipped and is expected to be delivered on March 15, 2025. The tracking number is ABC123 with UPS.", + } + ], + }, + ] + + # Tool definitions for the complex example + tool_definitions = [ + { + "name": "get_order", + "description": "Get the details of a specific order.", + "parameters": { + "type": "object", + "properties": {"order_id": {"type": "string", "description": "The order ID to get the details for."}}, + }, + }, + { + "name": "get_tracking", + "description": "Get tracking information for an order.", + "parameters": { + "type": "object", + "properties": {"order_id": {"type": "string", "description": "The order ID to get tracking for."}}, + }, + }, + ] + + tool_definition = { + "name": "get_order", + "description": "Get the details of a specific order.", + "parameters": { + "type": "object", + "properties": {"order_id": {"type": "string", "description": "The order ID to get the details for."}}, + }, + } + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + # Example 1: Success case - simple string query and response + SourceFileContentContent( + item= { + "query": success_query, + "response": success_response + } + ), + # Example 2: Failure case - simple string query and response + SourceFileContentContent( + item= { + "query": failure_query, + "response": failure_response + } + ), + # Example 3: Complex conversation with tool calls and tool definitions + SourceFileContentContent( + item= { + "query": complex_query, + "response": complex_response, + "tool_definitions": tool_definitions + } + ), + # Example 4: Complex conversation without tool definitions + SourceFileContentContent( + item= { + "query": 
complex_query, + "response": complex_response + } + ), + # Example 5: Complex conversation with single tool definition + SourceFileContentContent( + item= { + "query": complex_query, + "response": complex_response, + "tool_definitions": tool_definition + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_relevance.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_relevance.py new file mode 100644 index 000000000000..ff7b7788b1c9 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_relevance.py @@ -0,0 +1,188 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Relevance evaluator using inline dataset content. + +USAGE: + python sample_relevance.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. 
+""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "type": "string" + }, + "response": { + "type": "string" + } + }, + "required": ["query", "response"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "relevance", + "evaluator_name": "builtin.relevance", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Relevance Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Success example - relevant response + success_query = "What is the capital of Japan?" + success_response = "The capital of Japan is Tokyo." + + # Failure example - irrelevant response + failure_query = "What is the capital of Japan?" + failure_response = "Japan is known for its beautiful cherry blossoms and advanced technology. The country has a rich cultural heritage and is famous for sushi and anime." 
+ + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + # Success example - relevant response + SourceFileContentContent( + item= { + "query": success_query, + "response": success_response + } + ), + # Failure example - irrelevant response + SourceFileContentContent( + item= { + "query": failure_query, + "response": failure_response + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_response_completeness.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_response_completeness.py new file mode 100644 index 000000000000..ed3671b08682 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_response_completeness.py @@ -0,0 +1,188 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Response Completeness evaluator using inline dataset content. + +USAGE: + python sample_response_completeness.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. 
The name of the model deployment to use for evaluation. +""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "ground_truth": { + "type": "string" + }, + "response": { + "type": "string" + } + }, + "required": ["ground_truth", "response"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "response_completeness", + "evaluator_name": "builtin.response_completeness", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "ground_truth": "{{item.ground_truth}}", + "response": "{{item.response}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Response Completeness Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Complete response example + complete_response = "Itinery: Day 1 check out the downtown district of the city on train; for Day 2, we can rest in hotel." + complete_ground_truth = "Itinery: Day 1 take a train to visit the downtown area for city sightseeing; Day 2 rests in hotel." + + # Incomplete response example + incomplete_response = "The order with ID 124 is delayed and should now arrive by March 20, 2025." + incomplete_ground_truth = "The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025." 
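+            # The response_completeness evaluator only compares the response against the
+            # ground_truth, which is why both fields are marked as required in the item
+            # schema and data_mapping above.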
+ + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + # Complete response example + SourceFileContentContent( + item= { + "ground_truth": complete_ground_truth, + "response": complete_response + } + ), + # Incomplete response example + SourceFileContentContent( + item= { + "ground_truth": incomplete_ground_truth, + "response": incomplete_response + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_task_adherence.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_task_adherence.py new file mode 100644 index 000000000000..6408fb4c36b6 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_task_adherence.py @@ -0,0 +1,309 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Task Adherence evaluator using inline dataset content. + +USAGE: + python sample_task_adherence.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. 
+""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "task_adherence", + "evaluator_name": "builtin.task_adherence", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Task Adherence Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Failure example - vague adherence to the task + failure_query = "What are the best practices for maintaining a healthy rose garden during the summer?" + failure_response = "Make sure to water your roses regularly and trim them occasionally." + + # Success example - full adherence to the task + success_query = "What are the best practices for maintaining a healthy rose garden during the summer?" + success_response = "For optimal summer care of your rose garden, start by watering deeply early in the morning to ensure the roots are well-hydrated without encouraging fungal growth. Apply a 2-3 inch layer of organic mulch around the base of the plants to conserve moisture and regulate soil temperature. Fertilize with a balanced rose fertilizer every 4–6 weeks to support healthy growth. Prune away any dead or diseased wood to promote good air circulation, and inspect regularly for pests such as aphids or spider mites, treating them promptly with an appropriate organic insecticidal soap. Finally, ensure that your roses receive at least 6 hours of direct sunlight daily for robust flowering." + + # Complex conversation example with tool calls + complex_query = [ + { + "role": "system", + "content": "You are an expert in literature and can provide book recommendations." 
+ + }, + { + "createdAt": "2025-03-14T08:00:00Z", + "role": "user", + "content": [ + { + "type": "text", + "text": "I love historical fiction. Can you recommend a good book from that genre?" + } + ] + } + ] + + complex_response = [ + { + "createdAt": "2025-03-14T08:00:05Z", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "Let me fetch a recommendation for historical fiction." + } + ] + }, + { + "createdAt": "2025-03-14T08:00:10Z", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "tool_call_20250314_001", + "name": "get_book", + "arguments": { + "genre": "historical fiction" + } + } + ] + }, + { + "createdAt": "2025-03-14T08:00:15Z", + "role": "tool", + "tool_call_id": "tool_call_20250314_001", + "content": [ + { + "type": "tool_result", + "tool_result": "{ \"book\": { \"title\": \"The Pillars of the Earth\", \"author\": \"Ken Follett\", \"summary\": \"A captivating tale set in medieval England that weaves historical events with personal drama.\" } }" + } + ] + }, + { + "createdAt": "2025-03-14T08:00:20Z", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "Based on our records, I recommend 'The Pillars of the Earth' by Ken Follett. This novel is an excellent example of historical fiction with a rich narrative and well-developed characters. Would you like more details or another suggestion?" + } + ] + } + ] + + complex_tool_definitions = [ + { + "name": "get_book", + "description": "Retrieve a book recommendation for a specified genre.", + "parameters": { + "type": "object", + "properties": { + "genre": { + "type": "string", + "description": "The genre for which a book recommendation is requested." + } + } + } + } + ] + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + # Failure example - vague adherence + SourceFileContentContent( + item= { + "query": failure_query, + "response": failure_response, + "tool_definitions": None + } + ), + # Success example - full adherence + SourceFileContentContent( + item= { + "query": success_query, + "response": success_response, + "tool_definitions": None + } + ), + # Complex conversation example with tool calls + SourceFileContentContent( + item= { + "query": complex_query, + "response": complex_response, + "tool_definitions": complex_tool_definitions + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + 
        return [_to_json_primitive(i) for i in obj]
+    if isinstance(obj, dict):
+        return {k: _to_json_primitive(v) for k, v in obj.items()}
+    for method in ("to_dict", "as_dict", "dict", "serialize"):
+        if hasattr(obj, method):
+            try:
+                return _to_json_primitive(getattr(obj, method)())
+            except Exception:
+                pass
+    if hasattr(obj, "__dict__"):
+        return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")})
+    return str(obj)
+
+def pprint(obj) -> None:
+    print(json.dumps(_to_json_primitive(obj), indent=2))
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_task_completion.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_task_completion.py
new file mode 100644
index 000000000000..b95948b10a33
--- /dev/null
+++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_task_completion.py
@@ -0,0 +1,348 @@
+# pylint: disable=line-too-long,useless-suppression
+# ------------------------------------
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+# ------------------------------------
+
+"""
+DESCRIPTION:
+    Given an AIProjectClient, this sample demonstrates how to use the synchronous
+    `openai.evals.*` methods to create, get, and list eval groups and eval runs
+    for the Task Completion evaluator using inline dataset content.
+
+USAGE:
+    python sample_task_completion.py
+
+    Before running the sample:
+
+    pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv
+
+    Set these environment variables with your own values:
+    1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your
+       Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/.
+    2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation.
+""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "task_completion", + "evaluator_name": "builtin.task_completion", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Task Completion Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Success example - task completed successfully + success_query = "Book a flight from New York to Los Angeles for next Friday" + success_response = "I've successfully booked your flight from New York (JFK) to Los Angeles (LAX) for Friday, March 29th. Your confirmation number is ABC123. The flight departs at 2:30 PM EST and arrives at 5:45 PM PST." + + # Failure example - task not completed + failure_query = "Cancel my subscription and refund my payment" + failure_response = "I understand you want to cancel your subscription. Here are some helpful articles about our cancellation policy and refund terms that you might find useful." 
+ + # Complex example - conversation format with task completion + complex_query = [ + { + "createdAt": "2025-03-26T17:27:35Z", + "run_id": "run_TaskCompletion123", + "role": "user", + "content": [ + { + "type": "text", + "text": "I need to transfer $500 from my checking account to my savings account" + } + ] + } + ] + complex_response = [ + { + "createdAt": "2025-03-26T17:27:40Z", + "run_id": "run_TaskCompletion123", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_TransferMoney456", + "name": "transfer_money", + "arguments": { + "from_account": "checking", + "to_account": "savings", + "amount": 500 + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:42Z", + "run_id": "run_TaskCompletion123", + "tool_call_id": "call_TransferMoney456", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "status": "success", + "transaction_id": "TXN789", + "new_checking_balance": 2500.00, + "new_savings_balance": 8500.00 + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:45Z", + "run_id": "run_TaskCompletion123", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "I've successfully transferred $500 from your checking account to your savings account. Transaction ID: TXN789. Your new checking balance is $2,500.00 and your savings balance is $8,500.00." + } + ] + } + ] + + complex_tool_definitions = [ + { + "name": "transfer_money", + "description": "Transfers money between user accounts.", + "parameters": { + "type": "object", + "properties": { + "from_account": { + "type": "string", + "description": "The source account type (checking, savings, etc.)" + }, + "to_account": { + "type": "string", + "description": "The destination account type (checking, savings, etc.)" + }, + "amount": { + "type": "number", + "description": "The amount to transfer" + } + } + } + } + ] + + # Another complex example - conversation format with query but no tool calls + query_conversation_query = [ + { + "createdAt": "2025-03-26T17:30:00Z", + "run_id": "run_SimpleTask789", + "role": "user", + "content": [ + { + "type": "text", + "text": "Please calculate 15% tip on a $80 dinner bill" + } + ] + } + ] + query_conversation_response = [ + { + "createdAt": "2025-03-26T17:30:05Z", + "run_id": "run_SimpleTask789", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The 15% tip on an $80 dinner bill is $12.00. Your total bill including tip would be $92.00." 
+ } + ] + } + ] + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + # Success example - task completed + SourceFileContentContent( + item= { + "query": success_query, + "response": success_response, + "tool_definitions": None + } + ), + # Failure example - task not completed + SourceFileContentContent( + item= { + "query": failure_query, + "response": failure_response, + "tool_definitions": None + } + ), + # Complex example - conversation format with tool usage + SourceFileContentContent( + item= { + "query": complex_query, + "response": complex_response, + "tool_definitions": complex_tool_definitions + } + ), + # Another complex example - conversation format without tool calls + SourceFileContentContent( + item= { + "query": query_conversation_query, + "response": query_conversation_response, + "tool_definitions": None + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_task_navigation_efficiency.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_task_navigation_efficiency.py new file mode 100644 index 000000000000..dfc850e47cbc --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_task_navigation_efficiency.py @@ -0,0 +1,215 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Task Navigation Efficiency evaluator using inline dataset content. + +USAGE: + python sample_task_navigation_efficiency.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. +""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ.get("AZURE_AI_PROJECT_ENDPOINT", "") # Sample : https://.services.ai.azure.com/api/projects/ + + with DefaultAzureCredential() as credential: + + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "response": { + "type": "array" + }, + "ground_truth": { + "type": "array" + } + }, + "required": ["response", "ground_truth"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "task_navigation_efficiency", + "evaluator_name": "builtin.task_navigation_efficiency", + "initialization_parameters": { + "matching_mode": "exact_match" # Can be "exact_match", "in_order_match", or "any_order_match" + }, + "data_mapping": { + "response": "{{item.response}}", + "ground_truth": "{{item.ground_truth}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Task Navigation Efficiency Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # simple inline data with response and ground truth without parameters + simple_response=[ + {"role": "assistant", "content": [ + {"type": "tool_call", "tool_call_id": "call_1", "name": "identify_tools_to_call", + "arguments": {}} + ]}, + {"role": "assistant", "content": [ + {"type": "tool_call", "tool_call_id": "call_2", "name": "call_tool_A", "arguments": {}} + ]}, + {"role": "assistant", "content": [ + {"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B", "arguments": {}} + ]}, + {"role": "assistant", "content": [ + {"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis", "arguments": {}} + ]} + ] + + simple_ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"] + + # Another example with parameters in tool calls + response=[ + {"role": "assistant", "content": [ + {"type": "tool_call", "tool_call_id": "call_1", 
"name": "search", + "arguments": {"query": "weather", "location": "NYC"}} + ]}, + {"role": "assistant", "content": [ + {"type": "tool_call", "tool_call_id": "call_2", "name": "format_result", + "arguments": {"format": "json"}} + ]} + ] + + ground_truth=( + ["search", "format_result"], + { + "search": {"query": "weather", "location": "NYC"}, + "format_result": {"format": "json"} + } + ) + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + SourceFileContentContent( + item= { + "response": simple_response, + "ground_truth": simple_ground_truth + } + ), + SourceFileContentContent( + item= { + "response": response, + "ground_truth": ground_truth + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_call_accuracy.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_call_accuracy.py new file mode 100644 index 000000000000..03a64123b90a --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_call_accuracy.py @@ -0,0 +1,409 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Tool Call Accuracy evaluator using inline dataset content. 
+ +USAGE: + python sample_tool_call_accuracy.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. +""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_calls": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "tool_definitions"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "tool_call_accuracy", + "evaluator_name": "builtin.tool_call_accuracy", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "query": "{{item.query}}", + "tool_definitions": "{{item.tool_definitions}}", + "tool_calls": "{{item.tool_calls}}", + "response": "{{item.response}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Tool Call Accuracy Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Example 1: Simple tool call evaluation + query1 = "What's the weather like in New York?" 
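+            # Example 1 uses the single-object forms allowed by the anyOf entries in
+            # item_schema: tool_definitions1 is one tool definition and tool_calls1 is one
+            # tool_call object (not arrays), and "response" is left as None in the run item
+            # below. Roughly, the inline record looks like (illustrative sketch):
+            #   {"item": {"query": "...", "tool_definitions": {...}, "tool_calls": {...}, "response": null}}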
+ tool_definitions1 = { + "name": "get_weather", + "description": "Get weather information for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + } + } + } + + tool_calls1 = { + "type": "tool_call", + "tool_call_id": "call_1", + "name": "get_weather", + "arguments": {"location": "New York"} + } + + # Example 2: Multiple tool calls + query2 = "Search for customer orders and send an email update" + tool_definitions2 = [ + { + "id": "search_database_tool", + "name": "search_database", + "description": "Search database for information", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string"}, + "table": {"type": "string"} + } + } + }, + { + "id": "send_email_tool", + "name": "send_email", + "description": "Send an email", + "parameters": { + "type": "object", + "properties": { + "to": {"type": "string"}, + "subject": {"type": "string"} + } + } + } + ] + tool_calls2 = [ + { + "type": "tool_call", + "tool_call_id": "call_1", + "name": "search_database", + "arguments": {"query": "customer orders", "table": "orders"} + }, + { + "type": "tool_call", + "tool_call_id": "call_2", + "name": "send_email", + "arguments": {"to": "customer@example.com", "subject": "Order Update"} + } + ] + + # Example 3: Conversation format + query3 = "Can you send me an email with weather information for Seattle?" + response3 = [ + { + "createdAt": "2025-03-26T17:27:35Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_CUdbkBfvVBla2YP3p24uhElJ", + "name": "fetch_weather", + "arguments": { + "location": "Seattle" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:37Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "tool_call_id": "call_CUdbkBfvVBla2YP3p24uhElJ", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "weather": "Rainy, 14\u00b0C" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:38Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_iq9RuPxqzykebvACgX8pqRW2", + "name": "send_email", + "arguments": { + "recipient": "your_email@example.com", + "subject": "Weather Information for Seattle", + "body": "The current weather in Seattle is rainy with a temperature of 14\u00b0C." + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:41Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "tool_call_id": "call_iq9RuPxqzykebvACgX8pqRW2", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "message": "Email successfully sent to your_email@example.com." + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:42Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "I have successfully sent you an email with the weather information for Seattle. The current weather is rainy with a temperature of 14\u00b0C." + } + ] + } + ] + + tool_definitions3 = [ + { + "name": "fetch_weather", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for." 
+ } + } + } + }, + { + "name": "send_email", + "description": "Sends an email with the specified subject and body to the recipient.", + "parameters": { + "type": "object", + "properties": { + "recipient": { + "type": "string", + "description": "Email address of the recipient." + }, + "subject": { + "type": "string", + "description": "Subject of the email." + }, + "body": { + "type": "string", + "description": "Body content of the email." + } + } + } + } + ] + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + # Example 1: Simple tool call evaluation + SourceFileContentContent( + item= { + "query": query1, + "tool_definitions": tool_definitions1, + "tool_calls": tool_calls1, + "response": None + } + ), + # Example 2: Multiple tool calls + SourceFileContentContent( + item= { + "query": query2, + "tool_definitions": tool_definitions2, + "tool_calls": tool_calls2, + "response": None + } + ), + # Example 3: Conversation format with object types + SourceFileContentContent( + item= { + "query": query3, + "tool_definitions": tool_definitions3, + "response": response3, + "tool_calls": None + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_input_accuracy.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_input_accuracy.py new file mode 100644 index 000000000000..e9dee15ce1d1 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_input_accuracy.py @@ -0,0 +1,410 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Tool Input Accuracy evaluator using inline dataset content. + +USAGE: + python sample_tool_input_accuracy.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. +""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response", "tool_definitions"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "tool_input_accuracy", + "evaluator_name": "builtin.tool_input_accuracy", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Tool Input Accuracy Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Success example - accurate tool inputs (string query, complex response) + success_query = "Get the weather for Boston" + success_response = [ + { + "createdAt": "2025-03-26T17:27:35Z", + "run_id": "run_ToolInputAccuracy123", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_WeatherBoston456", + "name": "get_weather", + "arguments": { + "location": "Boston" + } + } + ] + }, + { + "createdAt": 
"2025-03-26T17:27:37Z", + "run_id": "run_ToolInputAccuracy123", + "tool_call_id": "call_WeatherBoston456", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "weather": "Sunny, 22°C" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:39Z", + "run_id": "run_ToolInputAccuracy123", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The current weather in Boston is sunny with a temperature of 22°C." + } + ] + } + ] + success_tool_definitions = [ + { + "name": "get_weather", + "description": "Get weather information for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + } + } + } + ] + + # Failure example - inaccurate tool inputs (string query, complex response) + failure_query = "Send an email to john@example.com with the meeting details" + failure_response = [ + { + "createdAt": "2025-03-26T17:28:10Z", + "run_id": "run_ToolInputFail789", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_EmailFail101", + "name": "send_email", + "arguments": { + "recipient": "john@example.com" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:28:12Z", + "run_id": "run_ToolInputFail789", + "tool_call_id": "call_EmailFail101", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "error": "Missing required fields: subject and body" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:28:14Z", + "run_id": "run_ToolInputFail789", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "I encountered an error sending the email. Please provide the subject and message content." + } + ] + } + ] + failure_tool_definitions = [ + { + "name": "send_email", + "description": "Send an email to specified recipient", + "parameters": { + "type": "object", + "properties": { + "recipient": {"type": "string", "description": "Recipient email address"}, + "subject": {"type": "string", "description": "Email subject line"}, + "body": {"type": "string", "description": "Email message body"} + } + } + } + ] + + # Complex example - accurate tool inputs (complex query, complex response) + complex_query = [ + { + "createdAt": "2025-03-26T17:29:00Z", + "run_id": "run_ComplexToolInput321", + "role": "user", + "content": [ + { + "type": "text", + "text": "Book a meeting room for Friday from 2 PM to 4 PM for the project review" + } + ] + } + ] + complex_response = [ + { + "createdAt": "2025-03-26T17:29:05Z", + "run_id": "run_ComplexToolInput321", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_BookRoom654", + "name": "book_meeting_room", + "arguments": { + "date": "2025-03-29", + "start_time": "14:00", + "end_time": "16:00", + "purpose": "project review" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:29:07Z", + "run_id": "run_ComplexToolInput321", + "tool_call_id": "call_BookRoom654", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "room_id": "Conference Room B", + "confirmation": "Room booked successfully" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:29:09Z", + "run_id": "run_ComplexToolInput321", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "I've successfully booked Conference Room B for Friday, March 29th from 2:00 PM to 4:00 PM for your project review." 
+ } + ] + } + ] + complex_tool_definitions = [ + { + "name": "book_meeting_room", + "description": "Book a meeting room for specified date and time", + "parameters": { + "type": "object", + "properties": { + "date": {"type": "string", "description": "Date in YYYY-MM-DD format"}, + "start_time": {"type": "string", "description": "Start time in HH:MM format"}, + "end_time": {"type": "string", "description": "End time in HH:MM format"}, + "purpose": {"type": "string", "description": "Meeting purpose"} + } + } + } + ] + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + # Success example - accurate tool inputs + SourceFileContentContent( + item= { + "query": success_query, + "response": success_response, + "tool_definitions": success_tool_definitions + } + ), + # Failure example - inaccurate tool inputs + SourceFileContentContent( + item= { + "query": failure_query, + "response": failure_response, + "tool_definitions": failure_tool_definitions + } + ), + # Complex example - conversation format with accurate tool inputs + SourceFileContentContent( + item= { + "query": complex_query, + "response": complex_response, + "tool_definitions": complex_tool_definitions + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_output_utilization.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_output_utilization.py new file mode 100644 index 000000000000..00dea2337cc5 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_output_utilization.py @@ -0,0 +1,353 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) 
Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Tool Output Utilization evaluator using inline dataset content. + +USAGE: + python sample_tool_output_utilization.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. +""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "tool_output_utilization", + "evaluator_name": "builtin.tool_output_utilization", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Tool Output Utilization Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Example 1: Good utilization - uses tool output effectively + query1 = [ + { + "createdAt": "2025-03-26T17:27:30Z", + "run_id": "run_ToolOutput123", + "role": "user", + "content": [ + { + "type": "text", + "text": "What's the weather like in Paris and should I bring an umbrella?" 
+ } + ] + } + ] + response1 = [ + { + "createdAt": "2025-03-26T17:27:35Z", + "run_id": "run_ToolOutput123", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_WeatherParis456", + "name": "get_weather", + "arguments": { + "location": "Paris" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:37Z", + "run_id": "run_ToolOutput123", + "tool_call_id": "call_WeatherParis456", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "weather": "Rainy, 16°C", + "humidity": "85%", + "precipitation": "Heavy rain expected" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:39Z", + "run_id": "run_ToolOutput123", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The weather in Paris is currently rainy with a temperature of 16°C and high humidity at 85%. Heavy rain is expected, so yes, you should definitely bring an umbrella!" + } + ] + } + ] + tool_definitions1 = [ + { + "name": "get_weather", + "description": "Get detailed weather information for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city name"} + } + } + } + ] + + # Example 2: Poor utilization - ignores tool output + query2 = [ + { + "createdAt": "2025-03-26T17:28:00Z", + "run_id": "run_ToolOutputPoor789", + "role": "user", + "content": [ + { + "type": "text", + "text": "What's the current stock price of AAPL and should I buy?" + } + ] + } + ] + response2 = [ + { + "createdAt": "2025-03-26T17:28:05Z", + "run_id": "run_ToolOutputPoor789", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_StockPrice101", + "name": "get_stock_price", + "arguments": { + "symbol": "AAPL" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:28:07Z", + "run_id": "run_ToolOutputPoor789", + "tool_call_id": "call_StockPrice101", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "price": "$190.50", + "change": "+2.5%", + "volume": "50M", + "market_cap": "$2.9T" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:28:09Z", + "run_id": "run_ToolOutputPoor789", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "I can't provide investment advice. Please consult with a financial advisor for investment decisions." 
+ } + ] + } + ] + tool_definitions2 = [ + { + "name": "get_stock_price", + "description": "Get current stock price and market data", + "parameters": { + "type": "object", + "properties": { + "symbol": {"type": "string", "description": "Stock symbol (e.g., AAPL)"} + } + } + } + ] + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + # Example 1: Good tool output utilization + SourceFileContentContent( + item= { + "query": query1, + "response": response1, + "tool_definitions": tool_definitions1 + } + ), + # Example 2: Poor tool output utilization + SourceFileContentContent( + item= { + "query": query2, + "response": response2, + "tool_definitions": tool_definitions2 + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_selection.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_selection.py new file mode 100644 index 000000000000..c514360b5125 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_selection.py @@ -0,0 +1,327 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Tool Selection evaluator using inline dataset content. 
+ +USAGE: + python sample_tool_selection.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. +""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_calls": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["query", "response", "tool_definitions"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "tool_selection", + "evaluator_name": "builtin.tool_selection", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_calls": "{{item.tool_calls}}", + "tool_definitions": "{{item.tool_definitions}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Tool Selection Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Example: Conversation format + query = "Can you send me an email with weather information for Seattle?" 
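+            # In this example "query" is a plain user question, while "response" (defined
+            # next) is a full assistant/tool conversation whose content parts carry the
+            # tool_call and tool_result details; "tool_calls" is passed as None in the run
+            # item below, so the tool calls being evaluated appear only inside the response
+            # messages.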
+ response = [ + { + "createdAt": "2025-03-26T17:27:35Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_CUdbkBfvVBla2YP3p24uhElJ", + "name": "fetch_weather", + "arguments": { + "location": "Seattle" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:37Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "tool_call_id": "call_CUdbkBfvVBla2YP3p24uhElJ", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "weather": "Rainy, 14\u00b0C" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:38Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_iq9RuPxqzykebvACgX8pqRW2", + "name": "send_email", + "arguments": { + "recipient": "your_email@example.com", + "subject": "Weather Information for Seattle", + "body": "The current weather in Seattle is rainy with a temperature of 14\u00b0C." + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:41Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "tool_call_id": "call_iq9RuPxqzykebvACgX8pqRW2", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "message": "Email successfully sent to your_email@example.com." + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:42Z", + "run_id": "run_zblZyGCNyx6aOYTadmaqM4QN", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "I have successfully sent you an email with the weather information for Seattle. The current weather is rainy with a temperature of 14\u00b0C." + } + ] + } + ] + + tool_definitions = [ + { + "name": "fetch_weather", + "description": "Fetches the weather information for the specified location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to fetch weather for." + } + } + } + }, + { + "name": "send_email", + "description": "Sends an email with the specified subject and body to the recipient.", + "parameters": { + "type": "object", + "properties": { + "recipient": { + "type": "string", + "description": "Email address of the recipient." + }, + "subject": { + "type": "string", + "description": "Subject of the email." + }, + "body": { + "type": "string", + "description": "Body content of the email." 
+ } + } + } + } + ] + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + SourceFileContentContent( + item= { + "query": query, + "response": response, + "tool_calls": None, + "tool_definitions": tool_definitions + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(str) -> None: + print(json.dumps(_to_json_primitive(str), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_success.py b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_success.py new file mode 100644 index 000000000000..51f019012471 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluation/sample_agentic_evaluators/sample_tool_success.py @@ -0,0 +1,317 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list eval group and and eval runs + for Tool Success evaluator using inline dataset content. + +USAGE: + python sample_tool_success.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b1" azure-identity python-dotenv + + Set these environment variables with your own values: + 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Azure AI Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. + 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. 
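+
+    These can also be placed in a .env file next to the sample (picked up by the load_dotenv()
+    call at startup). The values below are placeholders only:
+
+        AZURE_AI_PROJECT_ENDPOINT=https://<account-name>.services.ai.azure.com/api/projects/<project-name>
+        AZURE_AI_MODEL_DEPLOYMENT_NAME=<model-deployment-name>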
+""" + +from dotenv import load_dotenv +import os +import json +import time + +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent +) + + +load_dotenv() + + +def main() -> None: + endpoint = os.environ[ + "AZURE_AI_PROJECT_ENDPOINT" + ] # Sample : https://.services.ai.azure.com/api/projects/ + model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") # Sample : gpt-4o-mini + + with DefaultAzureCredential() as credential: + with AIProjectClient(endpoint=endpoint, credential=credential, api_version="2025-11-15-preview") as project_client: + print("Creating an OpenAI client from the AI Project client") + + client = project_client.get_openai_client() + client._custom_query = {"api-version": "2025-11-15-preview"} + + data_source_config = { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "tool_definitions": { + "anyOf": [ + {"type": "object"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + }, + "response": { + "anyOf": [ + {"type": "string"}, + { + "type": "array", + "items": { + "type": "object" + } + } + ] + } + }, + "required": ["response"] + }, + "include_sample_schema": True + } + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "tool_success", + "evaluator_name": "builtin.tool_success", + "initialization_parameters": { + "deployment_name": f"{model_deployment_name}" + }, + "data_mapping": { + "tool_definitions": "{{item.tool_definitions}}", + "response": "{{item.response}}" + } + } + ] + + print("Creating Eval Group") + eval_object = client.evals.create( + name="Test Tool Success Evaluator with inline data", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Eval Group created") + + print("Get Eval Group by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Eval Run Response:") + pprint(eval_object_response) + + # Example 1: Successful tool execution + response1 = [ + { + "createdAt": "2025-03-26T17:27:35Z", + "run_id": "run_ToolSuccess123", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_FileUpload456", + "name": "upload_file", + "arguments": { + "file_path": "/documents/report.pdf", + "destination": "cloud_storage" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:37Z", + "run_id": "run_ToolSuccess123", + "tool_call_id": "call_FileUpload456", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "status": "success", + "file_id": "file_12345", + "upload_url": "https://storage.example.com/file_12345", + "message": "File uploaded successfully" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:27:39Z", + "run_id": "run_ToolSuccess123", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "I've successfully uploaded your report.pdf to cloud storage. The file ID is file_12345 and it's available at the provided URL." 
+ } + ] + } + ] + tool_definitions1 = [ + { + "name": "upload_file", + "description": "Upload a file to cloud storage", + "parameters": { + "type": "object", + "properties": { + "file_path": {"type": "string", "description": "Path to the file to upload"}, + "destination": {"type": "string", "description": "Destination storage location"} + } + } + } + ] + + # Example 2: Failed tool execution + response2 = [ + { + "createdAt": "2025-03-26T17:28:10Z", + "run_id": "run_ToolFail789", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_DatabaseQuery101", + "name": "query_database", + "arguments": { + "table": "users", + "query": "SELECT * FROM users WHERE age > 25" + } + } + ] + }, + { + "createdAt": "2025-03-26T17:28:12Z", + "run_id": "run_ToolFail789", + "tool_call_id": "call_DatabaseQuery101", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "status": "error", + "error_code": "DB_CONNECTION_FAILED", + "message": "Unable to connect to database. Connection timeout after 30 seconds." + } + } + ] + }, + { + "createdAt": "2025-03-26T17:28:14Z", + "run_id": "run_ToolFail789", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "I encountered an error while trying to query the database. The connection timed out after 30 seconds. Please try again later or contact your database administrator." + } + ] + } + ] + tool_definitions2 = [ + { + "name": "query_database", + "description": "Execute SQL queries on the database", + "parameters": { + "type": "object", + "properties": { + "table": {"type": "string", "description": "Database table name"}, + "query": {"type": "string", "description": "SQL query to execute"} + } + } + } + ] + + print("Creating Eval Run with Inline Data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={ + "team": "eval-exp", + "scenario": "inline-data-v1" + }, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content= [ + # Example 1: Successful tool execution + SourceFileContentContent( + item= { + "tool_definitions": tool_definitions1, + "response": response1 + } + ), + # Example 2: Failed tool execution + SourceFileContentContent( + item= { + "tool_definitions": tool_definitions2, + "response": response2 + } + ) + ] + ) + ) + ) + + print(f"Eval Run created") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + print("\n\n----Eval Run Output Items----\n\n") + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "completed" or run.status == "failed": + output_items = list(client.evals.runs.output_items.list( + run_id=run.id, eval_id=eval_object.id + )) + pprint(output_items) + print(f"Eval Run Status: {run.status}") + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + + # [END evaluations_sample] + + +def _to_json_primitive(obj): + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (list, tuple)): + return [_to_json_primitive(i) for i in obj] + if isinstance(obj, dict): + return {k: _to_json_primitive(v) for k, v in obj.items()} + for method in ("to_dict", "as_dict", "dict", "serialize"): + if hasattr(obj, method): + try: + 
return _to_json_primitive(getattr(obj, method)()) + except Exception: + pass + if hasattr(obj, "__dict__"): + return _to_json_primitive({k: v for k, v in vars(obj).items() if not k.startswith("_")}) + return str(obj) + +def pprint(obj) -> None: + print(json.dumps(_to_json_primitive(obj), indent=2)) + +if __name__ == "__main__": + main() \ No newline at end of file