Skip to content
Draft
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ The framework automatically generates HTML reports in the `reports/` directory a

The evaluation framework runs automatically in CI/CD pipelines when pull requests modify the azsdk CLI. Triggering on changes to `.github/copilot-instructions.md` or to any instruction file in `eng/common/instructions/` is planned, but that work is still in progress. This ensures instruction or MCP changes don't negatively impact agent behavior before merging. The evaluations run alongside other PR validation tests and must pass for the PR to be merged.

**Pipeline**: [release pipeline](https://dev.azure.com/azure-sdk/internal/_build?definitionId=7684) - Configuration in `eng/common/pipelines/copilot-instruction-evals.yml`
**Pipeline**: [release pipeline](https://dev.azure.com/azure-sdk/internal/_build?definitionId=7684) - Configuration in `eng/common/pipelines/ai-evals-tests.yml`

## Walkthrough: Release Plan Creation Evaluation

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
using Azure.Sdk.Tools.Cli.Evaluations.Evaluators;
using Azure.Sdk.Tools.Cli.Evaluations.Helpers;
using Azure.Sdk.Tools.Cli.Evaluations.Models;
using Microsoft.Extensions.AI.Evaluation;
using NUnit.Framework;

namespace Azure.Sdk.Tools.Cli.Evaluations.Scenarios
{
public partial class Scenario
{
    /// <summary>
    /// Feeds an end-to-end ".NET SDK generation" prompt to
    /// <c>EvaluationHelper.RunToolInputScenarioAsync</c> and hands the outcome to
    /// <c>EvaluationHelper.ValidateToolInputsEvaluator</c>.
    /// </summary>
    /// <remarks>
    /// The expected tool list names four tools: setup verification, TypeSpec validation,
    /// code generation and code build.
    /// NOTE(review): the input-check flag is passed as <c>false</c>; presumably this limits the
    /// evaluator to tool names rather than tool arguments — confirm against
    /// <c>ExpectedToolInputEvaluatorContext</c>.
    /// </remarks>
    [Test]
    [Category(RepositoryCategories.AzureRestApiSpecs)]
    public async Task Evaluate_GenerateSdk()
    {
        const bool verifyToolInputs = false;
        const string userPrompt = "Do every step necessary to generate my SDK for Dotnet, up until testing. Proceed and don't ask me questions. Stop before running tests on the SDK. I'm in a public repo. My tspconfig is at: \"C:\\azure-rest-api-specs\\specification\\healthdataaiservices\\HealthDataAIServices.DeidServices\\tspconfig.yaml\", and the repo: \"C:\\azure-sdk-for-net\"";

        var requiredTools = new[]
        {
            "azsdk_verify_setup",
            "azsdk_run_typespec_validation",
            "azsdk_package_generate_code",
            "azsdk_package_build_code",
        };

        // Turn the prompt and the expected tool list into scenario data.
        var scenario = ChatMessageHelper.LoadScenarioFromPrompt(userPrompt, requiredTools);

        // The evaluator context is assembled here and passed to the helper alongside the evaluator.
        EvaluationContext[] evaluatorContexts =
        [
            new ExpectedToolInputEvaluatorContext(scenario.ExpectedOutcome, s_toolNames!, verifyToolInputs),
        ];

        var evaluation = await EvaluationHelper.RunToolInputScenarioAsync(
            scenarioName: ScenarioName,
            scenarioData: scenario,
            chatCompletion: s_chatCompletion!,
            chatConfig: s_chatConfig!,
            executionName: s_executionName,
            reportingPath: ReportingPath,
            toolNames: s_toolNames!,
            evaluators: [new ExpectedToolInputEvaluator()],
            enableResponseCaching: true,
            additionalContexts: evaluatorContexts);

        EvaluationHelper.ValidateToolInputsEvaluator(evaluation);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
using Azure.Sdk.Tools.Cli.Evaluations.Evaluators;
using Azure.Sdk.Tools.Cli.Evaluations.Helpers;
using Azure.Sdk.Tools.Cli.Evaluations.Models;
using Microsoft.Extensions.AI.Evaluation;
using NUnit.Framework;

namespace Azure.Sdk.Tools.Cli.Evaluations.Scenarios
{
public partial class Scenario
{
    /// <summary>
    /// Feeds a "verify my setup" prompt for .NET to
    /// <c>EvaluationHelper.RunToolInputScenarioAsync</c> and hands the outcome to
    /// <c>EvaluationHelper.ValidateToolInputsEvaluator</c>.
    /// </summary>
    /// <remarks>
    /// The only expected tool is <c>azsdk_verify_setup</c>.
    /// NOTE(review): the input-check flag is passed as <c>false</c>; presumably this limits the
    /// evaluator to tool names rather than tool arguments — confirm against
    /// <c>ExpectedToolInputEvaluatorContext</c>.
    /// </remarks>
    [Test]
    public async Task Evaluate_VerifySetup()
    {
        const bool verifyToolInputs = false;
        const string userPrompt = "Verify my setup for Dotnet.";

        var requiredTools = new[] { "azsdk_verify_setup" };

        // Turn the prompt and the expected tool list into scenario data.
        var scenario = ChatMessageHelper.LoadScenarioFromPrompt(userPrompt, requiredTools);

        // The evaluator context is assembled here and passed to the helper alongside the evaluator.
        EvaluationContext[] evaluatorContexts =
        [
            new ExpectedToolInputEvaluatorContext(scenario.ExpectedOutcome, s_toolNames!, verifyToolInputs),
        ];

        var evaluation = await EvaluationHelper.RunToolInputScenarioAsync(
            scenarioName: ScenarioName,
            scenarioData: scenario,
            chatCompletion: s_chatCompletion!,
            chatConfig: s_chatConfig!,
            executionName: s_executionName,
            reportingPath: ReportingPath,
            toolNames: s_toolNames!,
            evaluators: [new ExpectedToolInputEvaluator()],
            enableResponseCaching: true,
            additionalContexts: evaluatorContexts);

        EvaluationHelper.ValidateToolInputsEvaluator(evaluation);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
using Microsoft.Extensions.AI;

namespace Azure.Sdk.Tools.Cli.Evaluations.ToolMocks
{
/// <summary>
/// Canned call/response pair for the <c>azsdk_package_build_code</c> tool.
/// </summary>
public class PackageBuildCode : IToolMock
{
    // Fixed JSON payload returned as the tool's result.
    private const string BuildResultJson = """{"message":"Build completed successfully.","result":"succeeded","language":"DotNet","package_name":"Azure.Health.Deidentification","version":"1.1.0-beta.2","package_type":"Unknown","sdk_repo":"","next_steps":[],"operation_status":"Succeeded"}""";

    /// <summary>Name of the tool this mock stands in for.</summary>
    public string ToolName { get; } = "azsdk_package_build_code";

    /// <summary>Call identifier used by <see cref="GetMockCall"/>.</summary>
    public string CallId { get; } = "tooluse_l1vP7afmgopmnhjpjp";

    /// <summary>
    /// Builds a tool-role message carrying the canned build result.
    /// </summary>
    /// <param name="callid">Call identifier to attach to the function result.</param>
    /// <returns>A <see cref="ChatMessage"/> with a single <see cref="FunctionResultContent"/>.</returns>
    public ChatMessage GetMockResponse(string callid)
    {
        var toolOutput = new FunctionResultContent(callid, BuildResultJson);
        return new ChatMessage(ChatRole.Tool, [toolOutput]);
    }

    /// <summary>
    /// Builds an assistant-role message containing the tool invocation with a fixed package path.
    /// </summary>
    /// <returns>A <see cref="ChatMessage"/> with a single <see cref="FunctionCallContent"/>.</returns>
    public ChatMessage GetMockCall()
    {
        var callArguments = new Dictionary<string, object?>
        {
            ["packagePath"] = "C:\\azure-sdk-for-net\\sdk\\healthdataaiservices\\Azure.Health.Deidentification",
        };

        var toolInvocation = new FunctionCallContent(CallId, ToolName, callArguments);
        return new ChatMessage(ChatRole.Assistant, [toolInvocation]);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
using Microsoft.Extensions.AI;

namespace Azure.Sdk.Tools.Cli.Evaluations.ToolMocks
{
/// <summary>
/// Canned call/response pair for the <c>azsdk_package_generate_code</c> tool.
/// </summary>
public class PackageGenerateCode : IToolMock
{
    // Fixed JSON payload returned as the tool's result.
    private const string GenerateResultJson = """{"message":"SDK generation completed successfully using tspconfig.yaml.","result":"succeeded","language":"DotNet","package_name":"","version":"","package_type":"Unknown","sdk_repo":"azure-sdk-for-net","next_steps":["If the SDK is not Python, build the code"],"operation_status":"Succeeded"}""";

    /// <summary>Name of the tool this mock stands in for.</summary>
    public string ToolName { get; } = "azsdk_package_generate_code";

    /// <summary>Call identifier used by <see cref="GetMockCall"/>.</summary>
    public string CallId { get; } = "tooluse_l1vP7afmgopmnhgmjp";

    /// <summary>
    /// Builds a tool-role message carrying the canned generation result.
    /// </summary>
    /// <param name="callid">Call identifier to attach to the function result.</param>
    /// <returns>A <see cref="ChatMessage"/> with a single <see cref="FunctionResultContent"/>.</returns>
    public ChatMessage GetMockResponse(string callid)
    {
        var toolOutput = new FunctionResultContent(callid, GenerateResultJson);
        return new ChatMessage(ChatRole.Tool, [toolOutput]);
    }

    /// <summary>
    /// Builds an assistant-role message containing the tool invocation with fixed arguments:
    /// a tspconfig path, a local SDK repo path, and empty tsp-location and emitter options.
    /// </summary>
    /// <returns>A <see cref="ChatMessage"/> with a single <see cref="FunctionCallContent"/>.</returns>
    public ChatMessage GetMockCall()
    {
        var callArguments = new Dictionary<string, object?>
        {
            ["tspConfigPath"] = "C:\\azure-rest-api-specs\\specification\\healthdataaiservices\\HealthDataAIServices.DeidServices\\tspconfig.yaml",
            ["localSdkRepoPath"] = "C:\\azure-sdk-for-net",
            ["tspLocationPath"] = "",
            ["emitterOptions"] = "",
        };

        var toolInvocation = new FunctionCallContent(CallId, ToolName, callArguments);
        return new ChatMessage(ChatRole.Assistant, [toolInvocation]);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ private static void RegisterMocks()
new CreatePullRequest(),
new CreateReleasePlan(),
new VerifySetup(),
new PackageGenerateCode(),
new PackageBuildCode(),
};

foreach (var mock in mockInstances)
Expand Down
Loading