Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ public ValueTask<EvaluationResult> EvaluateAsync(

if (!string.Equals(expectedJson, actualJson, StringComparison.OrdinalIgnoreCase))
{
MetricError($"Tool call arguments did not match. This was tool call #{countCalls}\nExpected Argument JSON:{expectedJson}\nActual Argument JSON:{actualJson}", metric);
MetricError($"Tool call arguments did not match. This was tool call #{countCalls}: {expToolName}\nExpected Argument JSON:{expectedJson}\nActual Argument JSON:{actualJson}", metric);
return result;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
[assembly: Parallelizable(ParallelScope.All)]

// Set conservative parallelism
[assembly: LevelOfParallelism(5)]
[assembly: LevelOfParallelism(10)]
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using Microsoft.Extensions.AI;
using ModelContextProtocol.Client;
using OpenAI.Responses;

namespace Azure.Sdk.Tools.Cli.Evaluations.Helpers
{
Expand All @@ -14,7 +15,10 @@ public ChatCompletion(IChatClient chatClient, IMcpClient mcpClient)
_mcpClient = mcpClient;
}

public async Task<ChatResponse> GetChatResponseWithExpectedResponseAsync(IEnumerable<ChatMessage> chat, Dictionary<string, ChatMessage> expectedToolResults)
public async Task<ChatResponse> GetChatResponseWithExpectedResponseAsync(
IEnumerable<ChatMessage> chat,
Dictionary<string, ChatMessage> expectedToolResults,
IEnumerable<string> optionalToolNames)
{
var tools = await _mcpClient.ListToolsAsync();
var conversationMessages = chat.ToList();
Expand All @@ -25,6 +29,7 @@ public async Task<ChatResponse> GetChatResponseWithExpectedResponseAsync(IEnumer
};
var response = await _chatClient.GetResponseAsync(chat, chatOptions);
var chatInitialIndex = conversationMessages.Count;
var optionalCallIds = new HashSet<string>();

while (response.FinishReason == ChatFinishReason.ToolCalls)
{
Expand Down Expand Up @@ -67,12 +72,49 @@ public async Task<ChatResponse> GetChatResponseWithExpectedResponseAsync(IEnumer

conversationMessages.Add(errorResponseMessage);
}

if(optionalToolNames.Contains(functionCall.Name))
{
optionalCallIds.Add(functionCall.CallId);
}
}

response = await _chatClient.GetResponseAsync(conversationMessages, chatOptions);
}

return new ChatResponse([.. conversationMessages.Skip(chatInitialIndex)]);
// Add the final assistant message (when there are no further tool calls)
var finalAssistantMessage = response.Messages.FirstOrDefault();
if (finalAssistantMessage != null)
{
conversationMessages.Add(finalAssistantMessage);
}

// Filter out any optional tool calls and their corresponding tool results
var conversation = conversationMessages.Skip(chatInitialIndex);
if (optionalCallIds.Count == 0)
{
return new ChatResponse([.. conversation]);
}

var filtered = FilterOptionalToolResponses(conversation, optionalCallIds);
return new ChatResponse([.. filtered]);
}

/// <summary>
/// Strips function-call and function-result content whose call id is in
/// <paramref name="optionalCallIds"/> from each message, and yields only the
/// messages that still have content afterwards.
/// </summary>
/// <param name="messages">Messages to filter. Each message's <c>Contents</c> is replaced in place.</param>
/// <param name="optionalCallIds">Call ids of the tool invocations to remove.</param>
/// <returns>A lazily evaluated sequence of the messages that still contain at least one content item.</returns>
private IEnumerable<ChatMessage> FilterOptionalToolResponses(IEnumerable<ChatMessage> messages, HashSet<string> optionalCallIds)
{
    foreach (var current in messages)
    {
        // Keep everything except function calls/results tied to an optional call id.
        // Items with a null or empty call id are always kept.
        var kept = current.Contents
            .Where(item => item switch
            {
                FunctionCallContent call => string.IsNullOrEmpty(call.CallId) || !optionalCallIds.Contains(call.CallId),
                FunctionResultContent outcome => string.IsNullOrEmpty(outcome.CallId) || !optionalCallIds.Contains(outcome.CallId),
                _ => true,
            })
            .ToList();

        current.Contents = kept;

        // A message left with no content is dropped from the output entirely.
        if (kept.Count > 0)
        {
            yield return current;
        }
    }
}

public async Task<ChatResponse> GetChatResponseAsync(IEnumerable<ChatMessage> chat)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ namespace Azure.Sdk.Tools.Cli.Evaluations.Helpers
{
public class EvaluationHelper
{
private const string verifySetupToolName = "azsdk_verify_setup";

public static void ValidateBooleanMetricEvaluator(EvaluationResult result, string metricName)
{
EvaluationRating[] expectedRatings = [EvaluationRating.Good, EvaluationRating.Exceptional];
Expand Down Expand Up @@ -70,16 +72,33 @@ public static async Task<EvaluationResult> RunToolInputScenarioAsync(
IEnumerable<IEvaluator>? evaluators = null,
bool enableResponseCaching = true,
IEnumerable<EvaluationContext>? additionalContexts = null,
CancellationToken cancellationToken = default)
CancellationToken cancellationToken = default,
IEnumerable<string>? optionalToolNames = null)
{
evaluators ??= [new ExpectedToolInputEvaluator()];

var fullChat = scenarioData.ChatHistory.Append(scenarioData.NextMessage);
var expectedToolResults = ChatMessageHelper.GetExpectedToolsByName(scenarioData.ExpectedOutcome, toolNames);

// Default optional tools to empty when not provided
optionalToolNames ??= [];

// Get expected tool names from the scenario data and optional tool names
var expectedToolNames = ChatMessageHelper.GetExpectedToolsByName(scenarioData.ExpectedOutcome, toolNames).Keys;
var filteredOptionalToolNames = GetOptionalToolNames(optionalToolNames, expectedToolNames);

// We can use LoadScenarioFromPrompt with an empty prompt to get the optional tools
// in the proper format.
var optionalTools = ChatMessageHelper.LoadScenarioFromPrompt("", filteredOptionalToolNames).ExpectedOutcome;

// Include the optional tools alongside the expected ones.
// Later we will filter them out of the response.
var toolChatMessages = optionalTools.Concat(scenarioData.ExpectedOutcome);
var toolResults = ChatMessageHelper.GetExpectedToolsByName(toolChatMessages, toolNames);

var response = await chatCompletion.GetChatResponseWithExpectedResponseAsync(
fullChat,
expectedToolResults);
toolResults,
filteredOptionalToolNames);

return await RunScenarioAsync(
fullChat,
Expand All @@ -93,5 +112,15 @@ public static async Task<EvaluationResult> RunToolInputScenarioAsync(
additionalContexts,
cancellationToken);
}

/// <summary>
/// Builds the set of tool names that are treated as optional for a scenario.
/// </summary>
/// <param name="optionalToolNames">Optional tool names supplied by the caller.</param>
/// <param name="expectedToolNames">Tool names the scenario expects; these are never reported as optional.</param>
/// <returns>
/// The distinct optional tool names, always including the verify-setup tool unless it is expected,
/// with every expected tool name removed.
/// </returns>
private static IEnumerable<string> GetOptionalToolNames(
    IEnumerable<string> optionalToolNames,
    IEnumerable<string> expectedToolNames)
{
    // Verify setup is always added as an optional tool; Except then removes any
    // name that is expected (and de-duplicates the remainder).
    // Materialize once: the result is consumed more than once by callers, and a
    // deferred Except query would recompute the set difference on every enumeration.
    return optionalToolNames
        .Append(verifySetupToolName)
        .Except(expectedToolNames)
        .ToArray();
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
using Azure.Sdk.Tools.Cli.Evaluations.Evaluators;
using Azure.Sdk.Tools.Cli.Evaluations.Helpers;
using Azure.Sdk.Tools.Cli.Evaluations.Models;
using Microsoft.Extensions.AI.Evaluation;
using NUnit.Framework;

namespace Azure.Sdk.Tools.Cli.Evaluations.Scenarios
{
public partial class Scenario
{
    /// <summary>
    /// Builds a scenario from a prompt asking whether a TypeSpec project is in the public repo,
    /// with <c>azsdk_typespec_check_project_in_public_repo</c> as the only expected tool,
    /// runs it through the tool-input evaluation, and validates the result.
    /// </summary>
    [Test]
    [Category(RepositoryCategories.AzureRestApiSpecs)]
    public async Task Evaluate_CheckPublicRepo()
    {
        const string prompt = "Check if my TypeSpec project is in the public repo. My setup has already been verified, do not run azsdk_verify_setup. Project root: C:\\\\azure-rest-api-specs\\\\specification\\\\contosowidgetmanager\\\\Contoso.WidgetManager.";
        const bool checkInputs = true;
        string[] expectedTools = ["azsdk_typespec_check_project_in_public_repo"];

        var scenario = ChatMessageHelper.LoadScenarioFromPrompt(prompt, expectedTools);

        // The evaluator context is built from the expected outcome derived from the prompt.
        var evaluatorContext = new ExpectedToolInputEvaluatorContext(scenario.ExpectedOutcome, s_toolNames!, checkInputs);

        var evaluation = await EvaluationHelper.RunToolInputScenarioAsync(
            scenarioName: ScenarioName,
            scenarioData: scenario,
            chatCompletion: s_chatCompletion!,
            chatConfig: s_chatConfig!,
            executionName: s_executionName,
            reportingPath: ReportingPath,
            toolNames: s_toolNames!,
            evaluators: [new ExpectedToolInputEvaluator()],
            enableResponseCaching: true,
            additionalContexts: new EvaluationContext[] { evaluatorContext });

        EvaluationHelper.ValidateToolInputsEvaluator(evaluation);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
using Azure.Sdk.Tools.Cli.Evaluations.Evaluators;
using Azure.Sdk.Tools.Cli.Evaluations.Helpers;
using Azure.Sdk.Tools.Cli.Evaluations.Models;
using Microsoft.Extensions.AI.Evaluation;
using NUnit.Framework;

namespace Azure.Sdk.Tools.Cli.Evaluations.Scenarios
{
public partial class Scenario
{
    /// <summary>
    /// Builds a scenario from a prompt asking to confirm a TypeSpec project is in the public repo
    /// and then run TypeSpec validation, with <c>azsdk_typespec_check_project_in_public_repo</c> and
    /// <c>azsdk_run_typespec_validation</c> as the expected tools, runs it through the tool-input
    /// evaluation, and validates the result.
    /// </summary>
    [Test]
    [Category(RepositoryCategories.AzureRestApiSpecs)]
    public async Task Evaluate_CheckPublicRepoThenValidate()
    {
        // The project path is written as a regular Windows path ("C:\..."); the drive
        // colon must not be preceded by a backslash.
        const string prompt =
            "Confirm the TypeSpec project is in the public repo, then run TypeSpec validation. " +
            "Project path: C:\\azure-rest-api-specs\\specification\\contosowidgetmanager\\Contoso.WidgetManager. " +
            "My setup has already been verified, do not run azsdk_verify_setup.";

        string[] expectedTools =
        [
            "azsdk_typespec_check_project_in_public_repo",
            "azsdk_run_typespec_validation"
        ];

        var scenarioData = ChatMessageHelper.LoadScenarioFromPrompt(prompt, expectedTools);
        bool checkInputs = true;

        var result = await EvaluationHelper.RunToolInputScenarioAsync(
            scenarioName: this.ScenarioName,
            scenarioData: scenarioData,
            chatCompletion: s_chatCompletion!,
            chatConfig: s_chatConfig!,
            executionName: s_executionName,
            reportingPath: ReportingPath,
            toolNames: s_toolNames!,
            evaluators: [new ExpectedToolInputEvaluator()],
            enableResponseCaching: true,
            additionalContexts: new EvaluationContext[]
            {
                new ExpectedToolInputEvaluatorContext(scenarioData.ExpectedOutcome, s_toolNames!, checkInputs)
            });

        EvaluationHelper.ValidateToolInputsEvaluator(result);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
using Azure.Sdk.Tools.Cli.Evaluations.Evaluators;
using Azure.Sdk.Tools.Cli.Evaluations.Helpers;
using Azure.Sdk.Tools.Cli.Evaluations.Models;
using Microsoft.Extensions.AI.Evaluation;
using NUnit.Framework;

namespace Azure.Sdk.Tools.Cli.Evaluations.Scenarios
{
public partial class Scenario
{
    /// <summary>
    /// Builds a scenario from a prompt asking to list the modified TypeSpec projects,
    /// with <c>azsdk_get_modified_typespec_projects</c> as the only expected tool,
    /// runs it through the tool-input evaluation, and validates the result.
    /// </summary>
    [Test]
    [Category(RepositoryCategories.AzureRestApiSpecs)]
    public async Task Evaluate_GetModifiedTypespecProjects()
    {
        const string prompt = "List the TypeSpec projects modified in my repo. My setup has already been verified, do not run azsdk_verify_setup. Path to root: C:\\azure-rest-api-specs. Compare against main.";
        const bool checkInputs = true;
        string[] expectedTools = ["azsdk_get_modified_typespec_projects"];

        var scenario = ChatMessageHelper.LoadScenarioFromPrompt(prompt, expectedTools);

        // The evaluator context is built from the expected outcome derived from the prompt.
        var evaluatorContext = new ExpectedToolInputEvaluatorContext(scenario.ExpectedOutcome, s_toolNames!, checkInputs);

        var evaluation = await EvaluationHelper.RunToolInputScenarioAsync(
            scenarioName: ScenarioName,
            scenarioData: scenario,
            chatCompletion: s_chatCompletion!,
            chatConfig: s_chatConfig!,
            executionName: s_executionName,
            reportingPath: ReportingPath,
            toolNames: s_toolNames!,
            evaluators: [new ExpectedToolInputEvaluator()],
            enableResponseCaching: true,
            additionalContexts: new EvaluationContext[] { evaluatorContext });

        EvaluationHelper.ValidateToolInputsEvaluator(evaluation);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ public async Task Evaluate_ValidateTypespec()
"azsdk_run_typespec_validation",
];

string [] optionalTools =
[
"azsdk_typespec_check_project_in_public_repo"
];

// Build scenario data from prompt
var scenarioData = ChatMessageHelper.LoadScenarioFromPrompt(prompt, expectedTools);

Expand All @@ -38,7 +43,8 @@ public async Task Evaluate_ValidateTypespec()
additionalContexts: new EvaluationContext[]
{
new ExpectedToolInputEvaluatorContext(scenarioData.ExpectedOutcome, s_toolNames!, checkInputs)
});
},
optionalToolNames: optionalTools);

EvaluationHelper.ValidateToolInputsEvaluator(result);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
using Azure.Sdk.Tools.Cli.Evaluations.Evaluators;
using Azure.Sdk.Tools.Cli.Evaluations.Helpers;
using Azure.Sdk.Tools.Cli.Evaluations.Models;
using Microsoft.Extensions.AI.Evaluation;
using NUnit.Framework;

namespace Azure.Sdk.Tools.Cli.Evaluations.Scenarios
{
public partial class Scenario
{
    /// <summary>
    /// Builds a scenario from a prompt asking to link a namespace approval issue to a release plan,
    /// with <c>azsdk_link_namespace_approval_issue</c> as the only expected tool,
    /// runs it through the tool-input evaluation, and validates the result.
    /// </summary>
    [Test]
    public async Task Evaluate_LinkNamespaceApprovalIssue()
    {
        const string prompt = "Link namespace approval issue https://github.com/Azure/azure-sdk/issues/1234 to release plan 12345. My setup has already been verified, do not run azsdk_verify_setup.";
        const bool checkInputs = true;
        string[] expectedTools = ["azsdk_link_namespace_approval_issue"];

        var scenario = ChatMessageHelper.LoadScenarioFromPrompt(prompt, expectedTools);

        // The evaluator context is built from the expected outcome derived from the prompt.
        var evaluatorContext = new ExpectedToolInputEvaluatorContext(scenario.ExpectedOutcome, s_toolNames!, checkInputs);

        var evaluation = await EvaluationHelper.RunToolInputScenarioAsync(
            scenarioName: ScenarioName,
            scenarioData: scenario,
            chatCompletion: s_chatCompletion!,
            chatConfig: s_chatConfig!,
            executionName: s_executionName,
            reportingPath: ReportingPath,
            toolNames: s_toolNames!,
            evaluators: [new ExpectedToolInputEvaluator()],
            enableResponseCaching: true,
            additionalContexts: new EvaluationContext[] { evaluatorContext });

        EvaluationHelper.ValidateToolInputsEvaluator(evaluation);
    }
}
}
Loading