diff --git a/docs/mkdocs/en/evaluation.md b/docs/mkdocs/en/evaluation.md index f7f1eb5e4..d7f39a18c 100644 --- a/docs/mkdocs/en/evaluation.md +++ b/docs/mkdocs/en/evaluation.md @@ -58,53 +58,64 @@ if err != nil { ```json { - "eval_set_id": "math-basic", + "evalSetId": "math-basic", "name": "math-basic", - "eval_cases": [ + "evalCases": [ { - "eval_id": "calc_add", - "conversation": [ - { - "invocation_id": "calc_add-1", - "user_content": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "final_response": { - "parts": [ - { - "text": "calc result: 5" - } - ], - "role": "assistant" - }, - "intermediate_data": { - "tool_uses": [ - { - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ] - }, - "creation_timestamp": 1761134484.981062 + "evalId": "calc_add", + "conversation": [ + { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + } + ], + "sessionInput": { + "appName": "math-eval-app", + "userId": "user" } - ], - "session_input": { - "app_name": "math-eval-app", - "user_id": "user" - }, - "creation_timestamp": 1761134484.981062 - }, + } ], - "creation_timestamp": 1761134484.9804401 + "creationTimestamp": 1761134484.9804401 } ``` @@ -113,8 +124,23 @@ if err != nil { ```json [ { - "metric_name": "tool_trajectory_avg_score", - "threshold": 1 + "metricName": "tool_trajectory_avg_score", + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + 
"matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } } ] ``` @@ -122,7 +148,159 @@ if err != nil { #### Evaluation Result File Example ```json -"{\"eval_set_result_id\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_result_name\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_id\":\"math-basic\",\"eval_case_results\":[{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_add\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"8b205b3f-682e-409a-b751-89ef805d0221\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"The result of adding 2 and 3 is **5**.\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_j75SIh8A9xSlG61OrC1ARIab\",\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_add-1\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 
5\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.981062},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"74252944-b1a7-4c17-8f39-4a5809395d1d\",\"user_id\":\"user\"},{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_multiply\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"65226930-d45c-43ae-ab88-9c35f3abce70\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"6 × 7 = 42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_b3Gj4Y3fJu9Blkbl6H0MLquO\",\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_multiply-1\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.9812014},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"6393fabd-ab50-49b7-8656-59fcb0a29758\",\"user_id\":\"user\"}],\"creation_timestamp\":1761134849.3572516}" +{ + "evalSetResultId": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", + "evalSetResultName": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", + "evalSetId": "math-basic", + "evalCaseResults": [ + { + "evalSetId": "math-basic", + "evalId": "calc_add", + "finalEvalStatus": 
1, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "53845847-16e0-4960-9d00-d3abf0ab1807", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "The result of 2 + 3 is **5**." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } + } + ] + } + ], + "sessionId": 
"e9cc851f-8c89-45f4-b430-7c54991c7dda", + "userId": "user" + } + ], + "creationTimestamp": 1763997862.5581782 +} ``` ### inmemory @@ -217,6 +395,7 @@ cases := []*evalset.EvalCase{ IntermediateData: &evalset.IntermediateData{ ToolUses: []*genai.FunctionCall{ { + ID: "tool_use_1", Name: "calculator", Args: map[string]interface{}{ "operation": "add", @@ -225,6 +404,18 @@ cases := []*evalset.EvalCase{ }, }, }, + ToolResponses: []*genai.FunctionResponse{ + { + ID: "tool_use_1", + Name: "calculator", + Response: map[string]interface{}{ + "a": 2.0, + "b": 3.0, + "operation": "add", + "result": 5.0, + }, + }, + }, }, }, }, @@ -249,6 +440,25 @@ import "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" evalMetric := &metric.EvalMetric{ MetricName: "tool_trajectory_avg_score", Threshold: 1.0, + Criterion: criterion.New( + criterion.WithToolTrajectory( + ctooltrajectory.New( + ctooltrajectory.WithDefault( + &ctooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, + }, + Response: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, + }, + }, + ), + ), + ), + ), } metricManager.Add(ctx, appName, evalSetID, evalMetric) ``` @@ -358,18 +568,29 @@ The framework provides two implementations of the EvalSet Manager: ### Metric -A Metric represents an evaluation metric used to measure a specific aspect of an EvalSet's performance. +Metric represents an evaluation indicator used to measure a certain aspect of EvalSet’s performance. Each evaluation indicator includes the metric name, evaluation criterion, and score threshold. 
-Each metric consists of a metric name and a scoring threshold: +During the evaluation process, the evaluator compares the actual conversation with the expected conversation according to the configured evaluation criterion, calculates the evaluation score for this metric, and compares it with the threshold: -- When the evaluation score falls below the threshold, the metric is considered failed. -- When the evaluation score reaches or exceeds the threshold, the metric is considered passed. +- When the evaluation score is lower than the threshold, the metric is determined as not passed. +- When the evaluation score reaches or exceeds the threshold, the metric is determined as passed. ```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + // EvalMetric represents a single metric used to evaluate an EvalCase. type EvalMetric struct { - MetricName string // metric name. - Threshold float64 // scoring threshold. + MetricName string // Metric name. + Threshold float64 // Score threshold. + Criterion *criterion.Criterion // Evaluation criterion. +} + +// Criterion aggregates various evaluation criteria. +type Criterion struct { + ToolTrajectory *tooltrajectory.ToolTrajectoryCriterion // Tool trajectory evaluation criterion. } ``` @@ -526,15 +747,19 @@ type EvalCaseResult struct { EvalMetricResult represents the evaluation result of a specific metric, including the score, status, threshold, and additional information. ```go -import "trpc.group/trpc-go/trpc-agent-go/evaluation/status" +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/status" +) // EvalMetricResult represents the evaluation result of a single metric. type EvalMetricResult struct { - MetricName string // Metric name. - Score float64 // Actual score. - EvalStatus status.EvalStatus // Evaluation status. 
- Threshold float64 // Score threshold. - Details map[string]any // Additional information, such as scoring process, error description, etc. + MetricName string // Metric name. + Score float64 // Actual score. + EvalStatus status.EvalStatus // Evaluation status. + Threshold float64 // Score threshold. + Criterion *criterion.Criterion // Evaluation criterion. + Details map[string]any // Additional information, such as scoring process, error description, etc. } ``` @@ -961,3 +1186,176 @@ func (l *customLocator) List(baseDir, appName string) ([]string, error) { return results, nil } ``` + +### Evaluation Criterion + +The evaluation criterion describes the specific evaluation method and can be combined as needed. + +The framework has the following built-in types of evaluation criteria: + +| Criterion Type | Applicable Object | +| ----------------------- | ----------------------------------------------------- | +| TextCriterion | Text string | +| JSONCriterion | JSON object, usually used to compare `map[string]any` | +| ToolTrajectoryCriterion | Tool invocation trajectory | +| Criterion | Aggregation of multiple criteria | + +#### TextCriterion + +TextCriterion is used for string matching and can be configured to ignore case and to use a specific matching strategy. + +```go +// TextCriterion defines the matching method for strings. +type TextCriterion struct { + Ignore bool // Whether to skip matching. + CaseInsensitive bool // Whether case-insensitive. + MatchStrategy TextMatchStrategy // Matching strategy. + Compare func(actual, expected string) (bool, error) // Custom comparison. +} +``` + +Explanation of TextMatchStrategy values: + +| TextMatchStrategy Value | Description | +| ----------------------- | ----------------------------------------------------------------------- | +| exact | The actual string is exactly the same as the expected string (default). | +| contains | The actual string contains the expected string. 
| +| regex | The actual string matches the expected string as a regular expression. | + +#### JSONCriterion + +JSONCriterion is used to compare structured JSON data. You can configure whether to ignore the comparison and choose a specific matching strategy. + +```go +// JSONCriterion defines the matching method for JSON objects. +type JSONCriterion struct { + Ignore bool // Whether to skip matching. + MatchStrategy JSONMatchStrategy // Matching strategy. + Compare func(actual, expected map[string]any) (bool, error) // Custom comparison. +} +``` + +Explanation of JSONMatchStrategy values: + +| JSONMatchStrategy Value | Description | +| ----------------------- | ------------------------------------------------------------------- | +| exact | The actual JSON is exactly the same as the expected JSON (default). | + +#### ToolTrajectoryCriterion + +ToolTrajectoryCriterion is used to configure the evaluation criteria for tool invocations and responses. You can set default strategies, customize strategies by tool name, and control whether to ignore the invocation order. + +```go +// ToolTrajectoryCriterion defines the evaluation criteria for tool invocations and responses. +type ToolTrajectoryCriterion struct { + DefaultStrategy *ToolTrajectoryStrategy // Default strategy. + ToolStrategy map[string]*ToolTrajectoryStrategy // Customized strategies by tool name. + OrderInsensitive bool // Whether to ignore invocation order. + Compare func(actual, expected *evalset.Invocation) (bool, error) // Custom comparison. +} + +// ToolTrajectoryStrategy defines the matching strategy for a single tool. +type ToolTrajectoryStrategy struct { + Name *TextCriterion // Tool name matching. + Arguments *JSONCriterion // Invocation arguments matching. + Response *JSONCriterion // Tool response matching. +} +``` + +DefaultStrategy is used to configure the global default evaluation criterion and applies to all tools. 
+ +ToolStrategy overrides the evaluation criterion for specific tools by tool name. When ToolStrategy is not set, all tool invocations use DefaultStrategy. + +If no evaluation criterion is configured, the framework uses the default evaluation criterion: tool names are compared using TextCriterion with the `exact` strategy, and arguments and responses are compared using JSONCriterion with the `exact` strategy. This ensures that tool trajectory evaluation always has a reasonable fallback behavior. + +The following example illustrates a typical scenario: for most tools you want strict alignment of tool invocations and results, but for time-related tools such as `current_time`, the response value itself is unstable. Therefore, you only need to check whether the correct tool and arguments were invoked as expected, without requiring the time value itself to be exactly the same. + +```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +criterion := criterion.New( + criterion.WithToolTrajectory( + tooltrajectory.New( + tooltrajectory.WithDefault( + &tooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + Response: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + }, + ), + tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{ + "current_time": { + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + Response: &json.JSONCriterion{ + Ignore: true, // Ignore matching of this tool's response. 
+ }, + }, + }), + ), + ), +) +``` + +By default, tool invocations are compared one by one in the order in which they appear. The actual tool invocation sequence and the expected tool invocation sequence must match in length, order, and in the tool name, arguments, and response at each step. If the invocation order is different, the evaluation will be considered as failed. + +OrderInsensitive controls whether the tool invocation order is ignored. When enabled, the evaluation logic first generates a sorting key for each tool invocation (composed of the tool name and the normalized representation of arguments and response). It then sorts the actual invocation sequence and the expected invocation sequence by this key, producing two invocation lists with stable order. Next, it compares the corresponding invocations in the sorted lists one by one, and determines whether these invocations match according to the configured evaluation criteria. Put simply, as long as the tool invocations on both sides are completely identical in content, the evaluation will not fail due to differences in the original invocation order. For example: + +```go +criterion := criterion.New( + criterion.WithToolTrajectory( + ctooltrajectory.New( + ctooltrajectory.WithOrderInsensitive(true), + ), + ), +) +``` + +### Evaluator + +#### Tool Trajectory Evaluator + +The metric name corresponding to the tool trajectory evaluator is `tool_trajectory_avg_score`. It is used to evaluate whether the Agent’s use of tools across multiple conversations conforms to expectations. + +In a single conversation, the evaluator compares the actual tool invocation trajectory with the expected trajectory using `ToolTrajectoryCriterion`: + +* If the entire tool invocation trajectory satisfies the evaluation criterion, the score of this conversation on this metric is 1. +* If any step of the invocation does not satisfy the evaluation criterion, the score of this conversation on this metric is 0. 
+ +In the scenario of multiple conversations, the evaluator takes the average of the scores of all conversations on this metric as the final `tool_trajectory_avg_score`, and compares it with `EvalMetric.Threshold` to determine whether the result is pass or fail. + +A typical way to combine the tool trajectory evaluator with Metric and Criterion is as follows: + +```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + ctooltrajectory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +evalMetric := &metric.EvalMetric{ + MetricName: "tool_trajectory_avg_score", + Threshold: 1.0, + Criterion: criterion.New( + criterion.WithToolTrajectory( + // Use the default evaluation criterion; tool name, arguments, and response must be strictly identical. + ctooltrajectory.New(), + ), + ), +} +``` diff --git a/docs/mkdocs/zh/evaluation.md b/docs/mkdocs/zh/evaluation.md index ffd9125db..3cfdcae9b 100644 --- a/docs/mkdocs/zh/evaluation.md +++ b/docs/mkdocs/zh/evaluation.md @@ -58,53 +58,64 @@ if err != nil { ```json { - "eval_set_id": "math-basic", + "evalSetId": "math-basic", "name": "math-basic", - "eval_cases": [ + "evalCases": [ { - "eval_id": "calc_add", - "conversation": [ - { - "invocation_id": "calc_add-1", - "user_content": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "final_response": { - "parts": [ - { - "text": "calc result: 5" - } - ], - "role": "assistant" - }, - "intermediate_data": { - "tool_uses": [ - { - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ] - }, - "creation_timestamp": 1761134484.981062 + "evalId": "calc_add", + "conversation": [ + { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + 
"intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + } + ], + "sessionInput": { + "appName": "math-eval-app", + "userId": "user" } - ], - "session_input": { - "app_name": "math-eval-app", - "user_id": "user" - }, - "creation_timestamp": 1761134484.981062 - }, + } ], - "creation_timestamp": 1761134484.9804401 + "creationTimestamp": 1761134484.9804401 } ``` @@ -113,8 +124,23 @@ if err != nil { ```json [ { - "metric_name": "tool_trajectory_avg_score", - "threshold": 1 + "metricName": "tool_trajectory_avg_score", + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } } ] ``` @@ -122,7 +148,159 @@ if err != nil { #### 评估结果 EvalResult 文件示例 ```json -"{\"eval_set_result_id\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_result_name\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_id\":\"math-basic\",\"eval_case_results\":[{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_add\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"8b205b3f-682e-409a-b751-89ef805d0221\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"The result of adding 2 and 3 is 
**5**.\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_j75SIh8A9xSlG61OrC1ARIab\",\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_add-1\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 5\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.981062},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"74252944-b1a7-4c17-8f39-4a5809395d1d\",\"user_id\":\"user\"},{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_multiply\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"65226930-d45c-43ae-ab88-9c35f3abce70\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"6 × 7 = 42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_b3Gj4Y3fJu9Blkbl6H0MLquO\",\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_multiply-1\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 
42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.9812014},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"6393fabd-ab50-49b7-8656-59fcb0a29758\",\"user_id\":\"user\"}],\"creation_timestamp\":1761134849.3572516}" +{ + "evalSetResultId": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", + "evalSetResultName": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", + "evalSetId": "math-basic", + "evalCaseResults": [ + { + "evalSetId": "math-basic", + "evalId": "calc_add", + "finalEvalStatus": 1, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "53845847-16e0-4960-9d00-d3abf0ab1807", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "The result of 2 + 3 is **5**." 
+ } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } + } + ] + } + ], + "sessionId": "e9cc851f-8c89-45f4-b430-7c54991c7dda", + "userId": "user" + } + ], + "creationTimestamp": 1763997862.5581782 +} ``` ### 内存 inmemory @@ -217,6 +395,7 @@ cases := []*evalset.EvalCase{ IntermediateData: &evalset.IntermediateData{ ToolUses: []*genai.FunctionCall{ { + ID: "tool_use_1", Name: "calculator", Args: map[string]interface{}{ "operation": "add", @@ -225,6 +404,18 @@ cases := []*evalset.EvalCase{ }, }, }, + ToolResponses: []*genai.FunctionResponse{ + { + ID: "tool_use_1", + Name: "calculator", + Response: map[string]interface{}{ + "a": 2.0, + "b": 3.0, + "operation": "add", + "result": 5.0, + }, + }, + }, }, }, }, @@ -249,6 +440,25 @@ import 
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric" evalMetric := &metric.EvalMetric{ MetricName: "tool_trajectory_avg_score", Threshold: 1.0, + Criterion: criterion.New( + criterion.WithToolTrajectory( + ctooltrajectory.New( + ctooltrajectory.WithDefault( + &ctooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, + }, + Response: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, + }, + }, + ), + ), + ), + ), } metricManager.Add(ctx, appName, evalSetID, evalMetric) ``` @@ -350,18 +560,31 @@ type Manager interface { ### 评估指标 -- Metric -Metric 表示一个评估指标,用于衡量 EvalSet 的某一方面表现。 +Metric 表示一个评估指标,用于衡量 EvalSet 的某一方面表现,每个评估指标包含指标名、评估准则和评分阈值。 -每个指标包含指标名和评分阈值: +评估过程中,评估器会根据配置的评估准则对实际会话与预期会话进行比较,计算出该指标的评估得分,并与阈值进行对比: - 当评估得分低于阈值时,指标判定为未通过。 - 当评估得分达到或超过阈值时,指标判定为通过。 ```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/llm" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + // EvalMetric 表示用于评估 EvalCase 的单项指标 type EvalMetric struct { - MetricName string // 指标名称 - Threshold float64 // 评分阈值 + MetricName string // 指标名称 + Threshold float64 // 评分阈值 + Criterion *criterion.Criterion // 评估准则 +} + +// Criterion 聚合各类评估准则 +type Criterion struct { + ToolTrajectory *tooltrajectory.ToolTrajectoryCriterion // 工具轨迹评估准则 + LLMJudge *llm.LLMCriterion // LLM 评估准则 } ``` @@ -432,9 +655,9 @@ type Evaluator interface { // EvaluateResult 表示评估器在多次会话上的汇总结果 type EvaluateResult struct { - OverallScore float64 // 总体得分 - OverallStatus status.EvalStatus // 总体状态,分为通过/未通过/未评估 - PerInvocationResults []PerInvocationResult // 单次会话评估结果 + OverallScore float64 // 总体得分 + OverallStatus status.EvalStatus // 总体状态,分为通过/未通过/未评估 + PerInvocationResults []*PerInvocationResult // 单次会话评估结果 } // PerInvocationResult 表示单次会话的评估结果 @@ -474,6 +697,9 
@@ type Registry interface { - 若实际工具调用序列与预期完全一致,则计 1 分; - 若不一致,则计 0 分。 - 对于多次会话:计算各会话得分的平均值作为最终得分。 +- `llm_final_response` LLM 最终响应评估器。 + - 单次采样:评估模型返回 `is_the_agent_response_valid` 字段,`valid` 计 1 分,否则计 0 分。 + - 多次采样:按多数表决决定最终判定,再与 `EvalMetric.Threshold` 比较得出通过/未通过,评估调用次数与生成参数由 `LLMCriterion.JudgeModel` 配置。 ### 评估结果 -- EvalResult @@ -514,15 +740,19 @@ type EvalCaseResult struct { EvalMetricResult 表示某一指标的评估结果,包括得分、状态、阈值及附加信息。 ```go -import "trpc.group/trpc-go/trpc-agent-go/evaluation/status" +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/status" +) // EvalMetricResult 表示单项指标的评估结果 type EvalMetricResult struct { - MetricName string // 指标名称 - Score float64 // 实际得分 - EvalStatus status.EvalStatus // 评测状态 - Threshold float64 // 阈值 - Details map[string]any // 额外信息,如评分过程、错误描述等 + MetricName string // 指标名称 + Score float64 // 实际得分 + EvalStatus status.EvalStatus // 评测状态 + Threshold float64 // 阈值 + Criterion *criterion.Criterion // 评估准则 + Details *EvalMetricResultDetails // 额外信息,如评分过程、错误描述等 } ``` @@ -537,6 +767,15 @@ type EvalMetricResultPerInvocation struct { ExpectedInvocation *evalset.Invocation // 预期的对话结果 EvalMetricResults []*EvalMetricResult // 各指标评估结果 } + +// ScoreResult 表示单项指标的分数结果 +type ScoreResult struct { + Score float64 // 得分 +} + +// EvalMetricResultDetails 预留字段 +type EvalMetricResultDetails struct { +} ``` EvalResult Manager 负责管理评估结果的存储、查询与列表操作,接口定义如下: @@ -946,3 +1185,270 @@ func (l *customLocator) List(baseDir, appName string) ([]string, error) { return results, nil } ``` + +### 评估准则 + +评估准则描述具体的评估方式,可按需组合使用。 + +框架内置了以下评估准则类型: + +| 准则类型 | 适用对象 | +|-------------------------|--------------------------------------| +| TextCriterion | 文本字符串 | +| JSONCriterion | JSON 对象,通常用于比较 map[string]any | +| ToolTrajectoryCriterion | 工具调用轨迹 | +| LLMCriterion | 基于 LLM 评估模型的评估 | +| Criterion | 多种准则的聚合 | + +#### TextCriterion + +TextCriterion 用于字符串匹配,可配置是否忽略大小写和具体的匹配策略。 + +```go +// TextCriterion 定义字符串的匹配方式。 
+type TextCriterion struct { + Ignore bool // 是否跳过匹配 + CaseInsensitive bool // 是否大小写不敏感 + MatchStrategy TextMatchStrategy // 匹配策略 + Compare func(actual, expected string) (bool, error) // 自定义比较 +} +``` + +TextMatchStrategy 取值说明: + +| TextMatchStrategy 取值 | 说明 | +|-----------------------|------------------------------| +| exact | 实际字符串与预期字符串完全一致(默认)。 | +| contains | 实际字符串包含预期字符串。 | +| regex | 实际字符串满足预期字符串作为正则表达式。 | + +#### JSONCriterion + +JSONCriterion 用于对比结构化 JSON 数据,可配置是否忽略比较以及具体的匹配策略。 + +```go +// JSONCriterion 定义 JSON 对象的匹配方式。 +type JSONCriterion struct { + Ignore bool // 是否跳过匹配 + MatchStrategy JSONMatchStrategy // 匹配策略 + Compare func(actual, expected map[string]any) (bool, error) // 自定义比较 +} +``` + +JSONMatchStrategy 取值说明: + +| JSONMatchStrategy 取值 | 说明 | +|-----------------------|------------------------------| +| exact | 实际 JSON 与预期 JSON 完全一致(默认)。 | + +#### ToolTrajectoryCriterion + +ToolTrajectoryCriterion 用于配置工具调用与响应的评估准则,可设置默认策略、按工具名定制策略以及是否忽略调用顺序。 + +```go +// ToolTrajectoryCriterion 定义工具调用与响应的评估准则。 +type ToolTrajectoryCriterion struct { + DefaultStrategy *ToolTrajectoryStrategy // 默认策略 + ToolStrategy map[string]*ToolTrajectoryStrategy // 按工具名定制策略 + OrderInsensitive bool // 是否忽略调用顺序 + Compare func(actual, expected *evalset.Invocation) (bool, error) // 自定义比较 +} + +// ToolTrajectoryStrategy 定义单个工具的匹配策略。 +type ToolTrajectoryStrategy struct { + Name *TextCriterion // 工具名匹配 + Arguments *JSONCriterion // 调用参数匹配 + Response *JSONCriterion // 工具响应匹配 +} +``` + +DefaultStrategy 用于配置全局默认评估准则,适用于所有工具。 + +ToolStrategy 按工具名覆盖特定工具的评估准则,未设置 ToolStrategy 时所有工具调用都使用 DefaultStrategy。 + +若未设置任何评估准则,框架会使用默认评估准则:工具名按 TextCriterion 的 exact 策略比较,参数和响应按 JSONCriterion 的 exact 策略比较,保证工具轨迹评估始终有合理的兜底行为。 + +下面的示例展示了一个典型场景,大部分工具希望严格对齐工具调用和结果,但 current_time 这类时间相关工具的响应值本身不稳定,因此只需要检查是否按预期调用了正确的工具和参数,而不要求时间值本身完全一致。 + +```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" + 
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +criterion := criterion.New( + criterion.WithToolTrajectory( + tooltrajectory.New( + tooltrajectory.WithDefault( + &tooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + Response: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + }, + ), + tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{ + "current_time": { + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + Response: &json.JSONCriterion{ + Ignore: true, // 忽略该工具响应的匹配. + }, + }, + }), + ), + ), +) +``` + +默认情况下,工具调用是按出现顺序逐条比对的,实际调用工具序列与预期工具调用序列在长度、顺序以及每一步的工具名、参数和响应上都需要匹配,若调用顺序不同则会被判定为评估不通过。 + +OrderInsensitive 用于控制是否对工具调用顺序不敏感。开启后,评估逻辑会先为每一次工具调用生成一个排序键(由工具名以及参数和响应的规范化表示共同构成),再分别对实际调用序列和预期调用序列按照这一键进行排序,得到两个具有稳定顺序的调用列表;随后按排序后的顺序逐一比对对应位置的调用,并根据配置的评估准则判断这些调用是否匹配。换简单来说,只要两侧包含的工具调用在调用内容上完全一致,即使原始调用顺序不同,也不会因为顺序差异而导致评估不通过,示例如下: + +```go +criterion := criterion.New( + criterion.WithToolTrajectory( + ctooltrajectory.New( + ctooltrajectory.WithOrderInsensitive(true), + ), + ), +) +``` + +#### LLMCriterion + +LLMCriterion 用于配置基于大模型的评估准则,适用于需要由模型给出评估结论的场景。 + +```go +// LLMCriterion 配置评估模型 +type LLMCriterion struct { + JudgeModel *JudgeModelOptions // 评估模型配置 +} + +// JudgeModelOptions 定义评估模型的详细参数 +type JudgeModelOptions struct { + ProviderName string // 模型供应商名称 + ModelName string // 评估模型名称 + BaseURL string // 模型 Base URL + APIKey string // 模型 API Key + ExtraFields map[string]any // 模型请求的额外参数 + NumSamples int // 评估采样次数 + Generation *model.GenerationConfig // 评估模型的生成配置 +} +``` + +- `NumSamples` 控制评估模型调用次数,未配置时默认值为 1。 +- `Generation` 默认使用 
`MaxTokens=2000`、`Temperature=0.8`、`Stream=false`。 + +可通过 `criterion.WithLLMJudge` 传入自定义配置,例如: + +```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/llm" + "trpc.group/trpc-go/trpc-agent-go/model" +) + +criterion := criterion.New( + criterion.WithLLMJudge( + llm.New( + "openai", + "deepseek-chat", + llm.WithNumSamples(3), + llm.WithGeneration(&model.GenerationConfig{ + MaxTokens: floatPtr(512), + Temperature: floatPtr(1.0), + Stream: false, + }), + ), + ), +) +``` + +### 评估器 + +#### 工具轨迹评估器 + +工具轨迹评估器对应的指标名称为 `tool_trajectory_avg_score`,用于评估 Agent 在多次会话中对工具的使用是否符合预期。 + +在单次会话中,评估器会使用 `ToolTrajectoryCriterion` 对实际工具调用轨迹与预期轨迹进行比较: + +- 若整条工具调用轨迹满足评估准则,则该会话在此指标上的得分为 1。 +- 若任意一步调用不满足评估准则,则该会话在此指标上的得分为 0。 + +在多次会话的场景下,评估器会对所有会话在该指标上的得分取平均值,作为最终的 `tool_trajectory_avg_score`,并与 `EvalMetric.Threshold` 比较,得到通过/未通过的判定结果。 + +工具轨迹评估器与 Metric、Criterion 的典型组合方式如下: + +```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + ctooltrajectory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +evalMetric := &metric.EvalMetric{ + MetricName: "tool_trajectory_avg_score", + Threshold: 1.0, + Criterion: criterion.New( + criterion.WithToolTrajectory( + // 使用默认评估准则,工具的名称、参数和响应需严格一致 + ctooltrajectory.New(), + ), + ), +} +``` + +#### LLM 最终响应评估器 + +LLM 最终响应评估器对应的指标名称为 `llm_final_response`,通过评估模型判定 Agent 的最终回答是否有效。 + +评估逻辑: + +- 使用 `LLMCriterion` 的 `JudgeModel` 调用评估模型,按配置的 `NumSamples` 采样多次。 +- 评估模型需返回字段 `is_the_agent_response_valid`,取值为 `valid` 或 `invalid`(大小写不敏感);`valid` 记 1 分,`invalid` 记 0 分,其他结果或解析失败会报错。 +- 多次采样时按多数表决聚合,最终得分与 `EvalMetric.Threshold` 比较得到评估结论。 + +典型配置示例如下: + +```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + cllm "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/llm" + 
"trpc.group/trpc-go/trpc-agent-go/model" +) + +evalMetric := &metric.EvalMetric{ + MetricName: "llm_final_response", + Threshold: 0.9, + Criterion: criterion.New( + criterion.WithLLMJudge( + cllm.New( + "openai", + "gpt-4o", + cllm.WithNumSamples(3), + cllm.WithGeneration(&model.GenerationConfig{ + MaxTokens: ptr(512), + Temperature: ptr(1.0), + Stream: false, + }), + ), + ), + ), +} +``` + +评估提示词会包含用户输入、参考答案与 Agent 的最终回答,适用于自动化校验最终文本输出。 +其中 `ptr` 同样用于构造指针字段。 diff --git a/evaluation/evalresult/evalresult.go b/evaluation/evalresult/evalresult.go index da40fde7b..0650019ef 100644 --- a/evaluation/evalresult/evalresult.go +++ b/evaluation/evalresult/evalresult.go @@ -15,6 +15,7 @@ import ( "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" "trpc.group/trpc-go/trpc-agent-go/evaluation/status" ) @@ -63,8 +64,10 @@ type EvalMetricResult struct { EvalStatus status.EvalStatus `json:"evalStatus,omitempty"` // Threshold that was used. Threshold float64 `json:"threshold,omitempty"` + // Criterion contains the criterion used for this metric evaluation. + Criterion *criterion.Criterion `json:"criterion,omitempty"` // Details contains additional metric-specific information. - Details map[string]any `json:"details,omitempty"` + Details *EvalMetricResultDetails `json:"details,omitempty"` } // EvalMetricResultPerInvocation represents metric results for a single invocation. @@ -78,6 +81,17 @@ type EvalMetricResultPerInvocation struct { EvalMetricResults []*EvalMetricResult `json:"evalMetricResults,omitempty"` } +// ScoreResult represents the score and rationale for a single metric evaluation. +// It mirrors the schema used by ADK Web, with field names in camel to align with the JSON format. +type ScoreResult struct { + Score float64 `json:"score,omitempty"` +} + +// EvalMetricResultDetails contains additional metric-specific information. 
+// It mirrors the schema used by ADK Web, with field names in camel to align with the JSON format. +type EvalMetricResultDetails struct { +} + // Manager defines the interface for managing evaluation results. type Manager interface { // Save stores an evaluation result. diff --git a/evaluation/evalresult/evalresult_test.go b/evaluation/evalresult/evalresult_test.go index 79b54e8af..7614dfd45 100644 --- a/evaluation/evalresult/evalresult_test.go +++ b/evaluation/evalresult/evalresult_test.go @@ -33,10 +33,7 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { "metricName": "tool_trajectory_avg_score", "score": 0.9, "evalStatus": 1, - "threshold": 0.8, - "details": { - "comment": "trajectory matched" - } + "threshold": 0.8 } ], "evalMetricResultPerInvocation": [ @@ -131,10 +128,7 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { "metricName": "tool_trajectory_avg_score", "score": 0.9, "evalStatus": 1, - "threshold": 0.8, - "details": { - "comment": "per invocation matched" - } + "threshold": 0.8 } ] } @@ -169,7 +163,6 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { assert.Equal(t, 0.9, overallMetric.Score) assert.Equal(t, status.EvalStatusPassed, overallMetric.EvalStatus) assert.Equal(t, 0.8, overallMetric.Threshold) - assert.Equal(t, "trajectory matched", overallMetric.Details["comment"]) perInvocation := caseResult.EvalMetricResultPerInvocation[0] assert.NotNil(t, perInvocation.ActualInvocation) @@ -183,7 +176,6 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { assert.Equal(t, 0.9, perMetric.Score) assert.Equal(t, status.EvalStatusPassed, perMetric.EvalStatus) assert.Equal(t, 0.8, perMetric.Threshold) - assert.Equal(t, "per invocation matched", perMetric.Details["comment"]) encoded, marshalErr := json.Marshal(result) assert.NoError(t, marshalErr) diff --git a/evaluation/evalresult/local/local.go b/evaluation/evalresult/local/local.go index 9f87dd7a1..c914836bf 100644 --- a/evaluation/evalresult/local/local.go +++ 
b/evaluation/evalresult/local/local.go @@ -123,18 +123,21 @@ func (m *manager) evalSetResultPath(appName, evalSetResultID string) string { // load loads the EvalSetResult from the file system. func (m *manager) load(appName, evalSetResultID string) (*evalresult.EvalSetResult, error) { path := m.evalSetResultPath(appName, evalSetResultID) - f, err := os.Open(path) + data, err := os.ReadFile(path) if err != nil { return nil, fmt.Errorf("open file %s: %w", path, err) } - defer f.Close() - var payload string - if err := json.NewDecoder(f).Decode(&payload); err != nil { + var res evalresult.EvalSetResult + if err := json.Unmarshal(data, &res); err == nil { + return &res, nil + } + // Keep backward compatibility with legacy string-wrapped results. + var legacy string + if err := json.Unmarshal(data, &legacy); err != nil { return nil, fmt.Errorf("decode file %s: %w", path, err) } - var res evalresult.EvalSetResult - if err := json.Unmarshal([]byte(payload), &res); err != nil { - return nil, fmt.Errorf("unmarshal eval set result %s: %w", path, err) + if err := json.Unmarshal([]byte(legacy), &res); err != nil { + return nil, fmt.Errorf("decode legacy content in file %s: %w", path, err) } return &res, nil } @@ -154,13 +157,9 @@ func (m *manager) store(appName string, evalSetResult *evalresult.EvalSetResult) if err != nil { return fmt.Errorf("open file %s: %w", tmp, err) } - data, err := json.Marshal(evalSetResult) - if err != nil { - file.Close() - return fmt.Errorf("json marshal: %w", err) - } encoder := json.NewEncoder(file) - if err := encoder.Encode(string(data)); err != nil { + encoder.SetIndent("", " ") + if err := encoder.Encode(evalSetResult); err != nil { file.Close() os.Remove(tmp) return fmt.Errorf("encode file %s: %w", tmp, err) diff --git a/evaluation/evaluation.go b/evaluation/evaluation.go index cd69416fc..b80d8accc 100644 --- a/evaluation/evaluation.go +++ b/evaluation/evaluation.go @@ -20,6 +20,7 @@ import ( 
"trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/registry" istatus "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/status" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" "trpc.group/trpc-go/trpc-agent-go/evaluation/service" "trpc.group/trpc-go/trpc-agent-go/evaluation/service/local" "trpc.group/trpc-go/trpc-agent-go/evaluation/status" @@ -195,6 +196,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu count int score float64 threshold float64 + criterion *criterion.Criterion } // Group metrics results by metric name. aggregatedMetrics := make(map[string]*aggregatedMetric) @@ -208,6 +210,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu } aggregatedMetrics[metric.MetricName].count++ aggregatedMetrics[metric.MetricName].score += metric.Score + aggregatedMetrics[metric.MetricName].criterion = metric.Criterion } } // Aggregate metrics results by metric name. @@ -223,6 +226,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu Score: average, EvalStatus: evalStatus, Threshold: aggregatedMetric.threshold, + Criterion: aggregatedMetric.criterion, }) } status, err := istatus.SummarizeMetricsStatus(metricsResults) diff --git a/evaluation/evaluator/evaluator.go b/evaluation/evaluator/evaluator.go index db330cbb3..94cbe0f38 100644 --- a/evaluation/evaluator/evaluator.go +++ b/evaluation/evaluator/evaluator.go @@ -33,20 +33,20 @@ type Evaluator interface { // It mirrors the schema used by ADK Web, with field names in camel to align with the JSON format. type EvaluateResult struct { // OverallScore is the overall score for this evaluation. - OverallScore float64 `json:"overall_score,omitempty"` + OverallScore float64 `json:"overallScore,omitempty"` // OverallStatus represents pass/fail/not-evaluated for the evaluation run. 
- OverallStatus status.EvalStatus `json:"overall_status,omitempty"` + OverallStatus status.EvalStatus `json:"overallStatus,omitempty"` // PerInvocationResults contains results for each invocation. - PerInvocationResults []PerInvocationResult `json:"per_invocation_results,omitempty"` + PerInvocationResults []*PerInvocationResult `json:"perInvocationResults,omitempty"` } // PerInvocationResult represents the evaluation result for a single invocation. // It mirrors the schema used by ADK Web, with field names in camel to align with the JSON format. type PerInvocationResult struct { // ActualInvocation is the invocation generated by the agent. - ActualInvocation *evalset.Invocation `json:"actual_invocation,omitempty"` + ActualInvocation *evalset.Invocation `json:"actualInvocation,omitempty"` // ExpectedInvocation is the expected invocation. - ExpectedInvocation *evalset.Invocation `json:"expected_invocation,omitempty"` + ExpectedInvocation *evalset.Invocation `json:"expectedInvocation,omitempty"` // Score is the evaluator's score for this invocation. Score float64 `json:"score,omitempty"` // Status indicates the evaluation status of the invocation. diff --git a/evaluation/evaluator/llm/finalresponse/finalresponse.go b/evaluation/evaluator/llm/finalresponse/finalresponse.go new file mode 100644 index 000000000..5ea0a818d --- /dev/null +++ b/evaluation/evaluator/llm/finalresponse/finalresponse.go @@ -0,0 +1,250 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package finalresponse implements an LLM judge for final responses. 
+package finalresponse + +import ( + "bytes" + "context" + "fmt" + "regexp" + "strings" + "text/template" + + "google.golang.org/genai" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/llm" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/status" + "trpc.group/trpc-go/trpc-agent-go/model" +) + +var ( + // finalResponsePrompt is the template fed to the judge model. + finalResponsePrompt = `You are an expert rater for an AI agent. The AI agent is going to call an API to answer the user query and generate API tool use code based for the choice of the API and API arguments. The ideal model response should be a function call that fulfills user query, or a natural language response hedges or asks users for further clarification if a function call does not apply. + The primary focus of this rating task is to check correctness of the model responses. + + The data consists of: + - A user query. + - A model generated response for the prompt. The responses can consist of: + - Natural language, when the model is asking for clarification, or tells the user it does not possess the requested functionality / option. + - Code, in the form of one or multiple python function calls, and additional code as needed, for when the model is fulfilling the user request. + You can use the help from a reference response annotated by a human rater. This reference response is of high quality. You can compare the agent's response with the reference response and decide if the agent's response is valid. 
+ Note sometimes the reference response only contains the key entities of the correct answer and you need to be flexible to allow the agent response to contain more information than the reference response, or to present the key entities in a different format or structure or in shorter or longer format. + When the agent response is provided in the form of tables/dataframes or should be best provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response. Likewise, if you have the reference response, then find out the key entities and main components in them and check whether you can retrieve those from the agent response. If the prompt does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes. + + You should follow the constitutions below very carefully to rate the model response: + - Allow flexibility of format even when reference code only uses one of the possible format, unless API spec or user prompt has explicit format requirement + - e.g. For state name, allow both abbreviation and full name unless API spec has explicit requirement. e.g. both 'tx' and 'Texas' should be allowed in the agent response even when reference code only uses one of them. + - e.g. If a reference response list outputs in a list format, the agent response is allowed to use sentence format and vice versa unless user prompt explicitly asks for a specific format. + - e.g. For numbers, allow flexibility of formatting, e.g. 1000000 vs 1,000,000. + - The model shouldn't assume that it doesn't have access to according data or incapable of answering the question if reference response is able to find a legit answer. 
+ - If the model response contains the correct final answer, rate it as valid even when the model response contains more information than the reference response. + - If the user prompt has csv or other table format data, don't read it yourself. Trust the reference response final answer instead. + - When the validation needs maths, date calculations, do not use your own calculator. Trust the reference response final answer instead. + - Be mindful about unit of numbers. For example, if the reference response says 100 miles, but the model response says 100 km, it is invalid. + - When the agent response or the reference response is provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response and whether those match the reference response. If the user query does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes. + - When the answer is in numeric format, check whether there are any format requirements in the numeric format, rounding, precision, number of decimals, etc. specified in the user query and the prompt. If there are no such instructions, then tolerate different numerical formats. + - When the answer is in numeric format and there are rounding or precision differences between the agent response and the reference response, if no further instructions are provided evaluate if the rounding strategy or precision in the agent response follows the standards for that entity. For instance, model accuracy scores must be reported with at least two decimal places (e.g., 0.798 → 0.80 is acceptable, but 0.7 is not). 
+ + Below are the inputs: + { + "User prompt": {{.UserPrompt}}, + "Agent response": {{.ActualResponse}}, + "Reference response": {{.ExpectedResponse}}, + } + + The answer should be a json alone which follows the json structure below: + { + "reasoning": [reasoning], + "is_the_agent_response_valid": [valid or invalid], + } + Answer with assertiveness: + ` + // finalResponsePromptTemplate renders the judge prompt with data. + finalResponsePromptTemplate = template.Must(template.New("finalResponsePrompt").Parse(finalResponsePrompt)) + // labelMatchIsResponseValidRe extracts the validity label from judge output. + labelMatchIsResponseValidRe = regexp.MustCompile(`"is_the_agent_response_valid"\s*:\s*\[?\s*"?([A-Za-z_]+)"?\s*\]?`) +) + +// finalResponseEvaluator evaluates final responses via an LLM judge. +type finalResponseEvaluator struct { + llmBaseEvaluator llm.LLMEvaluator +} + +// New builds the final response evaluator. +func New() evaluator.Evaluator { + e := &finalResponseEvaluator{} + e.llmBaseEvaluator = llm.New(e) + return e +} + +// Name returns the evaluator identifier. +func (e *finalResponseEvaluator) Name() string { + return "llm_final_response" +} + +// Description describes the evaluator purpose. +func (e *finalResponseEvaluator) Description() string { + return "LLM judge for final responses" +} + +// Evaluate runs LLM-based evaluation on final responses. +func (e *finalResponseEvaluator) Evaluate(ctx context.Context, actuals, expecteds []*evalset.Invocation, + evalMetric *metric.EvalMetric) (*evaluator.EvaluateResult, error) { + return e.llmBaseEvaluator.Evaluate(ctx, actuals, expecteds, evalMetric) +} + +// ConstructMessages builds judge prompts from actual and expected responses. 
+func (e *finalResponseEvaluator) ConstructMessages(actual, expected *evalset.Invocation, + _ *metric.EvalMetric) ([]model.Message, error) { + data := finalResponsePromptData{ + UserPrompt: getTextFromContent(actual.UserContent), + ActualResponse: getTextFromContent(actual.FinalResponse), + ExpectedResponse: getTextFromContent(expected.FinalResponse), + } + var buf bytes.Buffer + if err := finalResponsePromptTemplate.Execute(&buf, data); err != nil { + return nil, fmt.Errorf("execute final response prompt template: %w", err) + } + return []model.Message{ + { + Role: model.RoleUser, + Content: buf.String(), + }, + }, nil +} + +// ScoreBasedOnResponse converts judge feedback to a numeric score. +func (e *finalResponseEvaluator) ScoreBasedOnResponse(response *model.Response, + _ *metric.EvalMetric) (*evalresult.ScoreResult, error) { + if len(response.Choices) == 0 { + return nil, fmt.Errorf("no choices in response") + } + responseText := response.Choices[0].Message.Content + if responseText == "" { + return nil, fmt.Errorf("empty response text") + } + label := extractLabel(responseText) + score := 0.0 + switch label { + case LabelValid: + score = 1.0 + case LabelInvalid: + score = 0.0 + default: + return nil, fmt.Errorf("unknown label: %v", label) + } + return &evalresult.ScoreResult{ + Score: score, + }, nil +} + +// AggregateSamples resolves multiple judge samples to one invocation result. 
+func (e *finalResponseEvaluator) AggregateSamples(samples []*evaluator.PerInvocationResult, + evalMetric *metric.EvalMetric) (*evaluator.PerInvocationResult, error) { + if len(samples) == 0 { + return nil, fmt.Errorf("no samples") + } + positiveResults := make([]*evaluator.PerInvocationResult, 0) + negativeResults := make([]*evaluator.PerInvocationResult, 0) + for _, sample := range samples { + if sample.Status == status.EvalStatusNotEvaluated { + continue + } + if sample.Score >= evalMetric.Threshold { + positiveResults = append(positiveResults, sample) + } else { + negativeResults = append(negativeResults, sample) + } + } + if len(positiveResults) == 0 && len(negativeResults) == 0 { + return samples[0], nil + } + if len(positiveResults) > len(negativeResults) { + return positiveResults[0], nil + } else { + return negativeResults[0], nil + } +} + +// AggregateInvocations summarizes per-invocation results into an overall score. +func (e *finalResponseEvaluator) AggregateInvocations(results []*evaluator.PerInvocationResult, + evalMetric *metric.EvalMetric) (*evaluator.EvaluateResult, error) { + sumScore := 0.0 + numEvaluated := 0.0 + for _, result := range results { + if result.Status == status.EvalStatusNotEvaluated { + continue + } + numEvaluated++ + sumScore += result.Score + } + if numEvaluated == 0 { + return &evaluator.EvaluateResult{ + OverallStatus: status.EvalStatusNotEvaluated, + }, nil + } + overallScore := sumScore / numEvaluated + overallStatus := status.EvalStatusPassed + if overallScore < evalMetric.Threshold { + overallStatus = status.EvalStatusFailed + } + return &evaluator.EvaluateResult{ + OverallScore: overallScore, + OverallStatus: overallStatus, + PerInvocationResults: results, + }, nil +} + +// finalResponsePromptData feeds values into the judge prompt template. +type finalResponsePromptData struct { + UserPrompt string // UserPrompt is the original user prompt text. 
+ ActualResponse string // ActualResponse is the agent response to be judged. + ExpectedResponse string // ExpectedResponse is the reference response for comparison. +} + +// getTextFromContent extracts plain text from genai content. +func getTextFromContent(content *genai.Content) string { + if content == nil { + return "" + } + var text strings.Builder + for _, part := range content.Parts { + text.WriteString(part.Text) + } + return text.String() +} + +// Label captures the validity category returned by the judge. +type Label string + +const ( + LabelValid Label = "valid" // LabelValid marks a valid agent response. + LabelInvalid Label = "invalid" // LabelInvalid marks an invalid agent response. +) + +// extractLabel extracts the validity label from the judge response. +func extractLabel(response string) Label { + match := labelMatchIsResponseValidRe.FindStringSubmatch(response) + if len(match) < 1 { + return LabelInvalid + } + label := strings.TrimSpace(match[1]) + switch strings.ToLower(label) { + case string(LabelValid): + return LabelValid + case string(LabelInvalid): + return LabelInvalid + } + return Label(label) +} diff --git a/evaluation/evaluator/llm/finalresponse/finalresponse_test.go b/evaluation/evaluator/llm/finalresponse/finalresponse_test.go new file mode 100644 index 000000000..ffde9d5dd --- /dev/null +++ b/evaluation/evaluator/llm/finalresponse/finalresponse_test.go @@ -0,0 +1,234 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package finalresponse + +import ( + "bytes" + "context" + "fmt" + "testing" + "text/template" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/genai" + + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/status" + "trpc.group/trpc-go/trpc-agent-go/model" +) + +func TestConstructMessages(t *testing.T) { + ev := &finalResponseEvaluator{} + actual := &evalset.Invocation{ + UserContent: &genai.Content{Parts: []*genai.Part{{Text: "user?"}}}, + FinalResponse: &genai.Content{Parts: []*genai.Part{ + {Text: "actual"}, + }}, + } + expected := &evalset.Invocation{ + FinalResponse: &genai.Content{Parts: []*genai.Part{ + {Text: "expected"}, + }}, + } + msgs, err := ev.ConstructMessages(actual, expected, nil) + require.NoError(t, err) + require.Len(t, msgs, 1) + assert.Equal(t, model.RoleUser, msgs[0].Role) + assert.Contains(t, msgs[0].Content, "user?") + assert.Contains(t, msgs[0].Content, "actual") + assert.Contains(t, msgs[0].Content, "expected") +} + +func TestConstructMessagesTemplateError(t *testing.T) { + original := finalResponsePromptTemplate + t.Cleanup(func() { finalResponsePromptTemplate = original }) + finalResponsePromptTemplate = template.Must(template.New("err").Funcs(template.FuncMap{ + "fail": func() (string, error) { return "", fmt.Errorf("boom") }, + }).Parse(`{{fail}}`)) + + ev := &finalResponseEvaluator{} + _, err := ev.ConstructMessages(&evalset.Invocation{}, &evalset.Invocation{}, nil) + require.Error(t, err) +} + +func TestScoreBasedOnResponse(t *testing.T) { + ev := &finalResponseEvaluator{} + validResp := &model.Response{ + Choices: []model.Choice{ + { + Message: model.Message{ + Content: `{"is_the_agent_response_valid":"VALID"}`, + }, + }, + }, + } + score, err := 
ev.ScoreBasedOnResponse(validResp, nil) + require.NoError(t, err) + require.NotNil(t, score.Score) + assert.Equal(t, 1.0, score.Score) + + invalidResp := &model.Response{ + Choices: []model.Choice{ + { + Message: model.Message{ + Content: `{"is_the_agent_response_valid":"INVALID"}`, + }, + }, + }, + } + score, err = ev.ScoreBasedOnResponse(invalidResp, nil) + require.NoError(t, err) + require.NotNil(t, score.Score) + assert.Equal(t, 0.0, score.Score) + + unknownResp := &model.Response{ + Choices: []model.Choice{ + { + Message: model.Message{ + Content: `{"is_the_agent_response_valid":"UNKNOWN"}`, + }, + }, + }, + } + _, err = ev.ScoreBasedOnResponse(unknownResp, nil) + require.Error(t, err) + + emptyChoices := &model.Response{Choices: []model.Choice{}} + _, err = ev.ScoreBasedOnResponse(emptyChoices, nil) + require.Error(t, err) + + emptyContent := &model.Response{ + Choices: []model.Choice{ + { + Message: model.Message{Content: ""}, + }, + }, + } + _, err = ev.ScoreBasedOnResponse(emptyContent, nil) + require.Error(t, err) +} + +func TestAggregateSamples(t *testing.T) { + ev := &finalResponseEvaluator{} + evalMetric := &metric.EvalMetric{Threshold: 0.5} + positive := &evaluator.PerInvocationResult{Score: 1, Status: status.EvalStatusPassed} + negative := &evaluator.PerInvocationResult{Score: 0, Status: status.EvalStatusFailed} + + result, err := ev.AggregateSamples([]*evaluator.PerInvocationResult{positive, negative, positive}, evalMetric) + require.NoError(t, err) + assert.Equal(t, positive, result) + + result, err = ev.AggregateSamples([]*evaluator.PerInvocationResult{negative, negative, positive}, evalMetric) + require.NoError(t, err) + assert.Equal(t, negative, result) + + // No samples returns error. + _, err = ev.AggregateSamples([]*evaluator.PerInvocationResult{}, evalMetric) + require.Error(t, err) + + // Mixed but empty positive/negative (NotEvaluated) falls back to first sample. 
+ result, err = ev.AggregateSamples([]*evaluator.PerInvocationResult{ + {Score: 0, Status: status.EvalStatusNotEvaluated}, + }, evalMetric) + require.NoError(t, err) + assert.Equal(t, status.EvalStatusNotEvaluated, result.Status) +} + +func TestAggregateInvocations(t *testing.T) { + ev := &finalResponseEvaluator{} + evalMetric := &metric.EvalMetric{Threshold: 0.6} + results := []*evaluator.PerInvocationResult{ + {Score: 1, Status: status.EvalStatusPassed}, + {Score: 0, Status: status.EvalStatusFailed}, + {Score: 0, Status: status.EvalStatusNotEvaluated}, + } + agg, err := ev.AggregateInvocations(results, evalMetric) + require.NoError(t, err) + assert.InDelta(t, 0.5, agg.OverallScore, 1e-9) + assert.Equal(t, status.EvalStatusFailed, agg.OverallStatus) + assert.Equal(t, results, agg.PerInvocationResults) + + agg, err = ev.AggregateInvocations([]*evaluator.PerInvocationResult{ + {Score: 0, Status: status.EvalStatusNotEvaluated}, + }, evalMetric) + require.NoError(t, err) + assert.Equal(t, status.EvalStatusNotEvaluated, agg.OverallStatus) +} + +func TestGetTextFromContent(t *testing.T) { + var buf bytes.Buffer + buf.WriteString(getTextFromContent(nil)) + assert.Equal(t, "", buf.String()) + + content := &genai.Content{Parts: []*genai.Part{{Text: "hello "}, {Text: "world"}}} + assert.Equal(t, "hello world", getTextFromContent(content)) +} + +func TestExtractLabel(t *testing.T) { + assert.Equal(t, LabelValid, extractLabel(`"is_the_agent_response_valid":"VALID"`)) + assert.Equal(t, LabelInvalid, extractLabel(`"is_the_agent_response_valid":"INVALID"`)) + assert.Equal(t, LabelInvalid, extractLabel(`no label`)) + assert.Equal(t, Label("UNKNOWN"), extractLabel(`"is_the_agent_response_valid":"UNKNOWN"`)) +} + +type stubLLMBase struct { + evaluateCalled bool + result *evaluator.EvaluateResult +} + +func (s *stubLLMBase) Name() string { return "stub" } + +func (s *stubLLMBase) Description() string { return "stub desc" } + +func (s *stubLLMBase) Evaluate(_ context.Context, _ 
[]*evalset.Invocation, _ []*evalset.Invocation, + _ *metric.EvalMetric) (*evaluator.EvaluateResult, error) { + s.evaluateCalled = true + return s.result, nil +} + +func (s *stubLLMBase) ConstructMessages(*evalset.Invocation, *evalset.Invocation, + *metric.EvalMetric) ([]model.Message, error) { + return nil, nil +} + +func (s *stubLLMBase) ScoreBasedOnResponse(*model.Response, *metric.EvalMetric) (*evalresult.ScoreResult, error) { + return nil, nil +} + +func (s *stubLLMBase) AggregateSamples([]*evaluator.PerInvocationResult, + *metric.EvalMetric) (*evaluator.PerInvocationResult, error) { + return nil, nil +} + +func (s *stubLLMBase) AggregateInvocations([]*evaluator.PerInvocationResult, + *metric.EvalMetric) (*evaluator.EvaluateResult, error) { + return s.result, nil +} + +func TestFinalResponseEvaluator_ConstructorsAndEvaluate(t *testing.T) { + ev := New() + evaluatorImpl, ok := ev.(*finalResponseEvaluator) + require.True(t, ok) + + // Override base to avoid calling real LLM flow. + stub := &stubLLMBase{result: &evaluator.EvaluateResult{OverallStatus: status.EvalStatusPassed}} + evaluatorImpl.llmBaseEvaluator = stub + + got, err := evaluatorImpl.Evaluate(context.Background(), nil, nil, nil) + require.NoError(t, err) + assert.True(t, stub.evaluateCalled) + assert.Equal(t, status.EvalStatusPassed, got.OverallStatus) + assert.Equal(t, "llm_final_response", evaluatorImpl.Name()) + assert.Equal(t, "LLM judge for final responses", evaluatorImpl.Description()) +} diff --git a/evaluation/evaluator/llm/llm.go b/evaluation/evaluator/llm/llm.go new file mode 100644 index 000000000..b51ffc523 --- /dev/null +++ b/evaluation/evaluator/llm/llm.go @@ -0,0 +1,173 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package llm provides base helpers for LLM-backed evaluators. 
+package llm + +import ( + "context" + "fmt" + + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/status" + "trpc.group/trpc-go/trpc-agent-go/model" + "trpc.group/trpc-go/trpc-agent-go/model/provider" +) + +// LLMEvaluator defines the LLM-backed evaluator contract. +type LLMEvaluator interface { + evaluator.Evaluator + // ConstructMessages builds prompts for the judge model. + ConstructMessages(actual, expected *evalset.Invocation, evalMetric *metric.EvalMetric) ([]model.Message, error) + // ScoreBasedOnResponse extracts a score from the judge response. + ScoreBasedOnResponse(resp *model.Response, evalMetric *metric.EvalMetric) (*evalresult.ScoreResult, error) + // AggregateSamples summarizes multiple sample scores for one invocation. + AggregateSamples(samples []*evaluator.PerInvocationResult, + evalMetric *metric.EvalMetric) (*evaluator.PerInvocationResult, error) + // AggregateInvocations aggregates per-invocation results into the final evaluation. + AggregateInvocations(results []*evaluator.PerInvocationResult, + evalMetric *metric.EvalMetric) (*evaluator.EvaluateResult, error) +} + +// LLMBaseEvaluator hosts shared orchestration logic for LLM evaluators. +type LLMBaseEvaluator struct { + LLMEvaluator LLMEvaluator // LLMEvaluator is the concrete LLM evaluator implementation. +} + +// New constructs an LLMBaseEvaluator wrapper around the concrete evaluator. +func New(llmEvaluator LLMEvaluator) LLMEvaluator { + return &LLMBaseEvaluator{LLMEvaluator: llmEvaluator} +} + +// Name returns the evaluator name. +func (r *LLMBaseEvaluator) Name() string { + return "llm_base_evaluator" +} + +// Description describes the evaluator. 
+func (r *LLMBaseEvaluator) Description() string { + return "Base evaluator for LLM judge" +} + +// Evaluate runs the judge model over paired invocations and aggregates results. +func (r *LLMBaseEvaluator) Evaluate(ctx context.Context, actuals, expecteds []*evalset.Invocation, + evalMetric *metric.EvalMetric) (*evaluator.EvaluateResult, error) { + if evalMetric == nil || + evalMetric.Criterion == nil || + evalMetric.Criterion.LLMJudge == nil || + evalMetric.Criterion.LLMJudge.JudgeModel == nil || + evalMetric.Criterion.LLMJudge.JudgeModel.Generation == nil { + return nil, fmt.Errorf("missing required fields in eval metric") + } + if evalMetric.Criterion.LLMJudge.JudgeModel.NumSamples <= 0 { + return nil, fmt.Errorf("num samples must be greater than 0") + } + if len(actuals) != len(expecteds) { + return nil, fmt.Errorf("actual invocations (%d) and expected invocations (%d) count mismatch", + len(actuals), len(expecteds)) + } + results := make([]*evaluator.PerInvocationResult, 0, len(actuals)) + for i := range actuals { + actual := actuals[i] + expected := expecteds[i] + messages, err := r.ConstructMessages(actual, expected, evalMetric) + if err != nil { + return nil, fmt.Errorf("construct messages: %w", err) + } + numSamples := evalMetric.Criterion.LLMJudge.JudgeModel.NumSamples + samples := make([]*evaluator.PerInvocationResult, 0, numSamples) + for range numSamples { + response, err := judgeModelResponse(ctx, messages, evalMetric) + if err != nil { + return nil, fmt.Errorf("judge model response: %w", err) + } + score, err := r.ScoreBasedOnResponse(response, evalMetric) + if err != nil { + return nil, fmt.Errorf("score based on response: %w", err) + } + evalStatus := status.EvalStatusPassed + if score.Score < evalMetric.Threshold { + evalStatus = status.EvalStatusFailed + } + samples = append(samples, &evaluator.PerInvocationResult{ + ActualInvocation: actual, + ExpectedInvocation: expected, + Score: score.Score, + Status: evalStatus, + }) + } + 
perInvocationResult, err := r.AggregateSamples(samples, evalMetric) + if err != nil { + return nil, fmt.Errorf("aggregate samples: %w", err) + } + results = append(results, perInvocationResult) + } + return r.AggregateInvocations(results, evalMetric) +} + +// AggregateInvocations delegates invocation aggregation to the concrete evaluator. +func (r *LLMBaseEvaluator) AggregateInvocations(results []*evaluator.PerInvocationResult, + evalMetric *metric.EvalMetric) (*evaluator.EvaluateResult, error) { + return r.LLMEvaluator.AggregateInvocations(results, evalMetric) +} + +// AggregateSamples delegates sample aggregation to the concrete evaluator. +func (r *LLMBaseEvaluator) AggregateSamples(samples []*evaluator.PerInvocationResult, + evalMetric *metric.EvalMetric) (*evaluator.PerInvocationResult, error) { + return r.LLMEvaluator.AggregateSamples(samples, evalMetric) +} + +// ScoreBasedOnResponse delegates response scoring to the concrete evaluator. +func (r *LLMBaseEvaluator) ScoreBasedOnResponse(resp *model.Response, + evalMetric *metric.EvalMetric) (*evalresult.ScoreResult, error) { + return r.LLMEvaluator.ScoreBasedOnResponse(resp, evalMetric) +} + +// ConstructMessages delegates prompt construction to the concrete evaluator. +func (r *LLMBaseEvaluator) ConstructMessages(actual, expected *evalset.Invocation, + evalMetric *metric.EvalMetric) ([]model.Message, error) { + return r.LLMEvaluator.ConstructMessages(actual, expected, evalMetric) +} + +// judgeModelResponse calls the judge model and returns the final response. 
+func judgeModelResponse(ctx context.Context, messages []model.Message, + evalMetric *metric.EvalMetric) (*model.Response, error) { + judgeModel := evalMetric.Criterion.LLMJudge.JudgeModel + req := model.Request{ + Messages: messages, + GenerationConfig: *judgeModel.Generation, + } + req.GenerationConfig.Stream = false + modelInstance, err := provider.Model( + judgeModel.ProviderName, + judgeModel.ModelName, + provider.WithAPIKey(judgeModel.APIKey), + provider.WithBaseURL(judgeModel.BaseURL), + provider.WithExtraFields(judgeModel.ExtraFields), + ) + if err != nil { + return nil, fmt.Errorf("create model instance: %w", err) + } + responses, err := modelInstance.GenerateContent(ctx, &req) + if err != nil { + return nil, fmt.Errorf("generate response: %w", err) + } + for response := range responses { + if response.Error != nil { + return nil, fmt.Errorf("response error: %v", response.Error) + } + if response.IsFinalResponse() { + return response, nil + } + } + return nil, fmt.Errorf("no final response") +} diff --git a/evaluation/evaluator/llm/llm_test.go b/evaluation/evaluator/llm/llm_test.go new file mode 100644 index 000000000..7547b29fe --- /dev/null +++ b/evaluation/evaluator/llm/llm_test.go @@ -0,0 +1,345 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package llm + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/llm" + "trpc.group/trpc-go/trpc-agent-go/evaluation/status" + "trpc.group/trpc-go/trpc-agent-go/model" + "trpc.group/trpc-go/trpc-agent-go/model/provider" +) + +type fakeLLMEvaluator struct { + constructMessagesCalled int + scoreBasedOnResponseCalls int + aggregateSamplesCalls int + aggregateInvocationsCalls int + receivedSamples []*evaluator.PerInvocationResult + receivedInvocations []*evaluator.PerInvocationResult +} + +func (f *fakeLLMEvaluator) Name() string { return "fake" } + +func (f *fakeLLMEvaluator) Description() string { return "fake desc" } + +func (f *fakeLLMEvaluator) Evaluate(_ context.Context, _ []*evalset.Invocation, _ []*evalset.Invocation, + _ *metric.EvalMetric) (*evaluator.EvaluateResult, error) { + return nil, nil +} + +func (f *fakeLLMEvaluator) ConstructMessages(actual, expected *evalset.Invocation, + _ *metric.EvalMetric) ([]model.Message, error) { + f.constructMessagesCalled++ + return []model.Message{{ + Role: "user", + Content: actual.InvocationID + expected.InvocationID, + }}, nil +} + +func (f *fakeLLMEvaluator) ScoreBasedOnResponse(_ *model.Response, + _ *metric.EvalMetric) (*evalresult.ScoreResult, error) { + f.scoreBasedOnResponseCalls++ + score := 0.9 + return &evalresult.ScoreResult{Score: score}, nil +} + +func (f *fakeLLMEvaluator) AggregateSamples(samples []*evaluator.PerInvocationResult, + _ *metric.EvalMetric) (*evaluator.PerInvocationResult, error) { + f.aggregateSamplesCalls++ + f.receivedSamples = samples + return 
&evaluator.PerInvocationResult{ + Score: samples[0].Score, + Status: samples[0].Status, + }, nil +} + +func (f *fakeLLMEvaluator) AggregateInvocations(results []*evaluator.PerInvocationResult, + _ *metric.EvalMetric) (*evaluator.EvaluateResult, error) { + f.aggregateInvocationsCalls++ + f.receivedInvocations = results + return &evaluator.EvaluateResult{ + OverallScore: results[0].Score, + OverallStatus: results[0].Status, + PerInvocationResults: results, + }, nil +} + +type fakeModel struct { + responses []*model.Response + err error +} + +func (f *fakeModel) GenerateContent(_ context.Context, _ *model.Request) (<-chan *model.Response, error) { + if f.err != nil { + return nil, f.err + } + ch := make(chan *model.Response, len(f.responses)) + for _, rsp := range f.responses { + ch <- rsp + } + close(ch) + return ch, nil +} + +func (f *fakeModel) Info() model.Info { + return model.Info{Name: "fake"} +} + +func buildEvalMetric(providerName string, numSamples int) *metric.EvalMetric { + return &metric.EvalMetric{ + Threshold: 0.5, + Criterion: &criterion.Criterion{ + LLMJudge: &llm.LLMCriterion{ + JudgeModel: &llm.JudgeModelOptions{ + ProviderName: providerName, + ModelName: "fake-model", + NumSamples: numSamples, + Generation: &model.GenerationConfig{}, + }, + }, + }, + } +} + +func TestLLMBaseEvaluator_EvaluateSuccess(t *testing.T) { + provider.Register("llm-test-provider", func(_ *provider.Options) (model.Model, error) { + return &fakeModel{responses: []*model.Response{{ + Choices: []model.Choice{{Message: model.Message{Content: "ok"}}}, + Done: true, + }}}, nil + }) + stub := &fakeLLMEvaluator{} + base := &LLMBaseEvaluator{LLMEvaluator: stub} + evalMetric := buildEvalMetric("llm-test-provider", 1) + actual := &evalset.Invocation{InvocationID: "a"} + expected := &evalset.Invocation{InvocationID: "b"} + + res, err := base.Evaluate( + context.Background(), + []*evalset.Invocation{actual}, + []*evalset.Invocation{expected}, + evalMetric, + ) + require.NoError(t, err) + 
require.NotNil(t, res) + assert.Equal(t, 1, stub.constructMessagesCalled) + assert.Equal(t, 1, stub.scoreBasedOnResponseCalls) + assert.Equal(t, 1, stub.aggregateSamplesCalls) + assert.Equal(t, 1, stub.aggregateInvocationsCalls) + require.Len(t, stub.receivedSamples, 1) + assert.Equal(t, actual, stub.receivedSamples[0].ActualInvocation) + assert.Equal(t, expected, stub.receivedSamples[0].ExpectedInvocation) + require.Len(t, stub.receivedInvocations, 1) + assert.Equal(t, stub.receivedSamples[0].Score, stub.receivedInvocations[0].Score) +} + +func TestLLMBaseEvaluator_EvaluateValidationErrors(t *testing.T) { + stub := &fakeLLMEvaluator{} + base := &LLMBaseEvaluator{LLMEvaluator: stub} + + _, err := base.Evaluate(context.Background(), nil, nil, nil) + require.Error(t, err) + + evalMetric := buildEvalMetric("provider", 0) + _, err = base.Evaluate(context.Background(), nil, nil, evalMetric) + require.Error(t, err) + + evalMetric = buildEvalMetric("provider", 1) + _, err = base.Evaluate( + context.Background(), + []*evalset.Invocation{{}}, + []*evalset.Invocation{}, + evalMetric, + ) + require.Error(t, err) +} + +type scriptedLLMEvaluator struct { + constructErr error + scoreErr error + scoreValue float64 + aggregateSamplesErr error +} + +func (s *scriptedLLMEvaluator) Name() string { return "scripted" } + +func (s *scriptedLLMEvaluator) Description() string { return "scripted" } + +func (s *scriptedLLMEvaluator) Evaluate(context.Context, []*evalset.Invocation, []*evalset.Invocation, + *metric.EvalMetric) (*evaluator.EvaluateResult, error) { + return nil, nil +} + +func (s *scriptedLLMEvaluator) ConstructMessages(*evalset.Invocation, *evalset.Invocation, + *metric.EvalMetric) ([]model.Message, error) { + if s.constructErr != nil { + return nil, s.constructErr + } + return []model.Message{{Role: "user", Content: "prompt"}}, nil +} + +func (s *scriptedLLMEvaluator) ScoreBasedOnResponse(*model.Response, + *metric.EvalMetric) (*evalresult.ScoreResult, error) { + if 
s.scoreErr != nil { + return nil, s.scoreErr + } + score := s.scoreValue + return &evalresult.ScoreResult{Score: score}, nil +} + +func (s *scriptedLLMEvaluator) AggregateSamples(samples []*evaluator.PerInvocationResult, + _ *metric.EvalMetric) (*evaluator.PerInvocationResult, error) { + if s.aggregateSamplesErr != nil { + return nil, s.aggregateSamplesErr + } + return &evaluator.PerInvocationResult{ + ActualInvocation: samples[0].ActualInvocation, + ExpectedInvocation: samples[0].ExpectedInvocation, + Score: samples[0].Score, + Status: samples[0].Status, + }, nil +} + +func (s *scriptedLLMEvaluator) AggregateInvocations(results []*evaluator.PerInvocationResult, + _ *metric.EvalMetric) (*evaluator.EvaluateResult, error) { + return &evaluator.EvaluateResult{ + OverallScore: results[0].Score, + OverallStatus: results[0].Status, + PerInvocationResults: results, + }, nil +} + +func TestLLMBaseEvaluator_ErrorPaths(t *testing.T) { + provider.Register("llm-test-provider", func(_ *provider.Options) (model.Model, error) { + return &fakeModel{responses: []*model.Response{{ + Choices: []model.Choice{{Message: model.Message{Content: "ok"}}}, + Done: true, + }}}, nil + }) + evalMetric := buildEvalMetric("llm-test-provider", 1) + base := &LLMBaseEvaluator{LLMEvaluator: &scriptedLLMEvaluator{constructErr: assert.AnError}} + _, err := base.Evaluate( + context.Background(), + []*evalset.Invocation{{}}, + []*evalset.Invocation{{}}, + evalMetric, + ) + require.Error(t, err) + + base.LLMEvaluator = &scriptedLLMEvaluator{scoreErr: assert.AnError, scoreValue: 1} + _, err = base.Evaluate( + context.Background(), + []*evalset.Invocation{{}}, + []*evalset.Invocation{{}}, + evalMetric, + ) + require.Error(t, err) + + base.LLMEvaluator = &scriptedLLMEvaluator{aggregateSamplesErr: assert.AnError, scoreValue: 1} + _, err = base.Evaluate( + context.Background(), + []*evalset.Invocation{{}}, + []*evalset.Invocation{{}}, + evalMetric, + ) + require.Error(t, err) +} + +func 
TestLLMBaseEvaluator_ScoreBelowThreshold(t *testing.T) { + provider.Register("llm-low-score-provider", func(_ *provider.Options) (model.Model, error) { + return &fakeModel{responses: []*model.Response{{ + Choices: []model.Choice{{Message: model.Message{Content: "ok"}}}, + Done: true, + }}}, nil + }) + base := &LLMBaseEvaluator{LLMEvaluator: &scriptedLLMEvaluator{scoreValue: 0}} + evalMetric := buildEvalMetric("llm-low-score-provider", 1) + actual := &evalset.Invocation{InvocationID: "a"} + expected := &evalset.Invocation{InvocationID: "b"} + + res, err := base.Evaluate( + context.Background(), + []*evalset.Invocation{actual}, + []*evalset.Invocation{expected}, + evalMetric, + ) + require.NoError(t, err) + require.Len(t, res.PerInvocationResults, 1) + assert.Equal(t, status.EvalStatusFailed, res.PerInvocationResults[0].Status) +} + +func TestJudgeModelResponse_UnknownProvider(t *testing.T) { + evalMetric := buildEvalMetric("unknown-provider", 1) + _, err := judgeModelResponse(context.Background(), []model.Message{}, evalMetric) + require.Error(t, err) +} + +func TestLLMBaseEvaluator_JudgeModelError(t *testing.T) { + base := &LLMBaseEvaluator{LLMEvaluator: &scriptedLLMEvaluator{scoreValue: 1}} + evalMetric := buildEvalMetric("unknown-provider", 1) + _, err := base.Evaluate( + context.Background(), + []*evalset.Invocation{{}}, + []*evalset.Invocation{{}}, + evalMetric, + ) + require.Error(t, err) +} + +func TestJudgeModelResponseErrors(t *testing.T) { + provider.Register("llm-error-provider", func(_ *provider.Options) (model.Model, error) { + return &fakeModel{err: assert.AnError}, nil + }) + evalMetric := buildEvalMetric("llm-error-provider", 1) + _, err := judgeModelResponse(context.Background(), []model.Message{}, evalMetric) + require.Error(t, err) + + provider.Register("llm-response-error-provider", func(_ *provider.Options) (model.Model, error) { + return &fakeModel{responses: []*model.Response{{ + Error: &model.ResponseError{Message: "bad"}, + Done: true, + 
}}}, nil + }) + evalMetric = buildEvalMetric("llm-response-error-provider", 1) + _, err = judgeModelResponse(context.Background(), []model.Message{}, evalMetric) + require.Error(t, err) + + provider.Register("llm-no-final-provider", func(_ *provider.Options) (model.Model, error) { + return &fakeModel{responses: []*model.Response{}}, nil + }) + evalMetric = buildEvalMetric("llm-no-final-provider", 1) + _, err = judgeModelResponse(context.Background(), []model.Message{}, evalMetric) + require.Error(t, err) +} + +func TestLLMBaseEvaluator_NameDescription(t *testing.T) { + base := &LLMBaseEvaluator{} + assert.Equal(t, "llm_base_evaluator", base.Name()) + assert.Equal(t, "Base evaluator for LLM judge", base.Description()) +} + +func TestLLMBaseEvaluator_New(t *testing.T) { + stub := &fakeLLMEvaluator{} + res := New(stub) + base, ok := res.(*LLMBaseEvaluator) + require.True(t, ok) + assert.Equal(t, stub, base.LLMEvaluator) +} diff --git a/evaluation/evaluator/registry/registry.go b/evaluation/evaluator/registry/registry.go index 026f20493..90b123094 100644 --- a/evaluation/evaluator/registry/registry.go +++ b/evaluation/evaluator/registry/registry.go @@ -18,6 +18,7 @@ import ( "sync" "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/llm/finalresponse" "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/tooltrajectory" ) @@ -44,6 +45,8 @@ func New() Registry { } toolTrajectory := tooltrajectory.New() r.Register(toolTrajectory.Name(), toolTrajectory) + finalResponse := finalresponse.New() + r.Register(finalResponse.Name(), finalResponse) return r } diff --git a/evaluation/evaluator/tooltrajectory/tooltrajectory.go b/evaluation/evaluator/tooltrajectory/tooltrajectory.go index 0b5d56a49..1d783b1fc 100644 --- a/evaluation/evaluator/tooltrajectory/tooltrajectory.go +++ b/evaluation/evaluator/tooltrajectory/tooltrajectory.go @@ -13,13 +13,13 @@ package tooltrajectory import ( "context" "fmt" - "reflect" - 
"google.golang.org/genai" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + ctooltrajectory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" "trpc.group/trpc-go/trpc-agent-go/evaluation/status" + "trpc.group/trpc-go/trpc-agent-go/log" ) // toolTrajectoryEvaluator is a tool trajectory evaluator implementation for evaluator. @@ -48,19 +48,20 @@ func (e *toolTrajectoryEvaluator) Evaluate(ctx context.Context, actuals, expecte return nil, fmt.Errorf("tooltrajectory: actual invocations (%d) and expected invocations (%d) count mismatch", len(actuals), len(expecteds)) } - perInvocation := make([]evaluator.PerInvocationResult, 0, len(actuals)) + perInvocation := make([]*evaluator.PerInvocationResult, 0, len(actuals)) var totalScore float64 for i := range len(actuals) { actual := actuals[i] expected := expecteds[i] - actualCalls := getToolCalls(actual) - expectedCalls := getToolCalls(expected) score := 0.0 - if toolCallsEqual(actualCalls, expectedCalls) { + ok, err := toolCallsMatch(actual, expected, evalMetric.Criterion.ToolTrajectory) + if err != nil { + log.Errorf("tool trajectory mismatch: %v", err) + } else if ok { score = 1.0 } status := e.statusForScore(score, evalMetric) - perInvocation = append(perInvocation, evaluator.PerInvocationResult{ + perInvocation = append(perInvocation, &evaluator.PerInvocationResult{ ActualInvocation: actual, ExpectedInvocation: expected, Score: score, @@ -88,24 +89,14 @@ func (e *toolTrajectoryEvaluator) statusForScore(score float64, evalMetric *metr return status.EvalStatusFailed } -func getToolCalls(invocation *evalset.Invocation) []*genai.FunctionCall { - if invocation == nil || invocation.IntermediateData == nil { - return nil +func toolCallsMatch(actual, expected *evalset.Invocation, + criterion *ctooltrajectory.ToolTrajectoryCriterion) (bool, error) { + if criterion == nil { + return false, 
fmt.Errorf("criterion is nil") } - return invocation.IntermediateData.ToolUses -} - -func toolCallsEqual(actual, expected []*genai.FunctionCall) bool { - if len(actual) != len(expected) { - return false - } - for i := range actual { - if actual[i].Name != expected[i].Name { - return false - } - if !reflect.DeepEqual(actual[i].Args, expected[i].Args) { - return false - } + ok, err := criterion.Match(actual, expected) + if err != nil { + return false, fmt.Errorf("tool trajectory mismatch: %w", err) } - return true + return ok, nil } diff --git a/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go b/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go index 5e826a283..086194ffc 100644 --- a/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go +++ b/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go @@ -10,89 +10,39 @@ package tooltrajectory import ( - "context" + "encoding/json" "testing" "github.com/stretchr/testify/assert" - "google.golang.org/genai" - "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" - "trpc.group/trpc-go/trpc-agent-go/evaluation/status" + criterionjson "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" ) -func TestToolTrajectoryEvaluateSuccess(t *testing.T) { - inst := New() - assert.NotEmpty(t, inst.Description()) - assert.Equal(t, "tool_trajectory_avg_score", inst.Name()) - - e := inst.(*toolTrajectoryEvaluator) - actual := makeInvocation([]*genai.FunctionCall{ - {Name: "lookup", Args: map[string]any{"id": 1}}, - }) - expected := makeInvocation([]*genai.FunctionCall{ - {Name: "lookup", Args: map[string]any{"id": 1}}, - }) - - result, err := e.Evaluate(context.Background(), []*evalset.Invocation{actual}, []*evalset.Invocation{expected}, &metric.EvalMetric{Threshold: 0.5}) - assert.NoError(t, err) - 
assert.Equal(t, 1.0, result.OverallScore) - assert.Equal(t, status.EvalStatusPassed, result.OverallStatus) - assert.Len(t, result.PerInvocationResults, 1) - assert.Equal(t, actual, result.PerInvocationResults[0].ActualInvocation) - assert.Equal(t, expected, result.PerInvocationResults[0].ExpectedInvocation) - assert.Equal(t, status.EvalStatusPassed, result.PerInvocationResults[0].Status) -} - -func TestToolTrajectoryEvaluateMismatch(t *testing.T) { - e := New().(*toolTrajectoryEvaluator) - _, err := e.Evaluate(context.Background(), []*evalset.Invocation{}, []*evalset.Invocation{makeInvocation(nil)}, &metric.EvalMetric{Threshold: 1}) - assert.Error(t, err) - assert.Contains(t, err.Error(), "count mismatch") -} - -func TestToolTrajectoryEvaluateFailureStatus(t *testing.T) { - e := New().(*toolTrajectoryEvaluator) - actual := makeInvocation([]*genai.FunctionCall{ - {Name: "lookup", Args: map[string]any{"id": 1}}, - }) - expected := makeInvocation([]*genai.FunctionCall{ - {Name: "lookup", Args: map[string]any{"id": 2}}, - }) - - result, err := e.Evaluate(context.Background(), []*evalset.Invocation{actual}, []*evalset.Invocation{expected}, &metric.EvalMetric{Threshold: 0.9}) +func TestConfigJSONRoundTrip(t *testing.T) { + cfg := &tooltrajectory.ToolTrajectoryCriterion{ + DefaultStrategy: &tooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, + Arguments: &criterionjson.JSONCriterion{MatchStrategy: criterionjson.JSONMatchStrategyExact}, + Response: &criterionjson.JSONCriterion{MatchStrategy: criterionjson.JSONMatchStrategyExact}, + }, + ToolStrategy: map[string]*tooltrajectory.ToolTrajectoryStrategy{ + "custom": { + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyRegex}, + }, + }, + OrderInsensitive: true, + } + data, err := json.Marshal(cfg) assert.NoError(t, err) - assert.Zero(t, result.OverallScore) - assert.Equal(t, status.EvalStatusFailed, result.OverallStatus) - assert.Equal(t, 
status.EvalStatusFailed, result.PerInvocationResults[0].Status) -} + assert.Contains(t, string(data), `"orderInsensitive":true`) + assert.Contains(t, string(data), `"custom"`) -func TestToolTrajectoryEvaluateNotEvaluated(t *testing.T) { - e := New().(*toolTrajectoryEvaluator) - result, err := e.Evaluate(context.Background(), []*evalset.Invocation{}, []*evalset.Invocation{}, &metric.EvalMetric{Threshold: 1}) + var decoded tooltrajectory.ToolTrajectoryCriterion + err = json.Unmarshal(data, &decoded) assert.NoError(t, err) - assert.Equal(t, status.EvalStatusNotEvaluated, result.OverallStatus) - assert.Nil(t, result.PerInvocationResults) -} - -func TestGetToolCallsAndEqual(t *testing.T) { - assert.Nil(t, getToolCalls(nil)) - assert.Nil(t, getToolCalls(&evalset.Invocation{})) - - callA := []*genai.FunctionCall{{Name: "a", Args: map[string]any{"x": 1}}} - callB := []*genai.FunctionCall{{Name: "a", Args: map[string]any{"x": 1}}} - assert.True(t, toolCallsEqual(callA, callB)) - - callNameDiff := []*genai.FunctionCall{{Name: "b", Args: map[string]any{"x": 1}}} - callArgsDiff := []*genai.FunctionCall{{Name: "a", Args: map[string]any{"x": 2}}} - assert.False(t, toolCallsEqual(callA, callNameDiff)) - assert.False(t, toolCallsEqual(callA, callArgsDiff)) - assert.False(t, toolCallsEqual(callA, []*genai.FunctionCall{})) -} - -func makeInvocation(calls []*genai.FunctionCall) *evalset.Invocation { - return &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: calls, - }, - } + assert.True(t, decoded.OrderInsensitive) + assert.NotNil(t, decoded.DefaultStrategy) + assert.NotNil(t, decoded.ToolStrategy["custom"]) + assert.Equal(t, text.TextMatchStrategyRegex, decoded.ToolStrategy["custom"].Name.MatchStrategy) } diff --git a/evaluation/go.mod b/evaluation/go.mod index 8653e219d..0dff4da23 100644 --- a/evaluation/go.mod +++ b/evaluation/go.mod @@ -2,19 +2,25 @@ module trpc.group/trpc-go/trpc-agent-go/evaluation go 1.24.4 -replace 
trpc.group/trpc-go/trpc-agent-go => ../ +replace ( + trpc.group/trpc-go/trpc-agent-go => ../ + trpc.group/trpc-go/trpc-agent-go/model/anthropic => ../model/anthropic + trpc.group/trpc-go/trpc-agent-go/model/provider => ../model/provider +) require ( github.com/google/uuid v1.6.0 github.com/stretchr/testify v1.10.0 google.golang.org/genai v1.29.0 trpc.group/trpc-go/trpc-agent-go v0.0.0-00010101000000-000000000000 + trpc.group/trpc-go/trpc-agent-go/model/provider v0.0.0-00010101000000-000000000000 ) require ( cloud.google.com/go v0.116.0 // indirect cloud.google.com/go/auth v0.9.3 // indirect cloud.google.com/go/compute/metadata v0.5.0 // indirect + github.com/anthropics/anthropic-sdk-go v1.16.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/go-logr/logr v1.4.2 // indirect @@ -25,8 +31,13 @@ require ( github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect github.com/gorilla/websocket v1.5.3 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect + github.com/openai/openai-go v1.12.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect + github.com/tidwall/gjson v1.18.0 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.1 // indirect + github.com/tidwall/sjson v1.2.5 // indirect go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/otel v1.29.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0 // indirect @@ -38,14 +49,15 @@ require ( go.opentelemetry.io/proto/otlp v1.3.1 // indirect go.uber.org/multierr v1.10.0 // indirect go.uber.org/zap v1.27.0 // indirect - golang.org/x/crypto v0.32.0 // indirect - golang.org/x/net v0.34.0 // indirect - golang.org/x/sys v0.30.0 // indirect - golang.org/x/text v0.21.0 // indirect + golang.org/x/crypto v0.40.0 // indirect + golang.org/x/net v0.41.0 // indirect + golang.org/x/sys v0.34.0 // indirect + 
golang.org/x/text v0.27.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect google.golang.org/grpc v1.66.2 // indirect google.golang.org/protobuf v1.34.2 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect trpc.group/trpc-go/trpc-a2a-go v0.2.5-0.20251023030722-7f02b57fd14a // indirect + trpc.group/trpc-go/trpc-agent-go/model/anthropic v0.0.0-00010101000000-000000000000 // indirect ) diff --git a/evaluation/go.sum b/evaluation/go.sum index 9c8206ce8..c916caadb 100644 --- a/evaluation/go.sum +++ b/evaluation/go.sum @@ -6,6 +6,8 @@ cloud.google.com/go/auth v0.9.3/go.mod h1:7z6VY+7h3KUdRov5F1i8NDP5ZzWKYmEPO842Bg cloud.google.com/go/compute/metadata v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY= cloud.google.com/go/compute/metadata v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/anthropics/anthropic-sdk-go v1.16.0 h1:nRkOFDqYXsHteoIhjdJr/5dsiKbFF3rflSv8ax50y8o= +github.com/anthropics/anthropic-sdk-go v1.16.0/go.mod h1:WTz31rIUHUHqai2UslPpw5CwXrQP3geYBioRV4WOLvE= github.com/bmatcuk/doublestar/v4 v4.9.1 h1:X8jg9rRZmJd4yRy7ZeNDRnM+T3ZfHv15JiBJ/avrEXE= github.com/bmatcuk/doublestar/v4 v4.9.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= @@ -83,10 +85,12 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM= -github.com/tidwall/gjson v1.14.4/go.mod 
h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= @@ -119,8 +123,8 @@ go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc= -golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= +golang.org/x/crypto v0.40.0 h1:r4x+VvoG5Fm+eJcxMaY8CQM7Lb0l1lsmjGBQ6s8BfKM= +golang.org/x/crypto v0.40.0/go.mod h1:Qr1vMER5WyS2dfPHAlsOj01wgLbsyWtFn/aY+5+ZdxY= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= @@ -131,24 +135,24 @@ golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73r golang.org/x/net 
v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0= -golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= +golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= +golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= -golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= -golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.34.0 
h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= +golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= -golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= +golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= diff --git a/evaluation/metric/criterion/criterion.go b/evaluation/metric/criterion/criterion.go new file mode 100644 index 000000000..59ffdd240 --- /dev/null +++ b/evaluation/metric/criterion/criterion.go @@ -0,0 +1,33 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package criterion provides configurable evaluation criteria. +package criterion + +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/llm" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +// Criterion encapsulates multiple evaluation criteria for comprehensive model behavior assessment. +type Criterion struct { + // ToolTrajectory configures checks for tool call and response sequences. + ToolTrajectory *tooltrajectory.ToolTrajectoryCriterion `json:"toolTrajectory,omitempty"` + // LLMJudge configures the LLM-based judge criterion. 
+ LLMJudge *llm.LLMCriterion `json:"llmJudge,omitempty"` +} + +// New creates a Criterion with the provided options. +func New(opt ...Option) *Criterion { + opts := newOptions(opt...) + return &Criterion{ + ToolTrajectory: opts.toolTrajectory, + LLMJudge: opts.llmJudge, + } +} diff --git a/evaluation/metric/criterion/criterion_test.go b/evaluation/metric/criterion/criterion_test.go new file mode 100644 index 000000000..65480a4b4 --- /dev/null +++ b/evaluation/metric/criterion/criterion_test.go @@ -0,0 +1,42 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package criterion + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +func TestCriterionNewDefaults(t *testing.T) { + c := New() + assert.NotNil(t, c.ToolTrajectory) +} + +func TestCriterionWithToolTrajectory(t *testing.T) { + custom := tooltrajectory.New() + c := New(WithToolTrajectory(custom)) + assert.Equal(t, custom, c.ToolTrajectory) +} + +func TestCriterionJSONRoundTrip(t *testing.T) { + c := &Criterion{ + ToolTrajectory: tooltrajectory.New(), + } + data, err := json.Marshal(c) + assert.NoError(t, err) + + var decoded Criterion + err = json.Unmarshal(data, &decoded) + assert.NoError(t, err) + assert.NotNil(t, decoded.ToolTrajectory) +} diff --git a/evaluation/metric/criterion/json/json.go b/evaluation/metric/criterion/json/json.go new file mode 100644 index 000000000..f1e1c196e --- /dev/null +++ b/evaluation/metric/criterion/json/json.go @@ -0,0 +1,54 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +// Package json defines json-based comparison criteria. +package json + +import ( + "fmt" + "reflect" +) + +// JSONCriterion compares two JSON objects using exact matching. +type JSONCriterion struct { + // Ignore skips comparison when true. + Ignore bool `json:"ignore,omitempty"` + // MatchStrategy selects the comparison rule. + MatchStrategy JSONMatchStrategy `json:"matchStrategy,omitempty"` + // Compare overrides default comparison when provided. + Compare func(actual, expected map[string]any) (bool, error) `json:"-"` +} + +// JSONMatchStrategy enumerates supported JSON comparison strategies. +type JSONMatchStrategy string + +const ( + // JSONMatchStrategyExact matches json objects exactly. + JSONMatchStrategyExact JSONMatchStrategy = "exact" +) + +// Match compares two JSON objects using custom logic or deep equality. +func (j *JSONCriterion) Match(actual, expected map[string]any) (bool, error) { + if j.Ignore { + return true, nil + } + if j.Compare != nil { + return j.Compare(actual, expected) + } + switch j.MatchStrategy { + // Default to exact match. + case JSONMatchStrategyExact, "": + if reflect.DeepEqual(actual, expected) { + return true, nil + } + return false, fmt.Errorf("actual %v and expected %v do not match", actual, expected) + default: + return false, fmt.Errorf("invalid match strategy %s", j.MatchStrategy) + } +} diff --git a/evaluation/metric/criterion/json/json_test.go b/evaluation/metric/criterion/json/json_test.go new file mode 100644 index 000000000..b3eea033f --- /dev/null +++ b/evaluation/metric/criterion/json/json_test.go @@ -0,0 +1,69 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package json + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestMapCriterionCompareOverride(t *testing.T) { + called := false + criterion := &JSONCriterion{ + Compare: func(actual, expected map[string]any) (bool, error) { + called = true + return true, nil + }, + } + ok, err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "v"}) + assert.True(t, ok) + assert.NoError(t, err) + assert.True(t, called) +} + +func TestMapCriterionDeepEqualMismatch(t *testing.T) { + criterion := &JSONCriterion{} + ok, err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "diff"}) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestMapCriterionDeepEqualSuccess(t *testing.T) { + criterion := &JSONCriterion{} + ok, err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "v"}) + assert.True(t, ok) + assert.NoError(t, err) +} + +func TestJSONCriterionIgnoreSkipsCompare(t *testing.T) { + called := false + criterion := &JSONCriterion{ + Ignore: true, + Compare: func(actual, expected map[string]any) (bool, error) { + called = true + return false, nil + }, + MatchStrategy: JSONMatchStrategyExact, + } + ok, err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "diff"}) + assert.True(t, ok) + assert.NoError(t, err) + assert.False(t, called) +} + +func TestJSONCriterionInvalidMatchStrategy(t *testing.T) { + criterion := &JSONCriterion{ + MatchStrategy: JSONMatchStrategy("invalid"), + } + ok, err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "v"}) + assert.False(t, ok) + assert.Error(t, err) +} diff --git a/evaluation/metric/criterion/llm/llm.go b/evaluation/metric/criterion/llm/llm.go new file mode 100644 index 000000000..2d1f8988d --- /dev/null +++ b/evaluation/metric/criterion/llm/llm.go @@ -0,0 +1,52 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. 
+// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package llm defines criteria for LLM-based judging. +package llm + +import "trpc.group/trpc-go/trpc-agent-go/model" + +// LLMCriterion configures an LLM judge for evaluation. +type LLMCriterion struct { + JudgeModel *JudgeModelOptions `json:"judgeModel,omitempty"` // JudgeModel holds configuration for the judge model. +} + +// JudgeModelOptions captures model and generation configuration for the judge. +type JudgeModelOptions struct { + // ProviderName is the LLM provider name. + ProviderName string `json:"providerName,omitempty"` + // ModelName identifies the judge model. + ModelName string `json:"modelName,omitempty"` + // BaseURL is an optional custom endpoint. + BaseURL string `json:"baseURL,omitempty"` + // APIKey is used for the judge provider. + APIKey string `json:"apiKey,omitempty"` + // ExtraFields carries extra fields. + ExtraFields map[string]any `json:"extraFields,omitempty"` + // NumSamples sets how many judge samples to collect. + NumSamples int `json:"numSamples,omitempty"` + // Generation holds generation parameters for the judge. + Generation *model.GenerationConfig `json:"generationConfig,omitempty"` +} + +// New builds an LlmCriterion with judge model settings. +func New(providerName, modelName string, opt ...Option) *LLMCriterion { + opts := newOptions(opt...) + return &LLMCriterion{ + JudgeModel: &JudgeModelOptions{ + ProviderName: providerName, + ModelName: modelName, + BaseURL: opts.baseURL, + APIKey: opts.apiKey, + ExtraFields: opts.extraFields, + NumSamples: opts.numSamples, + Generation: opts.generation, + }, + } +} diff --git a/evaluation/metric/criterion/llm/llm_test.go b/evaluation/metric/criterion/llm/llm_test.go new file mode 100644 index 000000000..f0ca8a49f --- /dev/null +++ b/evaluation/metric/criterion/llm/llm_test.go @@ -0,0 +1,26 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. 
+// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package llm + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewLlmCriterion(t *testing.T) { + crit := New("provider", "model", WithNumSamples(2), WithBaseURL("base")) + require.NotNil(t, crit.JudgeModel) + assert.Equal(t, "provider", crit.JudgeModel.ProviderName) + assert.Equal(t, "model", crit.JudgeModel.ModelName) + assert.Equal(t, 2, crit.JudgeModel.NumSamples) + assert.Equal(t, "base", crit.JudgeModel.BaseURL) +} diff --git a/evaluation/metric/criterion/llm/options.go b/evaluation/metric/criterion/llm/options.go new file mode 100644 index 000000000..72f188972 --- /dev/null +++ b/evaluation/metric/criterion/llm/options.go @@ -0,0 +1,82 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package llm + +import "trpc.group/trpc-go/trpc-agent-go/model" + +var ( + defaultNumSamples = 1 // defaultNumSamples sets the default judge sample count. + defaultMaxTokens = 2000 // defaultMaxTokens sets the default max tokens for judge generation. + defaultTemperature = 0.8 // defaultTemperature sets the default temperature for judge generation. + defaultStream = false // defaultStream sets the default streaming behavior for judge generation. +) + +// options captures judge model configuration overrides. +type options struct { + baseURL string // baseURL is a custom base URL for the judge model. + apiKey string // apiKey is the credential for the judge model provider. + extraFields map[string]any // extraFields holds provider-specific extras. + numSamples int // numSamples is the number of samples to request. 
+ generation *model.GenerationConfig // generation configures the judge model generation behavior. +} + +// newOptions applies Option overrides on top of sensible defaults. +func newOptions(opt ...Option) *options { + opts := &options{ + numSamples: defaultNumSamples, + generation: &model.GenerationConfig{ + MaxTokens: &defaultMaxTokens, + Temperature: &defaultTemperature, + Stream: defaultStream, + }, + } + for _, o := range opt { + o(opts) + } + return opts +} + +// Option configures judge model settings. +type Option func(*options) + +// WithBaseURL sets a custom base URL for the judge model endpoint. +func WithBaseURL(baseURL string) Option { + return func(o *options) { + o.baseURL = baseURL + } +} + +// WithAPIKey sets the API key used when invoking the judge model provider. +func WithAPIKey(apiKey string) Option { + return func(o *options) { + o.apiKey = apiKey + } +} + +// WithExtraFields supplies provider-specific parameters for the judge model. +func WithExtraFields(extraFields map[string]any) Option { + return func(o *options) { + o.extraFields = extraFields + } +} + +// WithNumSamples overrides how many judge samples to collect. +func WithNumSamples(numSamples int) Option { + return func(o *options) { + o.numSamples = numSamples + } +} + +// WithGeneration sets the generation configuration for the judge model. +func WithGeneration(generation *model.GenerationConfig) Option { + return func(o *options) { + o.generation = generation + } +} diff --git a/evaluation/metric/criterion/llm/options_test.go b/evaluation/metric/criterion/llm/options_test.go new file mode 100644 index 000000000..279bd61be --- /dev/null +++ b/evaluation/metric/criterion/llm/options_test.go @@ -0,0 +1,47 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package llm + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "trpc.group/trpc-go/trpc-agent-go/model" +) + +func TestNewOptionsDefaults(t *testing.T) { + opts := newOptions() + assert.Equal(t, defaultNumSamples, opts.numSamples) + require.NotNil(t, opts.generation) + assert.Equal(t, defaultStream, opts.generation.Stream) + require.NotNil(t, opts.generation.MaxTokens) + assert.Equal(t, defaultMaxTokens, *opts.generation.MaxTokens) + require.NotNil(t, opts.generation.Temperature) + assert.Equal(t, defaultTemperature, *opts.generation.Temperature) +} + +func TestOptionOverrides(t *testing.T) { + gen := &model.GenerationConfig{Stream: true} + opts := newOptions( + WithBaseURL("base"), + WithAPIKey("key"), + WithExtraFields(map[string]any{"x": "y"}), + WithNumSamples(3), + WithGeneration(gen), + ) + assert.Equal(t, "base", opts.baseURL) + assert.Equal(t, "key", opts.apiKey) + require.Contains(t, opts.extraFields, "x") + assert.Equal(t, "y", opts.extraFields["x"]) + assert.Equal(t, 3, opts.numSamples) + assert.Equal(t, gen, opts.generation) +} diff --git a/evaluation/metric/criterion/options.go b/evaluation/metric/criterion/options.go new file mode 100644 index 000000000..89611ab3e --- /dev/null +++ b/evaluation/metric/criterion/options.go @@ -0,0 +1,51 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package criterion + +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/llm" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +// options aggregates configurable parts of Criterion. +type options struct { + // ToolTrajectory sets the default tool trajectory criterion. 
+ toolTrajectory *tooltrajectory.ToolTrajectoryCriterion + // llmJudge sets the LLM judge criterion. + llmJudge *llm.LLMCriterion +} + +// newOptions creates a Options with the provided options. +func newOptions(opt ...Option) *options { + opts := &options{ + toolTrajectory: tooltrajectory.New(), + } + for _, o := range opt { + o(opts) + } + return opts +} + +// Option is a function that configures Criterion. +type Option func(*options) + +// WithToolTrajectory sets the tool trajectory criterion. +func WithToolTrajectory(toolTrajectory *tooltrajectory.ToolTrajectoryCriterion) Option { + return func(o *options) { + o.toolTrajectory = toolTrajectory + } +} + +// WithLLMJudge sets the LLM judge criterion. +func WithLLMJudge(llmJudge *llm.LLMCriterion) Option { + return func(o *options) { + o.llmJudge = llmJudge + } +} diff --git a/evaluation/metric/criterion/options_test.go b/evaluation/metric/criterion/options_test.go new file mode 100644 index 000000000..35a013130 --- /dev/null +++ b/evaluation/metric/criterion/options_test.go @@ -0,0 +1,35 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package criterion + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/llm" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +func TestNewOptionsDefaults(t *testing.T) { + opts := newOptions() + assert.NotNil(t, opts.toolTrajectory) +} + +func TestWithToolTrajectory(t *testing.T) { + custom := tooltrajectory.New(tooltrajectory.WithOrderInsensitive(true)) + opts := newOptions(WithToolTrajectory(custom)) + assert.Equal(t, custom, opts.toolTrajectory) +} + +func TestWithLlmJudge(t *testing.T) { + llmJudge := llm.New("p", "m") + opts := newOptions(WithLLMJudge(llmJudge)) + assert.Equal(t, llmJudge, opts.llmJudge) +} diff --git a/evaluation/metric/criterion/text/text.go b/evaluation/metric/criterion/text/text.go new file mode 100644 index 000000000..356bc361f --- /dev/null +++ b/evaluation/metric/criterion/text/text.go @@ -0,0 +1,79 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package text defines text comparison criteria. +package text + +import ( + "fmt" + "regexp" + "strings" +) + +// TextCriterion governs how two strings should be compared. +type TextCriterion struct { + // Ignore skips comparison when true. + Ignore bool `json:"ignore,omitempty"` + // CaseInsensitive toggles lowercase comparison. + CaseInsensitive bool `json:"caseInsensitive,omitempty"` + // MatchStrategy selects the comparison rule. + MatchStrategy TextMatchStrategy `json:"matchStrategy,omitempty"` + // Compare overrides built-in strategies. + Compare func(actual, expected string) (bool, error) `json:"-"` +} + +// TextMatchStrategy enumerates supported text comparison strategies. +type TextMatchStrategy string + +const ( + // TextMatchStrategyExact matches strings exactly. 
+ TextMatchStrategyExact TextMatchStrategy = "exact" + // TextMatchStrategyContains matches strings that contain the target. + TextMatchStrategyContains TextMatchStrategy = "contains" + // TextMatchStrategyRegex matches strings that match the regex. + TextMatchStrategyRegex TextMatchStrategy = "regex" +) + +// Match compares source and target using the configured strategy. +func (t *TextCriterion) Match(source, target string) (bool, error) { + if t.Ignore { + return true, nil + } + if t.Compare != nil { + return t.Compare(source, target) + } + if t.CaseInsensitive { + source = strings.ToLower(source) + target = strings.ToLower(target) + } + switch t.MatchStrategy { + // Default to exact match. + case TextMatchStrategyExact, "": + if source == target { + return true, nil + } + return false, fmt.Errorf("source %s and target %s do not match", source, target) + case TextMatchStrategyContains: + if strings.Contains(source, target) { + return true, nil + } + return false, fmt.Errorf("source %s does not contain target %s", source, target) + case TextMatchStrategyRegex: + re, err := regexp.Compile(target) + if err != nil { + return false, fmt.Errorf("invalid regex %s: %w", target, err) + } + if re.MatchString(source) { + return true, nil + } + return false, fmt.Errorf("source %s does not match regex %s", source, target) + default: + return false, fmt.Errorf("invalid match strategy %s", t.MatchStrategy) + } +} diff --git a/evaluation/metric/criterion/text/text_test.go b/evaluation/metric/criterion/text/text_test.go new file mode 100644 index 000000000..238d75770 --- /dev/null +++ b/evaluation/metric/criterion/text/text_test.go @@ -0,0 +1,113 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package text + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestTextCriterionJSONRoundTrip(t *testing.T) { + criterion := &TextCriterion{ + Ignore: true, + CaseInsensitive: true, + MatchStrategy: TextMatchStrategyRegex, + } + data, err := json.Marshal(criterion) + assert.NoError(t, err) + assert.JSONEq(t, `{"ignore":true,"caseInsensitive":true,"matchStrategy":"regex"}`, string(data)) + + var decoded TextCriterion + err = json.Unmarshal(data, &decoded) + assert.NoError(t, err) + assert.Equal(t, criterion.Ignore, decoded.Ignore) + assert.Equal(t, criterion.CaseInsensitive, decoded.CaseInsensitive) + assert.Equal(t, criterion.MatchStrategy, decoded.MatchStrategy) +} + +func TestTextCriterionMatchStrategies(t *testing.T) { + criterion := &TextCriterion{ + CaseInsensitive: true, + MatchStrategy: TextMatchStrategyContains, + } + ok, err := criterion.Match("Hello World", "hello") + assert.NoError(t, err) + assert.True(t, ok) +} + +func TestTextCriterionIgnore(t *testing.T) { + criterion := &TextCriterion{ + Ignore: true, + } + ok, err := criterion.Match("anything", "value") + assert.NoError(t, err) + assert.True(t, ok) +} + +func TestTextCriterionRegexInvalid(t *testing.T) { + criterion := &TextCriterion{ + MatchStrategy: TextMatchStrategyRegex, + } + ok, err := criterion.Match("source", "[invalid(") + assert.False(t, ok) + assert.Error(t, err) +} + +func TestTextCriterionUnknownStrategy(t *testing.T) { + criterion := &TextCriterion{ + MatchStrategy: TextMatchStrategy("unknown"), + } + ok, err := criterion.Match("a", "b") + assert.False(t, ok) + assert.Error(t, err) +} + +func TestTextCriterionAllBranches(t *testing.T) { + customCalled := false + custom := &TextCriterion{ + Compare: func(actual, expected string) (bool, error) { + customCalled = true + return true, nil + }, + } + ok, err := custom.Match("x", "y") + assert.True(t, ok) + assert.NoError(t, err) + assert.True(t, customCalled) + + exact := 
&TextCriterion{ + MatchStrategy: TextMatchStrategyExact, + } + ok, err = exact.Match("same", "same") + assert.True(t, ok) + assert.NoError(t, err) + ok, err = exact.Match("same", "diff") + assert.False(t, ok) + assert.Error(t, err) + + contains := &TextCriterion{ + MatchStrategy: TextMatchStrategyContains, + } + ok, err = contains.Match("hello", "missing") + assert.False(t, ok) + assert.Error(t, err) + + regex := &TextCriterion{ + MatchStrategy: TextMatchStrategyRegex, + } + ok, err = regex.Match("abc123", "abc[0-9]+") + assert.True(t, ok) + assert.NoError(t, err) + ok, err = regex.Match("xyz", "abc[0-9]+") + assert.False(t, ok) + assert.Error(t, err) +} diff --git a/evaluation/metric/criterion/tooltrajectory/options.go b/evaluation/metric/criterion/tooltrajectory/options.go new file mode 100644 index 000000000..cc2fb4965 --- /dev/null +++ b/evaluation/metric/criterion/tooltrajectory/options.go @@ -0,0 +1,80 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package tooltrajectory + +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +// defaultToolTrajectoryStrategy is used when no user strategy is supplied. +var defaultToolTrajectoryStrategy = &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, + Arguments: &json.JSONCriterion{MatchStrategy: json.JSONMatchStrategyExact}, + Response: &json.JSONCriterion{MatchStrategy: json.JSONMatchStrategyExact}, +} + +// options configures ToolTrajectoryCriterion. +type options struct { + // defaultStrategy sets the fallback strategy when no tool-specific strategy is defined. 
+ defaultStrategy *ToolTrajectoryStrategy + // toolStrategy configures per-tool strategies keyed by tool name. + toolStrategy map[string]*ToolTrajectoryStrategy + // orderInsensitive toggles order-agnostic comparison for args and responses. + orderInsensitive bool + // compare allows overriding comparison logic entirely. + compare func(actual, expected *evalset.Invocation) (bool, error) +} + +// newOptions applies provided options for ToolTrajectoryCriterion. +func newOptions(opt ...Option) *options { + opts := &options{ + defaultStrategy: defaultToolTrajectoryStrategy, + toolStrategy: nil, + orderInsensitive: false, + compare: nil, + } + for _, o := range opt { + o(opts) + } + return opts +} + +// Option is a function that configures ToolTrajectoryCriterion. +type Option func(*options) + +// WithDefault sets the default tool trajectory strategy. +func WithDefault(defaultStrategy *ToolTrajectoryStrategy) Option { + return func(o *options) { + o.defaultStrategy = defaultStrategy + } +} + +// WithTool sets the per-tool strategies keyed by tool name. +func WithTool(tool map[string]*ToolTrajectoryStrategy) Option { + return func(o *options) { + o.toolStrategy = tool + } +} + +// WithOrderInsensitive sets the order-agnostic comparison for tool calls and responses. +func WithOrderInsensitive(orderInsensitive bool) Option { + return func(o *options) { + o.orderInsensitive = orderInsensitive + } +} + +// WithCompare sets the tool trajectory comparison logic. 
+func WithCompare(compare func(actual, expected *evalset.Invocation) (bool, error)) Option { + return func(o *options) { + o.compare = compare + } +} diff --git a/evaluation/metric/criterion/tooltrajectory/options_test.go b/evaluation/metric/criterion/tooltrajectory/options_test.go new file mode 100644 index 000000000..64f79b376 --- /dev/null +++ b/evaluation/metric/criterion/tooltrajectory/options_test.go @@ -0,0 +1,74 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package tooltrajectory + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" +) + +func TestNewOptionsDefaults(t *testing.T) { + opts := newOptions() + assert.Equal(t, defaultToolTrajectoryStrategy, opts.defaultStrategy) + assert.Nil(t, opts.toolStrategy) + assert.False(t, opts.orderInsensitive) + assert.Nil(t, opts.compare) +} + +func TestWithDefault(t *testing.T) { + custom := &ToolTrajectoryStrategy{} + opts := newOptions(WithDefault(custom)) + assert.Equal(t, custom, opts.defaultStrategy) +} + +func TestWithTool(t *testing.T) { + tool := map[string]*ToolTrajectoryStrategy{ + "custom": {}, + } + opts := newOptions(WithTool(tool)) + assert.Equal(t, tool, opts.toolStrategy) +} + +func TestWithOrderInsensitive(t *testing.T) { + opts := newOptions(WithOrderInsensitive(true)) + assert.True(t, opts.orderInsensitive) +} + +func TestWithCompare(t *testing.T) { + var called bool + compare := func(actual, expected *evalset.Invocation) (bool, error) { + called = true + return true, nil + } + opts := newOptions(WithCompare(compare)) + assert.NotNil(t, opts.compare) + ok, err := opts.compare(nil, nil) + assert.True(t, ok) + assert.NoError(t, err) + assert.True(t, called) +} + +func TestDefaultToolTrajectoryStrategyDeepEqualMismatch(t *testing.T) { + ok, 
errArgs := defaultToolTrajectoryStrategy.Arguments.Match( + map[string]any{"a": 1}, + map[string]any{"a": 2}, + ) + assert.False(t, ok) + assert.Error(t, errArgs) + + ok, errResp := defaultToolTrajectoryStrategy.Response.Match( + map[string]any{"r": 1}, + map[string]any{"r": 3}, + ) + assert.False(t, ok) + assert.Error(t, errResp) +} diff --git a/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go b/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go new file mode 100644 index 000000000..ad1c83f69 --- /dev/null +++ b/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go @@ -0,0 +1,266 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package tooltrajectory defines tool trajectory comparison criteria. +package tooltrajectory + +import ( + "encoding/json" + "errors" + "fmt" + "sort" + + "google.golang.org/genai" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + criterionjson "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +// New creates a ToolTrajectoryCriterion with the provided options. +func New(opt ...Option) *ToolTrajectoryCriterion { + opts := newOptions(opt...) + return &ToolTrajectoryCriterion{ + DefaultStrategy: opts.defaultStrategy, + ToolStrategy: opts.toolStrategy, + OrderInsensitive: opts.orderInsensitive, + Compare: opts.compare, + } +} + +// ToolTrajectoryCriterion provides comparison rules for tool call and response sequences. +type ToolTrajectoryCriterion struct { + // DefaultStrategy applies when no tool-specific strategy is provided. + DefaultStrategy *ToolTrajectoryStrategy `json:"defaultStrategy,omitempty"` + // ToolStrategy holds per-tool strategies keyed by tool name. 
+ ToolStrategy map[string]*ToolTrajectoryStrategy `json:"toolStrategy,omitempty"` + // OrderInsensitive toggles comparison order for args and responses. + OrderInsensitive bool `json:"orderInsensitive,omitempty"` + // Compare allows custom comparison override. + Compare func(actual, expected *evalset.Invocation) (bool, error) `json:"-"` +} + +// ToolTrajectoryStrategy defines comparison strategies for a single tool. +type ToolTrajectoryStrategy struct { + Name *text.TextCriterion `json:"name,omitempty"` // Name compares tool names. + Arguments *criterionjson.JSONCriterion `json:"arguments,omitempty"` // Arguments compares tool call arguments. + Response *criterionjson.JSONCriterion `json:"response,omitempty"` // Response compares tool call responses. +} + +// Match compares actual and expected invocations according to tool trajectory rules. +func (t *ToolTrajectoryCriterion) Match(actual, expected *evalset.Invocation) (bool, error) { + if t.Compare != nil { + return t.Compare(actual, expected) + } + if actual == nil || expected == nil { + return false, fmt.Errorf("actual or expected invocation is nil") + } + if actual.IntermediateData == nil || expected.IntermediateData == nil { + return false, fmt.Errorf("actual or expected intermediate data is nil") + } + // Ensure one-to-one mapping between tool calls and responses on actual invocation. + if len(actual.IntermediateData.ToolUses) != len(actual.IntermediateData.ToolResponses) { + return false, fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", + len(actual.IntermediateData.ToolUses), len(actual.IntermediateData.ToolResponses)) + } + // Ensure one-to-one mapping between tool calls and responses on expected invocation. 
+ if len(expected.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolResponses) { + return false, fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", + len(expected.IntermediateData.ToolUses), len(expected.IntermediateData.ToolResponses)) + } + // Ensure the same number of tool uses before detailed comparison. + if len(actual.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolUses) { + return false, fmt.Errorf("tool uses count mismatch: %d != %d", + len(actual.IntermediateData.ToolUses), len(expected.IntermediateData.ToolUses)) + } + if len(actual.IntermediateData.ToolUses) == 0 { + return true, nil + } + actualTools, err := getToolComparers( + actual.IntermediateData.ToolUses, + actual.IntermediateData.ToolResponses, + t.OrderInsensitive, + ) + if err != nil { + return false, fmt.Errorf("get actual tools: %w", err) + } + expectedTools, err := getToolComparers( + expected.IntermediateData.ToolUses, + expected.IntermediateData.ToolResponses, + t.OrderInsensitive, + ) + if err != nil { + return false, fmt.Errorf("get expected tools: %w", err) + } + if t.OrderInsensitive { + sort.Slice(actualTools, func(i, j int) bool { + return actualTools[i].lessThan(actualTools[j]) + }) + sort.Slice(expectedTools, func(i, j int) bool { + return expectedTools[i].lessThan(expectedTools[j]) + }) + } + for i := range len(actualTools) { + strategy := getStrategy(t, actualTools[i], expectedTools[i]) + ok, err := strategy.match(actualTools[i], expectedTools[i]) + if err != nil { + return false, fmt.Errorf("tool %s mismatch: %w", actualTools[i].name, err) + } + if !ok { + return false, fmt.Errorf("tool %s mismatch", actualTools[i].name) + } + } + return true, nil +} + +// Match validates a single tool call pair using configured criteria. 
+func (t *ToolTrajectoryStrategy) match(actual, expected *toolComparer) (bool, error) { + if t.Name != nil { + ok, err := t.Name.Match(actual.name, expected.name) + if err != nil { + return false, fmt.Errorf("name mismatch: %w", err) + } + if !ok { + return false, fmt.Errorf("name mismatch") + } + } + if t.Arguments != nil { + ok, err := t.Arguments.Match(actual.args, expected.args) + if err != nil { + return false, fmt.Errorf("arguments mismatch: %w", err) + } + if !ok { + return false, fmt.Errorf("arguments mismatch") + } + } + if t.Response != nil { + ok, err := t.Response.Match(actual.response, expected.response) + if err != nil { + return false, fmt.Errorf("response mismatch: %w", err) + } + if !ok { + return false, fmt.Errorf("response mismatch") + } + } + return true, nil +} + +// toolComparer normalizes tool call and response data for comparison. +type toolComparer struct { + name string // name holds the tool name. + args map[string]any // args holds parsed tool arguments. + response map[string]any // response holds parsed tool response payload. + argsOrder string // argsOrder caches JSON for order-insensitive compare. + responseOrder string // responseOrder caches JSON for order-insensitive compare. +} + +// lessThan provides deterministic ordering when order-insensitive compares require sorting. +func (t *toolComparer) lessThan(other *toolComparer) bool { + if t.name != other.name { + return t.name < other.name + } + if t.argsOrder != other.argsOrder { + return t.argsOrder < other.argsOrder + } + if t.responseOrder != other.responseOrder { + return t.responseOrder < other.responseOrder + } + return false +} + +// getToolComparers aligns tool uses with their responses and builds toolComparer. +func getToolComparers(toolUses []*genai.FunctionCall, toolResponses []*genai.FunctionResponse, + orderInsensitive bool) ([]*toolComparer, error) { + // toolCallIDs ensures every tool use can be matched by ID. + // Map from tool call id to index. 
+ toolCallIDs := make(map[string]int) + for i := range len(toolUses) { + if toolUses[i].ID == "" { + return nil, fmt.Errorf("tool use id is empty") + } + if _, ok := toolCallIDs[toolUses[i].ID]; ok { + return nil, fmt.Errorf("tool use id %s is duplicated", toolUses[i].ID) + } + toolCallIDs[toolUses[i].ID] = i + } + // toolResponseIDs ensures every tool response can be matched by ID. + // Map from tool response id to index. + toolResponseIDs := make(map[string]int) + for i := range len(toolResponses) { + if toolResponses[i].ID == "" { + return nil, fmt.Errorf("tool response id is empty") + } + if _, ok := toolResponseIDs[toolResponses[i].ID]; ok { + return nil, fmt.Errorf("tool response id %s is duplicated", toolResponses[i].ID) + } + toolResponseIDs[toolResponses[i].ID] = i + } + for toolID := range toolCallIDs { + if _, ok := toolResponseIDs[toolID]; !ok { + return nil, fmt.Errorf("tool id %s is missing response", toolID) + } + } + toolComparers := make([]*toolComparer, 0, len(toolUses)) + for i := range len(toolUses) { + toolComparer, err := getToolComparer( + toolUses[i], + toolResponses[toolResponseIDs[toolUses[i].ID]], + orderInsensitive, + ) + if err != nil { + return nil, fmt.Errorf("get tool comparer: %w", err) + } + toolComparers = append(toolComparers, toolComparer) + } + return toolComparers, nil +} + +// getToolComparer pairs a tool use with its response and precomputes ordering hints. 
+func getToolComparer(toolUse *genai.FunctionCall, toolResponse *genai.FunctionResponse, + orderInsensitive bool) (*toolComparer, error) { + if toolUse == nil || toolResponse == nil { + return nil, errors.New("tool use or tool response is nil") + } + tool := &toolComparer{ + name: toolUse.Name, + args: toolUse.Args, + response: toolResponse.Response, + } + if orderInsensitive { + args, err := json.Marshal(toolUse.Args) + if err != nil { + return nil, fmt.Errorf("marshal arguments: %w", err) + } + response, err := json.Marshal(toolResponse.Response) + if err != nil { + return nil, fmt.Errorf("marshal response: %w", err) + } + tool.argsOrder = string(args) + tool.responseOrder = string(response) + } + return tool, nil +} + +// getStrategy picks the comparison strategy for a specific tool pair. +func getStrategy(t *ToolTrajectoryCriterion, actualTool, + expectedTool *toolComparer) *ToolTrajectoryStrategy { + if t.ToolStrategy != nil { + strategy, ok := t.ToolStrategy[actualTool.name] + if ok { + return strategy + } + strategy, ok = t.ToolStrategy[expectedTool.name] + if ok { + return strategy + } + } + if t.DefaultStrategy != nil { + return t.DefaultStrategy + } + return defaultToolTrajectoryStrategy +} diff --git a/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go b/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go new file mode 100644 index 000000000..344dd241f --- /dev/null +++ b/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go @@ -0,0 +1,650 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package tooltrajectory + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "google.golang.org/genai" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + criterionjson "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +func TestToolTrajectoryCriterionJSONRoundTrip(t *testing.T) { + criterion := &ToolTrajectoryCriterion{ + DefaultStrategy: &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + Ignore: true, + CaseInsensitive: true, + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &criterionjson.JSONCriterion{}, + Response: &criterionjson.JSONCriterion{}, + }, + ToolStrategy: map[string]*ToolTrajectoryStrategy{ + "foo": { + Name: &text.TextCriterion{ + Ignore: true, + CaseInsensitive: true, + MatchStrategy: text.TextMatchStrategyContains, + }, + }, + }, + OrderInsensitive: true, + } + data, err := json.Marshal(criterion) + assert.NoError(t, err) + assert.JSONEq(t, `{ + "defaultStrategy":{ + "name":{"ignore":true,"caseInsensitive":true,"matchStrategy":"exact"}, + "arguments":{}, + "response":{} + }, + "toolStrategy":{ + "foo":{"name":{"ignore":true,"caseInsensitive":true,"matchStrategy":"contains"}} + }, + "orderInsensitive":true + }`, string(data)) + var decoded ToolTrajectoryCriterion + err = json.Unmarshal(data, &decoded) + assert.NoError(t, err) + assert.True(t, decoded.OrderInsensitive) + assert.NotNil(t, decoded.DefaultStrategy) + assert.Equal(t, text.TextMatchStrategyExact, decoded.DefaultStrategy.Name.MatchStrategy) + assert.True(t, decoded.DefaultStrategy.Name.Ignore) + assert.True(t, decoded.DefaultStrategy.Name.CaseInsensitive) + assert.NotNil(t, decoded.ToolStrategy["foo"]) + assert.Equal(t, text.TextMatchStrategyContains, decoded.ToolStrategy["foo"].Name.MatchStrategy) + assert.True(t, decoded.ToolStrategy["foo"].Name.Ignore) + assert.True(t, decoded.ToolStrategy["foo"].Name.CaseInsensitive) +} + 
+func TestToolTrajectoryCriterionJSONOmitEmpty(t *testing.T) { + criterion := &ToolTrajectoryCriterion{} + data, err := json.Marshal(criterion) + assert.NoError(t, err) + assert.JSONEq(t, `{}`, string(data)) +} + +func TestToolTrajectoryStrategyJSONRoundTrip(t *testing.T) { + strategy := &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + Ignore: true, + CaseInsensitive: true, + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &criterionjson.JSONCriterion{}, + Response: &criterionjson.JSONCriterion{}, + } + data, err := json.Marshal(strategy) + assert.NoError(t, err) + assert.JSONEq(t, `{ + "name":{"ignore":true,"caseInsensitive":true,"matchStrategy":"exact"}, + "arguments":{}, + "response":{} + }`, string(data)) + + var decoded ToolTrajectoryStrategy + err = json.Unmarshal(data, &decoded) + assert.NoError(t, err) + assert.Equal(t, text.TextMatchStrategyExact, decoded.Name.MatchStrategy) + assert.True(t, decoded.Name.Ignore) + assert.True(t, decoded.Name.CaseInsensitive) + assert.NotNil(t, decoded.Arguments) + assert.NotNil(t, decoded.Response) +} + +func TestToolTrajectoryCriterionMatchOrderInsensitive(t *testing.T) { + actual := makeInvocation( + []toolData{ + {id: "call-1", name: "shared", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, + {id: "call-2", name: "shared", args: map[string]any{"a": 2}, response: map[string]any{"r": 3}}, + }, + ) + expected := makeInvocation( + []toolData{ + {id: "call-2", name: "shared", args: map[string]any{"a": 2}, response: map[string]any{"r": 3}}, + {id: "call-1", name: "shared", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, + }, + ) + + criterion := &ToolTrajectoryCriterion{ + OrderInsensitive: true, + } + ok, err := criterion.Match(actual, expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionMissingResponse(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: 
[]*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{}, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool"}, + }, + }, + } + criterion := &ToolTrajectoryCriterion{} + ok, err := criterion.Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionCustomStrategy(t *testing.T) { + actual := makeInvocation( + []toolData{ + {id: "call-1", name: "custom", args: map[string]any{"k": "v"}, response: map[string]any{"r": "x"}}, + }, + ) + expected := makeInvocation( + []toolData{ + {id: "call-1", name: "custom", args: map[string]any{"k": "v"}, response: map[string]any{"r": "x"}}, + }, + ) + customStrategy := &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, + } + criterion := &ToolTrajectoryCriterion{ + ToolStrategy: map[string]*ToolTrajectoryStrategy{ + "custom": customStrategy, + }, + } + ok, err := criterion.Match(actual, expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +type toolData struct { + id string + name string + args map[string]any + response map[string]any +} + +func makeInvocation(tools []toolData) *evalset.Invocation { + toolUses := make([]*genai.FunctionCall, 0, len(tools)) + toolResponses := make([]*genai.FunctionResponse, 0, len(tools)) + for _, t := range tools { + toolUses = append(toolUses, &genai.FunctionCall{ + ID: t.id, + Name: t.name, + Args: t.args, + }) + toolResponses = append(toolResponses, &genai.FunctionResponse{ + ID: t.id, + Name: t.name, + Response: t.response, + }) + } + return &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: toolUses, + ToolResponses: toolResponses, + }, + } +} + +func TestToolTrajectoryCriterionIDMismatch(t *testing.T) { + actual := &evalset.Invocation{ 
+ IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "use-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "resp-1", Name: "tool"}, + }, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "use-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "use-1", Name: "tool"}, + }, + }, + } + criterion := New() + ok, err := criterion.Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionNilInvocation(t *testing.T) { + criterion := New() + ok, err := criterion.Match(nil, makeInvocation(nil)) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionNilIntermediate(t *testing.T) { + criterion := New() + ok, err := criterion.Match( + &evalset.Invocation{}, + &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}}, + ) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionEmptyToolUseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "resp-1", Name: "tool"}, + }, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "resp-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "resp-1", Name: "tool"}, + }, + }, + } + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionDuplicateResponseID(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + actual.IntermediateData.ToolResponses = append(actual.IntermediateData.ToolResponses, &genai.FunctionResponse{ + ID: "call-1", + Name: 
"tool", + Response: map[string]any{"r": 2}, + }) + ok, err := New().Match(actual, makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + })) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionCustomCompare(t *testing.T) { + var called bool + criterion := &ToolTrajectoryCriterion{ + Compare: func(actual, expected *evalset.Invocation) (bool, error) { + called = true + return true, nil + }, + } + ok, err := criterion.Match(&evalset.Invocation{}, &evalset.Invocation{}) + assert.True(t, ok) + assert.NoError(t, err) + assert.True(t, called) +} + +func TestToolTrajectoryCriterionExpectedResponseCountMismatch(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool", Args: map[string]any{"a": 1}}, + }, + ToolResponses: []*genai.FunctionResponse{}, + }, + } + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionToolUsesCountMismatch(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + {id: "call-2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, + }) + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionZeroTools(t *testing.T) { + actual := &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}} + expected := &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}} + ok, err := New().Match(actual, 
expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionExpectedInvalidID(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "", Name: "tool", Args: map[string]any{"a": 1}}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool", Response: map[string]any{"r": 1}}, + }, + }, + } + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionStrategyMismatch(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool-A", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool-B", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + strategy := &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, + } + criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{"tool-A": strategy})) + ok, err := criterion.Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionDuplicateToolUseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "dup", Name: "tool"}, + {ID: "dup", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "dup", Name: "tool"}, + {ID: "dup2", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "dup", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + {id: "dup2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, + }) + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} 
+ +func TestToolTrajectoryCriterionDuplicateToolResponseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + {ID: "call-2", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool"}, + {ID: "call-1", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + {id: "call-2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, + }) + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionMissingResponseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "other", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolComparerOrderInsensitiveMarshalError(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool", Args: map[string]any{"bad": make(chan int)}}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool", Response: map[string]any{"r": 1}}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{}, response: map[string]any{"r": 1}}, + }) + ok, err := New(WithOrderInsensitive(true)).Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolComparerOrderInsensitiveMarshalResponseError(t *testing.T) { + actual := 
&evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool", Args: map[string]any{"a": 1}}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool", Response: map[string]any{"bad": make(chan int)}}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + ok, err := New(WithOrderInsensitive(true)).Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolComparerLessThanBranches(t *testing.T) { + left := &toolComparer{name: "a", argsOrder: "1", responseOrder: "1"} + right := &toolComparer{name: "b", argsOrder: "0", responseOrder: "0"} + assert.True(t, left.lessThan(right)) + + left2 := &toolComparer{name: "a", argsOrder: "2", responseOrder: "1"} + right2 := &toolComparer{name: "a", argsOrder: "3", responseOrder: "0"} + assert.True(t, left2.lessThan(right2)) + + left3 := &toolComparer{name: "a", argsOrder: "1", responseOrder: "2"} + right3 := &toolComparer{name: "a", argsOrder: "1", responseOrder: "3"} + assert.True(t, left3.lessThan(right3)) +} + +func TestToolTrajectoryStrategyArgumentAndResponseMismatch(t *testing.T) { + strategy := &ToolTrajectoryStrategy{ + Arguments: &criterionjson.JSONCriterion{}, + Response: &criterionjson.JSONCriterion{}, + } + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 3}}, + }) + criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{ + "tool": strategy, + })) + ok, err := criterion.Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestGetToolComparerNilInputs(t *testing.T) { + _, err := getToolComparer(nil, &genai.FunctionResponse{}, false) + 
assert.Error(t, err) + _, err = getToolComparer(&genai.FunctionCall{}, nil, false) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionMissingResponseSet(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool"}, + }, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "other", Name: "tool"}, + }, + }, + } + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionFallbackDefault(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + criterion := &ToolTrajectoryCriterion{ + DefaultStrategy: nil, + ToolStrategy: nil, + } + ok, err := criterion.Match(actual, expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionFallbackDefaultStrategy(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + criterion := &ToolTrajectoryCriterion{ + DefaultStrategy: nil, + ToolStrategy: nil, + } + ok, err := criterion.Match(actual, expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionEmptyToolResponseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: 
[]*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{}, response: map[string]any{}}, + }) + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionStrategyLookupByExpectedName(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "unknown", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "custom", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + customStrategy := &ToolTrajectoryStrategy{} + criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{ + "custom": customStrategy, + })) + ok, err := criterion.Match(actual, expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +func TestToolTrajectoryStrategyResponseMismatchOnly(t *testing.T) { + strategy := &ToolTrajectoryStrategy{ + Arguments: &criterionjson.JSONCriterion{}, + Response: &criterionjson.JSONCriterion{}, + } + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, + }) + criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{ + "tool": strategy, + })) + ok, err := criterion.Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolComparerLessThanEqual(t *testing.T) { + left := &toolComparer{name: "same", argsOrder: "1", responseOrder: "1"} + right := &toolComparer{name: "same", argsOrder: "1", responseOrder: "1"} + assert.False(t, left.lessThan(right)) +} + +func TestInternalTextAndMapWrappers(t *testing.T) { + txt := &text.TextCriterion{MatchStrategy: 
text.TextMatchStrategyExact} + ok, err := txt.Match("same", "same") + assert.True(t, ok) + assert.NoError(t, err) + + crit := &criterionjson.JSONCriterion{} + ok, err = crit.Match(map[string]any{"a": 1}, map[string]any{"a": 1}) + assert.True(t, ok) + assert.NoError(t, err) +} diff --git a/evaluation/metric/metric.go b/evaluation/metric/metric.go index e8fb4beb2..ca82bbc3a 100644 --- a/evaluation/metric/metric.go +++ b/evaluation/metric/metric.go @@ -10,15 +10,18 @@ // Package metric provides evaluation metrics. package metric -import "context" +import ( + "context" + + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" +) // EvalMetric represents a metric used to evaluate a particular aspect of an eval case. // It mirrors the schema used by ADK Web, with field names in camel to align with the JSON format. type EvalMetric struct { - // MetricName identifies the metric. - MetricName string `json:"metricName,omitempty"` - // Threshold value for this metric. - Threshold float64 `json:"threshold,omitempty"` + MetricName string `json:"metricName,omitempty"` // MetricName identifies the metric. + Threshold float64 `json:"threshold,omitempty"` // Threshold value for this metric. + Criterion *criterion.Criterion `json:"criterion,omitempty"` // Evaluation criterion used by the metric. } // Manager defines the interface for managing evaluation metrics. diff --git a/evaluation/service/internal/inference/inference.go b/evaluation/service/internal/inference/inference.go index 9b12fd833..af17773d2 100644 --- a/evaluation/service/internal/inference/inference.go +++ b/evaluation/service/internal/inference/inference.go @@ -72,11 +72,12 @@ func inferenceInvocation( if err != nil { return nil, fmt.Errorf("runner run: %w", err) } - // Capture the invocation ID, final response, and tool uses. + // Capture the invocation ID, final response, tool uses, and tool responses. 
var ( invocationID string finalResponse *genai.Content toolUses []*genai.FunctionCall + toolResponses []*genai.FunctionResponse ) for event := range events { if event == nil { @@ -105,6 +106,14 @@ func inferenceInvocation( } toolUses = append(toolUses, uses...) } + // Capture tool call responses. + if event.IsToolResultResponse() { + responses, err := convertToolResultResponse(event) + if err != nil { + return nil, fmt.Errorf("convert tool result response: %w", err) + } + toolResponses = append(toolResponses, responses...) + } } // Convert the final response to evalset content. return &evalset.Invocation{ @@ -112,7 +121,8 @@ func inferenceInvocation( UserContent: invocation.UserContent, FinalResponse: finalResponse, IntermediateData: &evalset.IntermediateData{ - ToolUses: toolUses, + ToolUses: toolUses, + ToolResponses: toolResponses, }, }, nil } @@ -132,6 +142,27 @@ func convertToolCallResponse(event *event.Event) ([]*genai.FunctionCall, error) return toolUses, nil } +// convertToolResultResponse converts the tool result response to function responses. +func convertToolResultResponse(event *event.Event) ([]*genai.FunctionResponse, error) { + toolResponses := []*genai.FunctionResponse{} + for _, choice := range event.Response.Choices { + if choice.Message.ToolID == "" { + continue + } + var response map[string]any + if err := json.Unmarshal([]byte(choice.Message.Content), &response); err != nil { + return nil, fmt.Errorf("unmarshal tool result response: %w", err) + } + toolResponse := &genai.FunctionResponse{ + ID: choice.Message.ToolID, + Name: choice.Message.ToolName, + Response: response, + } + toolResponses = append(toolResponses, toolResponse) + } + return toolResponses, nil +} + // convertContentToMessage transforms evalset input content into a model message. 
func convertContentToMessage(content *genai.Content) (*model.Message, error) { if content == nil { diff --git a/evaluation/service/internal/inference/inference_test.go b/evaluation/service/internal/inference/inference_test.go index 6ef0f9827..38b12387e 100644 --- a/evaluation/service/internal/inference/inference_test.go +++ b/evaluation/service/internal/inference/inference_test.go @@ -265,3 +265,54 @@ func TestConvertToolCallResponse(t *testing.T) { assert.Equal(t, "tool", result[0].Name) assert.Equal(t, float64(1), result[0].Args["count"]) } + +func TestConvertToolResultResponse(t *testing.T) { + ev := &event.Event{ + Response: &model.Response{ + Choices: []model.Choice{ + { + Message: model.Message{ + ToolID: "call-1", + ToolName: "tool", + Content: `{"result":42}`, + }, + }, + }, + }, + } + result, err := convertToolResultResponse(ev) + assert.NoError(t, err) + assert.Len(t, result, 1) + assert.Equal(t, "call-1", result[0].ID) + assert.Equal(t, "tool", result[0].Name) + assert.Equal(t, float64(42), result[0].Response["result"]) +} + +func TestConvertToolResultResponseSkipEmptyID(t *testing.T) { + ev := &event.Event{ + Response: &model.Response{ + Choices: []model.Choice{ + {Message: model.Message{Content: "{}", ToolID: ""}}, + {Message: model.Message{Content: `{"ok":true}`, ToolID: "id-1", ToolName: "t"}}, + }, + }, + } + result, err := convertToolResultResponse(ev) + assert.NoError(t, err) + assert.Len(t, result, 1) + assert.Equal(t, "id-1", result[0].ID) + assert.Equal(t, "t", result[0].Name) + assert.Equal(t, true, result[0].Response["ok"]) +} + +func TestConvertToolResultResponseInvalidJSON(t *testing.T) { + ev := &event.Event{ + Response: &model.Response{ + Choices: []model.Choice{ + {Message: model.Message{Content: "{", ToolID: "bad"}}, + }, + }, + } + _, err := convertToolResultResponse(ev) + assert.Error(t, err) +} diff --git a/evaluation/service/local/local.go b/evaluation/service/local/local.go index 70cb015ff..93f10b29d 100644 --- 
a/evaluation/service/local/local.go +++ b/evaluation/service/local/local.go @@ -202,6 +202,7 @@ func (s *local) evaluatePerCase(ctx context.Context, inferenceResult *service.In overallMetricResults = append(overallMetricResults, &evalresult.EvalMetricResult{ MetricName: evalMetric.MetricName, Threshold: evalMetric.Threshold, + Criterion: evalMetric.Criterion, Score: result.OverallScore, EvalStatus: result.OverallStatus, }) @@ -214,6 +215,7 @@ func (s *local) evaluatePerCase(ctx context.Context, inferenceResult *service.In evalMetricResult := &evalresult.EvalMetricResult{ MetricName: evalMetric.MetricName, Threshold: evalMetric.Threshold, + Criterion: evalMetric.Criterion, Score: invocationResult.Score, EvalStatus: invocationResult.Status, } diff --git a/evaluation/service/local/local_test.go b/evaluation/service/local/local_test.go index 93b738719..79959d7d9 100644 --- a/evaluation/service/local/local_test.go +++ b/evaluation/service/local/local_test.go @@ -317,7 +317,7 @@ func TestLocalEvaluateSuccess(t *testing.T) { result: &evaluator.EvaluateResult{ OverallScore: 0.8, OverallStatus: status.EvalStatusPassed, - PerInvocationResults: []evaluator.PerInvocationResult{ + PerInvocationResults: []*evaluator.PerInvocationResult{ {Score: 0.8, Status: status.EvalStatusPassed}, }, }, @@ -467,7 +467,7 @@ func TestLocalEvaluatePerCaseErrors(t *testing.T) { result: &evaluator.EvaluateResult{ OverallScore: 1, OverallStatus: status.EvalStatusPassed, - PerInvocationResults: []evaluator.PerInvocationResult{}, + PerInvocationResults: []*evaluator.PerInvocationResult{}, }, } assert.NoError(t, reg.Register(metricName, fakeEval)) @@ -491,7 +491,7 @@ func TestLocalEvaluatePerCaseErrors(t *testing.T) { result: &evaluator.EvaluateResult{ OverallScore: 0, OverallStatus: status.EvalStatusUnknown, - PerInvocationResults: []evaluator.PerInvocationResult{{Score: 0, Status: status.EvalStatusNotEvaluated}}, + PerInvocationResults: []*evaluator.PerInvocationResult{{Score: 0, Status: 
status.EvalStatusNotEvaluated}}, }, } assert.NoError(t, reg.Register(metricName, fakeEval)) diff --git a/examples/evaluation/go.mod b/examples/evaluation/go.mod index 8e24d2a3d..77523f1ac 100644 --- a/examples/evaluation/go.mod +++ b/examples/evaluation/go.mod @@ -5,6 +5,8 @@ go 1.24.4 replace ( trpc.group/trpc-go/trpc-agent-go => ../../ trpc.group/trpc-go/trpc-agent-go/evaluation => ../../evaluation + trpc.group/trpc-go/trpc-agent-go/model/anthropic => ../../model/anthropic + trpc.group/trpc-go/trpc-agent-go/model/provider => ../../model/provider trpc.group/trpc-go/trpc-agent-go/server/debug => ../../server/debug ) @@ -19,6 +21,7 @@ require ( cloud.google.com/go v0.116.0 // indirect cloud.google.com/go/auth v0.9.3 // indirect cloud.google.com/go/compute/metadata v0.5.0 // indirect + github.com/anthropics/anthropic-sdk-go v1.16.0 // indirect github.com/bmatcuk/doublestar/v4 v4.9.1 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/go-logr/logr v1.4.3 // indirect @@ -35,7 +38,7 @@ require ( github.com/panjf2000/ants/v2 v2.10.0 // indirect github.com/rs/cors v1.11.1 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect - github.com/tidwall/gjson v1.14.4 // indirect + github.com/tidwall/gjson v1.18.0 // indirect github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.1 // indirect github.com/tidwall/sjson v1.2.5 // indirect @@ -51,14 +54,16 @@ require ( go.opentelemetry.io/proto/otlp v1.3.1 // indirect go.uber.org/multierr v1.10.0 // indirect go.uber.org/zap v1.27.0 // indirect - golang.org/x/crypto v0.32.0 // indirect - golang.org/x/net v0.34.0 // indirect - golang.org/x/sync v0.10.0 // indirect + golang.org/x/crypto v0.40.0 // indirect + golang.org/x/net v0.41.0 // indirect + golang.org/x/sync v0.16.0 // indirect golang.org/x/sys v0.35.0 // indirect - golang.org/x/text v0.21.0 // indirect + golang.org/x/text v0.27.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 // indirect 
google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect google.golang.org/grpc v1.66.2 // indirect google.golang.org/protobuf v1.34.2 // indirect trpc.group/trpc-go/trpc-a2a-go v0.2.5-0.20251023030722-7f02b57fd14a // indirect + trpc.group/trpc-go/trpc-agent-go/model/anthropic v0.0.0-00010101000000-000000000000 // indirect + trpc.group/trpc-go/trpc-agent-go/model/provider v0.0.0-00010101000000-000000000000 // indirect ) diff --git a/examples/evaluation/go.sum b/examples/evaluation/go.sum index 7d199233e..caefb93ea 100644 --- a/examples/evaluation/go.sum +++ b/examples/evaluation/go.sum @@ -6,6 +6,8 @@ cloud.google.com/go/auth v0.9.3/go.mod h1:7z6VY+7h3KUdRov5F1i8NDP5ZzWKYmEPO842Bg cloud.google.com/go/compute/metadata v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY= cloud.google.com/go/compute/metadata v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/anthropics/anthropic-sdk-go v1.16.0 h1:nRkOFDqYXsHteoIhjdJr/5dsiKbFF3rflSv8ax50y8o= +github.com/anthropics/anthropic-sdk-go v1.16.0/go.mod h1:WTz31rIUHUHqai2UslPpw5CwXrQP3geYBioRV4WOLvE= github.com/bmatcuk/doublestar/v4 v4.9.1 h1:X8jg9rRZmJd4yRy7ZeNDRnM+T3ZfHv15JiBJ/avrEXE= github.com/bmatcuk/doublestar/v4 v4.9.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= @@ -83,8 +85,8 @@ github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= -github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM= -github.com/tidwall/gjson v1.14.4/go.mod 
h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= @@ -122,8 +124,8 @@ go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc= -golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= +golang.org/x/crypto v0.40.0 h1:r4x+VvoG5Fm+eJcxMaY8CQM7Lb0l1lsmjGBQ6s8BfKM= +golang.org/x/crypto v0.40.0/go.mod h1:Qr1vMER5WyS2dfPHAlsOj01wgLbsyWtFn/aY+5+ZdxY= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= @@ -134,15 +136,15 @@ golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73r golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.34.0 
h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0= -golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= +golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= +golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= -golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= -golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -151,8 +153,8 @@ golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= -golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= 
+golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= diff --git a/examples/evaluation/inmemory/main.go b/examples/evaluation/inmemory/main.go index 449159d58..5655bd876 100644 --- a/examples/evaluation/inmemory/main.go +++ b/examples/evaluation/inmemory/main.go @@ -15,6 +15,10 @@ import ( evalsetinmemory "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset/inmemory" "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/registry" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + cjson "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" + ctooltrajectory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" metricinmemory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/inmemory" "trpc.group/trpc-go/trpc-agent-go/runner" ) @@ -35,10 +39,10 @@ func main() { ctx := context.Background() // New runner. run := runner.NewRunner(appName, newCalculatorAgent(*modelName, *streaming)) - + // Ensure runner resources are cleaned up (trpc-agent-go >= v0.5.0) defer run.Close() - + // New manager and registry for evaluation. 
evalSetManager := evalsetinmemory.New() metricManager := metricinmemory.New() @@ -146,6 +150,7 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { IntermediateData: &evalset.IntermediateData{ ToolUses: []*genai.FunctionCall{ { + ID: "tool_use_1", Name: "calculator", Args: map[string]interface{}{ "operation": "add", @@ -154,6 +159,18 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { }, }, }, + ToolResponses: []*genai.FunctionResponse{ + { + ID: "tool_use_1", + Name: "calculator", + Response: map[string]interface{}{ + "a": 2.0, + "b": 3.0, + "operation": "add", + "result": 5.0, + }, + }, + }, }, }, }, @@ -186,6 +203,7 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { IntermediateData: &evalset.IntermediateData{ ToolUses: []*genai.FunctionCall{ { + ID: "tool_use_2", Name: "calculator", Args: map[string]interface{}{ "operation": "multiply", @@ -194,6 +212,18 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { }, }, }, + ToolResponses: []*genai.FunctionResponse{ + { + ID: "tool_use_2", + Name: "calculator", + Response: map[string]interface{}{ + "a": 6.0, + "b": 7.0, + "operation": "multiply", + "result": 42.0, + }, + }, + }, }, }, }, @@ -215,6 +245,25 @@ func prepareMetric(ctx context.Context, metricManager metric.Manager) error { evalMetric := &metric.EvalMetric{ MetricName: "tool_trajectory_avg_score", Threshold: 1.0, + Criterion: criterion.New( + criterion.WithToolTrajectory( + ctooltrajectory.New( + ctooltrajectory.WithDefault( + &ctooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, + }, + Response: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, + }, + }, + ), + ), + ), + ), } return metricManager.Add(ctx, appName, evalSetID, evalMetric) } diff --git 
a/examples/evaluation/llm/finalresponse/README.md b/examples/evaluation/llm/finalresponse/README.md new file mode 100644 index 000000000..f39dbebb5 --- /dev/null +++ b/examples/evaluation/llm/finalresponse/README.md @@ -0,0 +1,46 @@ +# Final Response (LLM) Evaluation Example + +This example runs a final-response evaluation using the built-in `llm_final_response` evaluator with local file-backed managers. Eval sets, metrics, and results live on disk so you can inspect or version them. + +## Environment Variables + +| Variable | Description | Default | +| --- | --- | --- | +| `OPENAI_API_KEY` | API key for the judge/agent model (required) | `` | +| `OPENAI_BASE_URL` | Optional custom endpoint for OpenAI-compatible APIs | `https://api.openai.com/v1` | + +## Flags + +| Flag | Description | Default | +| --- | --- | --- | +| `-model` | Model identifier used by the agent | `deepseek-chat` | +| `-streaming` | Enable streaming responses from the agent | `false` | +| `-data-dir` | Directory containing `.evalset.json` and `.metrics.json` | `./data` | +| `-output-dir` | Directory where evaluation results are written | `./output` | +| `-eval-set` | Evaluation set ID to execute | `final-response-basic` | + +## Run + +```bash +cd examples/evaluation/llm/finalresponse +OPENAI_API_KEY=sk-... go run . \ + -model "gpt-4o-mini" \ + -data-dir "./data" \ + -output-dir "./output" \ + -eval-set "final-response-basic" +``` + +The example issues one QA prompt, asks the agent to answer, then uses the `llm_final_response` evaluator to judge the agent’s final reply against the reference. + +## Data Layout + +``` +data/ +└── final-response-app/ + ├── final-response-basic.evalset.json # EvalSet with one QA case + └── final-response-basic.metrics.json # Uses llm_final_response metric +``` + +## Output + +Results are written under `./output/final-response-app`, mirroring the eval set structure. The console prints a short summary of overall and per-case outcomes. 
diff --git a/examples/evaluation/llm/finalresponse/agent.go b/examples/evaluation/llm/finalresponse/agent.go new file mode 100644 index 000000000..1c1c7be2f --- /dev/null +++ b/examples/evaluation/llm/finalresponse/agent.go @@ -0,0 +1,84 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package main + +import ( + "context" + "math" + "strings" + + "trpc.group/trpc-go/trpc-agent-go/agent" + "trpc.group/trpc-go/trpc-agent-go/agent/llmagent" + "trpc.group/trpc-go/trpc-agent-go/model" + "trpc.group/trpc-go/trpc-agent-go/model/openai" + "trpc.group/trpc-go/trpc-agent-go/tool" + "trpc.group/trpc-go/trpc-agent-go/tool/function" +) + +func newQAAgent(modelName string, stream bool) agent.Agent { + calculatorTool := function.NewFunctionTool( + calculate, + function.WithName("calculator"), + function.WithDescription("Perform arithmetic operations including add, subtract, multiply, divide, power."), + ) + genCfg := model.GenerationConfig{ + MaxTokens: intPtr(512), + Temperature: floatPtr(0.2), + Stream: stream, + } + return llmagent.New( + "final-response-agent", + llmagent.WithModel(openai.New(modelName)), + llmagent.WithTools([]tool.Tool{calculatorTool}), + llmagent.WithInstruction("Answer the user concisely and accurately."), + llmagent.WithDescription("Simple LLM agent for final-response evaluation."), + llmagent.WithGenerationConfig(genCfg), + ) +} + +type calculatorArgs struct { + Operation string `json:"operation"` + A float64 `json:"a"` + B float64 `json:"b"` +} + +type calculatorResult struct { + Operation string `json:"operation"` + A float64 `json:"a"` + B float64 `json:"b"` + Result float64 `json:"result"` +} + +func calculate(_ context.Context, args calculatorArgs) (calculatorResult, error) { + var result float64 + switch strings.ToLower(args.Operation) { + case "add", "+": + 
result = args.A + args.B + case "subtract", "-": + result = args.A - args.B + case "multiply", "*": + result = args.A * args.B + case "divide", "/": + if args.B != 0 { + result = args.A / args.B + } + case "power", "^": + result = math.Pow(args.A, args.B) + } + return calculatorResult{ + Operation: args.Operation, + A: args.A, + B: args.B, + Result: result, + }, nil +} + +func intPtr(v int) *int { return &v } +func floatPtr(v float64) *float64 { return &v } diff --git a/examples/evaluation/llm/finalresponse/data/final-response-app/final-response-basic.evalset.json b/examples/evaluation/llm/finalresponse/data/final-response-app/final-response-basic.evalset.json new file mode 100644 index 000000000..0f32012f7 --- /dev/null +++ b/examples/evaluation/llm/finalresponse/data/final-response-app/final-response-basic.evalset.json @@ -0,0 +1,34 @@ +{ + "evalSetId": "final-response-basic", + "name": "final-response-basic", + "evalCases": [ + { + "evalId": "qa_paris", + "conversation": [ + { + "invocationId": "qa_paris-1", + "userContent": { + "role": "user", + "parts": [ + { + "text": "With a principal of 1000 dollars and a compound annual interest rate of 10%, what will the profit be after 30 years?" 
+ } + ] + }, + "finalResponse": { + "role": "assistant", + "parts": [ + { + "text": "16,449.40 dollars" + } + ] + } + } + ], + "sessionInput": { + "appName": "final-response-app", + "userId": "demo-user" + } + } + ] +} \ No newline at end of file diff --git a/examples/evaluation/llm/finalresponse/data/final-response-app/final-response-basic.metrics.json b/examples/evaluation/llm/finalresponse/data/final-response-app/final-response-basic.metrics.json new file mode 100644 index 000000000..885f81bb1 --- /dev/null +++ b/examples/evaluation/llm/finalresponse/data/final-response-app/final-response-basic.metrics.json @@ -0,0 +1,20 @@ +[ + { + "metricName": "llm_final_response", + "threshold": 0.9, + "criterion": { + "llmJudge": { + "judgeModel": { + "providerName": "openai", + "modelName": "deepseek-chat", + "numSamples": 3, + "generationConfig": { + "max_tokens": 512, + "temperature": 1.0, + "stream": false + } + } + } + } + } +] diff --git a/examples/evaluation/llm/finalresponse/main.go b/examples/evaluation/llm/finalresponse/main.go new file mode 100644 index 000000000..686c2dea8 --- /dev/null +++ b/examples/evaluation/llm/finalresponse/main.go @@ -0,0 +1,94 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package main implements functions for the final response evaluation. 
+package main + +import ( + "context" + "flag" + "fmt" + "log" + + "trpc.group/trpc-go/trpc-agent-go/evaluation" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" + evalresultlocal "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult/local" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + evalsetlocal "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset/local" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/registry" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + metriclocal "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/local" + "trpc.group/trpc-go/trpc-agent-go/runner" +) + +var ( + dataDir = flag.String("data-dir", "./data", "Directory containing evaluation set and metric files") + outputDir = flag.String("output-dir", "./output", "Directory where evaluation results will be stored") + modelName = flag.String("model", "deepseek-chat", "Model to use for evaluation runs") + streaming = flag.Bool("streaming", false, "Enable streaming responses from the agent") + evalSetID = flag.String("eval-set", "final-response-basic", "Evaluation set identifier to execute") +) + +const appName = "final-response-app" + +func main() { + flag.Parse() + ctx := context.Background() + runner := runner.NewRunner(appName, newQAAgent(*modelName, *streaming)) + defer runner.Close() + + evalSetManager := evalsetlocal.New(evalset.WithBaseDir(*dataDir)) + metricManager := metriclocal.New(metric.WithBaseDir(*dataDir)) + evalResultManager := evalresultlocal.New(evalresult.WithBaseDir(*outputDir)) + registry := registry.New() + + agentEvaluator, err := evaluation.New( + appName, + runner, + evaluation.WithEvalSetManager(evalSetManager), + evaluation.WithMetricManager(metricManager), + evaluation.WithEvalResultManager(evalResultManager), + evaluation.WithRegistry(registry), + ) + if err != nil { + log.Fatalf("create evaluator: %v", err) + } + + result, err := agentEvaluator.Evaluate(ctx, *evalSetID) + if err != nil { + log.Fatalf("evaluate: %v", err) + 
} + printSummary(result, *outputDir) +} + +func printSummary(result *evaluation.EvaluationResult, outDir string) { + fmt.Println("✅ Final-response evaluation completed with local storage") + fmt.Printf("App: %s\n", result.AppName) + fmt.Printf("Eval Set: %s\n", result.EvalSetID) + fmt.Printf("Overall Status: %s\n", result.OverallStatus.String()) + runs := 0 + if len(result.EvalCases) > 0 { + runs = len(result.EvalCases[0].EvalCaseResults) + } + fmt.Printf("Runs: %d\n", runs) + + for _, caseResult := range result.EvalCases { + fmt.Printf("Case %s -> %s\n", caseResult.EvalCaseID, caseResult.OverallStatus.String()) + for _, metricResult := range caseResult.MetricResults { + fmt.Printf(" Metric %s: score %.2f (threshold %.2f) => %s\n", + metricResult.MetricName, + metricResult.Score, + metricResult.Threshold, + metricResult.EvalStatus.String(), + ) + } + fmt.Println() + } + fmt.Printf("Results saved under: %s\n", outDir) +} diff --git a/examples/evaluation/llm/finalresponse/output/final-response-app/final-response-app_final-response-basic_1d6d48a3-e80a-40a7-9354-8cf5af4b43af.evalset_result.json b/examples/evaluation/llm/finalresponse/output/final-response-app/final-response-app_final-response-basic_1d6d48a3-e80a-40a7-9354-8cf5af4b43af.evalset_result.json new file mode 100644 index 000000000..07b1a3e9f --- /dev/null +++ b/examples/evaluation/llm/finalresponse/output/final-response-app/final-response-app_final-response-basic_1d6d48a3-e80a-40a7-9354-8cf5af4b43af.evalset_result.json @@ -0,0 +1,164 @@ +{ + "evalSetResultId": "final-response-app_final-response-basic_1d6d48a3-e80a-40a7-9354-8cf5af4b43af", + "evalSetResultName": "final-response-app_final-response-basic_1d6d48a3-e80a-40a7-9354-8cf5af4b43af", + "evalSetId": "final-response-basic", + "evalCaseResults": [ + { + "evalSetId": "final-response-basic", + "evalId": "qa_paris", + "finalEvalStatus": 1, + "overallEvalMetricResults": [ + { + "metricName": "llm_final_response", + "score": 1, + "evalStatus": 1, + "threshold": 
0.9, + "criterion": { + "llmJudge": { + "judgeModel": { + "providerName": "openai", + "modelName": "deepseek-chat", + "numSamples": 3, + "generationConfig": { + "max_tokens": 512, + "temperature": 1, + "stream": false + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "f47764ec-1a7e-4b64-8338-57e74d85ed96", + "userContent": { + "parts": [ + { + "text": "With a principal of 1000 dollars and a compound annual interest rate of 10%, what will the profit be after 30 years?" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "With a principal of $1,000 and a compound annual interest rate of 10%, after 30 years:\n\n- **Total amount**: $17,449.40\n- **Profit**: $16,449.40\n\nThe profit represents the interest earned over the 30-year period through compound growth." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_YWIcUqqaAkKLrtmwvoRhoKGQ", + "args": { + "a": 1.1, + "b": 30, + "operation": "power" + }, + "name": "calculator" + }, + { + "id": "call_00_KXFc4Xkm0BHfTRLpt3DhqBgc", + "args": { + "a": 1000, + "b": 17.449402268886445, + "operation": "multiply" + }, + "name": "calculator" + }, + { + "id": "call_00_OMbS3dK399YcWhRnromUJ9Jh", + "args": { + "a": 17449.402268886446, + "b": 1000, + "operation": "subtract" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_YWIcUqqaAkKLrtmwvoRhoKGQ", + "name": "calculator", + "response": { + "a": 1.1, + "b": 30, + "operation": "power", + "result": 17.449402268886445 + } + }, + { + "id": "call_00_KXFc4Xkm0BHfTRLpt3DhqBgc", + "name": "calculator", + "response": { + "a": 1000, + "b": 17.449402268886445, + "operation": "multiply", + "result": 17449.402268886446 + } + }, + { + "id": "call_00_OMbS3dK399YcWhRnromUJ9Jh", + "name": "calculator", + "response": { + "a": 17449.402268886446, + "b": 1000, + "operation": "subtract", + "result": 16449.402268886446 + } + } + ] + } + }, + 
"expectedInvocation": { + "invocationId": "qa_paris-1", + "userContent": { + "parts": [ + { + "text": "With a principal of 1000 dollars and a compound annual interest rate of 10%, what will the profit be after 30 years?" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "16,449.40 dollars" + } + ], + "role": "assistant" + } + }, + "evalMetricResults": [ + { + "metricName": "llm_final_response", + "score": 1, + "evalStatus": 1, + "threshold": 0.9, + "criterion": { + "llmJudge": { + "judgeModel": { + "providerName": "openai", + "modelName": "deepseek-chat", + "numSamples": 3, + "generationConfig": { + "max_tokens": 512, + "temperature": 1, + "stream": false + } + } + } + } + } + ] + } + ], + "sessionId": "acd2ad86-6185-44a5-abbc-bfb44012c16c", + "userId": "demo-user" + } + ], + "creationTimestamp": 1764087706.0861804 +} diff --git a/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json b/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json index 9069826a4..c932d867f 100644 --- a/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json +++ b/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json @@ -1,91 +1,113 @@ { - "eval_set_id": "math-basic", + "evalSetId": "math-basic", "name": "math-basic", - "eval_cases": [ + "evalCases": [ { - "eval_id": "calc_add", - "conversation": [ - { - "invocation_id": "calc_add-1", - "user_content": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "final_response": { - "parts": [ - { - "text": "calc result: 5" - } - ], - "role": "assistant" - }, - "intermediate_data": { - "tool_uses": [ - { - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ] - }, - "creation_timestamp": 1761134484.981062 + "evalId": "calc_add", + "conversation": [ + { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": 
[ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + } + ], + "sessionInput": { + "appName": "math-eval-app", + "userId": "user" } - ], - "session_input": { - "app_name": "math-eval-app", - "user_id": "user" }, - "creation_timestamp": 1761134484.981062 - }, - { - "eval_id": "calc_multiply", - "conversation": [ - { - "invocation_id": "calc_multiply-1", - "user_content": { - "parts": [ - { - "text": "calc multiply 6 7" - } - ], - "role": "user" - }, - "final_response": { - "parts": [ - { - "text": "calc result: 42" - } - ], - "role": "assistant" - }, - "intermediate_data": { - "tool_uses": [ - { - "args": { - "a": 6, - "b": 7, - "operation": "multiply" - }, - "name": "calculator" - } - ] - }, - "creation_timestamp": 1761134484.9812014 + { + "evalId": "calc_multiply", + "conversation": [ + { + "invocationId": "calc_multiply-1", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 42" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_2", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_2", + "name": "calculator", + "response": { + "a": 6, + "b": 7, + "operation": "multiply", + "result": 42 + } + } + ] + } + } + ], + "sessionInput": { + "appName": "math-eval-app", + "userId": "user" } - ], - "session_input": { - "app_name": "math-eval-app", - "user_id": "user" - }, - "creation_timestamp": 1761134484.9812014 - } + } ], - "creation_timestamp": 1761134484.9804401 -} \ No newline at end of file + 
"creationTimestamp": 1761134484.9804401 +} diff --git a/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json b/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json index c57bd213f..7d2822b83 100644 --- a/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json +++ b/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json @@ -1,6 +1,21 @@ [ { - "metric_name": "tool_trajectory_avg_score", - "threshold": 1 + "metricName": "tool_trajectory_avg_score", + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } } ] \ No newline at end of file diff --git a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc.evalset_result.json deleted file mode 100644 index 21454b672..000000000 --- a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc.evalset_result.json +++ /dev/null @@ -1 +0,0 @@ -"{\"eval_set_result_id\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_result_name\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_id\":\"math-basic\",\"eval_case_results\":[{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_add\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"8b205b3f-682e-409a-b751-89ef805d0221\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"The result of adding 2 and 3 is 
**5**.\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_j75SIh8A9xSlG61OrC1ARIab\",\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_add-1\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 5\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.981062},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"74252944-b1a7-4c17-8f39-4a5809395d1d\",\"user_id\":\"user\"},{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_multiply\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"65226930-d45c-43ae-ab88-9c35f3abce70\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"6 × 7 = 42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_b3Gj4Y3fJu9Blkbl6H0MLquO\",\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_multiply-1\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 
42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.9812014},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"6393fabd-ab50-49b7-8656-59fcb0a29758\",\"user_id\":\"user\"}],\"creation_timestamp\":1761134849.3572516}" \ No newline at end of file diff --git a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494.evalset_result.json new file mode 100644 index 000000000..09a53b9ff --- /dev/null +++ b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494.evalset_result.json @@ -0,0 +1,298 @@ +{ + "evalSetResultId": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", + "evalSetResultName": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", + "evalSetId": "math-basic", + "evalCaseResults": [ + { + "evalSetId": "math-basic", + "evalId": "calc_add", + "finalEvalStatus": 1, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "53845847-16e0-4960-9d00-d3abf0ab1807", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "The result of 2 + 3 is **5**." 
+ } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } + } + ] + } + ], + "sessionId": "e9cc851f-8c89-45f4-b430-7c54991c7dda", + "userId": "user" + }, + { + "evalSetId": "math-basic", + "evalId": "calc_multiply", + "finalEvalStatus": 1, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "bfc1f3ef-7b7c-4d36-ac50-6fa3a9991abb", + "userContent": { + "parts": 
[ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "6 multiplied by 7 equals 42." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_xcGeTAsvZJxhKMA8oT478nMP", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_xcGeTAsvZJxhKMA8oT478nMP", + "name": "calculator", + "response": { + "a": 6, + "b": 7, + "operation": "multiply", + "result": 42 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_multiply-1", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 42" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_2", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_2", + "name": "calculator", + "response": { + "a": 6, + "b": 7, + "operation": "multiply", + "result": 42 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } + } + ] + } + ], + "sessionId": "bd844ee7-066c-43b0-adfa-34e1d8bffeb6", + "userId": "user" + } + ], + "creationTimestamp": 1763997862.5581782 +} diff --git a/server/debug/go.mod b/server/debug/go.mod index 5df137ef3..21da3d62d 100644 --- a/server/debug/go.mod +++ b/server/debug/go.mod @@ -39,7 +39,7 @@ require ( github.com/panjf2000/ants/v2 v2.10.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect - github.com/tidwall/gjson v1.14.4 // indirect + 
github.com/tidwall/gjson v1.18.0 // indirect github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.1 // indirect github.com/tidwall/sjson v1.2.5 // indirect @@ -52,11 +52,11 @@ require ( go.opentelemetry.io/proto/otlp v1.3.1 // indirect go.uber.org/multierr v1.10.0 // indirect go.uber.org/zap v1.27.0 // indirect - golang.org/x/crypto v0.32.0 // indirect - golang.org/x/net v0.34.0 // indirect - golang.org/x/sync v0.10.0 // indirect + golang.org/x/crypto v0.40.0 // indirect + golang.org/x/net v0.41.0 // indirect + golang.org/x/sync v0.16.0 // indirect golang.org/x/sys v0.35.0 // indirect - golang.org/x/text v0.21.0 // indirect + golang.org/x/text v0.27.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect google.golang.org/grpc v1.66.2 // indirect diff --git a/server/debug/go.sum b/server/debug/go.sum index a73060dc5..3d4e62c0f 100644 --- a/server/debug/go.sum +++ b/server/debug/go.sum @@ -91,6 +91,8 @@ github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM= github.com/tidwall/gjson v1.14.4/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= @@ -130,6 +132,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto 
v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc= golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= +golang.org/x/crypto v0.40.0 h1:r4x+VvoG5Fm+eJcxMaY8CQM7Lb0l1lsmjGBQ6s8BfKM= +golang.org/x/crypto v0.40.0/go.mod h1:Qr1vMER5WyS2dfPHAlsOj01wgLbsyWtFn/aY+5+ZdxY= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= @@ -142,6 +146,8 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0= golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= +golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= +golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -149,6 +155,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= 
+golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -159,6 +167,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= +golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= diff --git a/server/debug/server_test.go b/server/debug/server_test.go index 577968dca..7886369e5 100644 --- a/server/debug/server_test.go +++ b/server/debug/server_test.go @@ -635,10 +635,10 @@ func (f *fakeEvaluatorImpl) Evaluate(ctx context.Context, actuals, expecteds []* result := &evaluator.EvaluateResult{ OverallScore: 1, OverallStatus: status.EvalStatusPassed, - PerInvocationResults: make([]evaluator.PerInvocationResult, len(actuals)), + PerInvocationResults: make([]*evaluator.PerInvocationResult, len(actuals)), } for i := range actuals { - result.PerInvocationResults[i] = evaluator.PerInvocationResult{ + result.PerInvocationResults[i] = &evaluator.PerInvocationResult{ ActualInvocation: actuals[i], ExpectedInvocation: expecteds[i], Score: 1,