From 7ee361b9b645cce9cf5de1af0c5978bcfa73f72a Mon Sep 17 00:00:00 2001 From: hackerli Date: Thu, 20 Nov 2025 10:53:42 +0800 Subject: [PATCH 01/14] feat --- docs/mkdocs/zh/evaluation.md | 10 + .../{internal => }/epochtime/epochtime.go | 0 .../epochtime/epochtime_test.go | 0 evaluation/evalresult/evalresult.go | 44 +- evaluation/evalresult/evalresult_test.go | 66 +- evaluation/evalresult/inmemory/inmemory.go | 2 +- evaluation/evalresult/local/local.go | 2 +- evaluation/evalset/evalcase.go | 36 +- evaluation/evalset/evalset.go | 10 +- evaluation/evalset/evalset_test.go | 32 +- evaluation/evalset/inmemory/inmemory.go | 2 +- evaluation/evalset/inmemory/inmemory_test.go | 2 +- evaluation/evalset/local/local.go | 2 +- evaluation/evaluation.go | 18 +- evaluation/evaluation_test.go | 1 - evaluation/evaluator/evaluator.go | 4 +- evaluation/evaluator/registry/registry.go | 15 + evaluation/go.mod | 2 +- evaluation/go.sum | 4 +- evaluation/metric/metric.go | 4 +- evaluation/metric/metric_test.go | 2 +- evaluation/service/local/local.go | 6 +- evaluation/service/local/local_test.go | 38 +- evaluation/service/service.go | 34 +- examples/evaluation/debug/README.md | 67 ++ examples/evaluation/debug/agent.go | 118 ++++ examples/evaluation/debug/main.go | 48 ++ examples/evaluation/go.mod | 21 +- examples/evaluation/go.sum | 46 +- go.mod | 4 +- go.sum | 8 +- server/debug/go.mod | 66 ++ server/debug/go.sum | 206 ++++++ server/debug/internal/schema/schema.go | 76 +++ server/debug/server.go | 623 +++++++++++++++++- server/debug/server_test.go | 349 ++++++++++ 36 files changed, 1776 insertions(+), 192 deletions(-) rename evaluation/{internal => }/epochtime/epochtime.go (100%) rename evaluation/{internal => }/epochtime/epochtime_test.go (100%) create mode 100644 examples/evaluation/debug/README.md create mode 100644 examples/evaluation/debug/agent.go create mode 100644 examples/evaluation/debug/main.go create mode 100644 server/debug/go.mod create mode 100644 server/debug/go.sum diff --git 
a/docs/mkdocs/zh/evaluation.md b/docs/mkdocs/zh/evaluation.md index 461d2b9eb..9edd3656c 100644 --- a/docs/mkdocs/zh/evaluation.md +++ b/docs/mkdocs/zh/evaluation.md @@ -6,6 +6,16 @@ Evaluation 提供完整的 Agent 评估框架,支持本地文件和内存两 本节介绍如何在本地文件系统 local 或内存 inmemory 模式下执行 Agent 评估流程。 +### Evaluation 搭配 Debug Server + +若需要在 ADK Web 中一边调试 Agent、一边把真实会话转成评估用例,可以直接复用 `examples/evaluation/debug` 示例: + +1. 通过 `debug.New` 启动调试服务器时,传入 `debug.WithEvalSetManager`、`debug.WithEvalResultManager`、`debug.WithMetricManager` 等选项,把评估集、指标以及评估结果落盘到指定目录。 +2. ADK Web 连接到该服务器后,聊天产生的 session 可在 UI 里使用 “Convert to Eval Case” 功能写入本地 `*.evalset.json`/`*.metrics.json`。 +3. UI 中的 Eval 标签页或 `run`/`run_eval` API 会自动读取这些配置,并在 `-output-dir` 写出 `*.evalset_result.json`,便于离线分析或版本管理。 + +完整流程(含命令行参数示例与数据目录结构)见 [examples/evaluation/debug](https://github.com/trpc-group/trpc-agent-go/tree/main/examples/evaluation/debug)。 + ### 本地文件系统 local local 在本地文件系统上维护评估集、评估指标和评估结果。 diff --git a/evaluation/internal/epochtime/epochtime.go b/evaluation/epochtime/epochtime.go similarity index 100% rename from evaluation/internal/epochtime/epochtime.go rename to evaluation/epochtime/epochtime.go diff --git a/evaluation/internal/epochtime/epochtime_test.go b/evaluation/epochtime/epochtime_test.go similarity index 100% rename from evaluation/internal/epochtime/epochtime_test.go rename to evaluation/epochtime/epochtime_test.go diff --git a/evaluation/evalresult/evalresult.go b/evaluation/evalresult/evalresult.go index ff6aa715c..da40fde7b 100644 --- a/evaluation/evalresult/evalresult.go +++ b/evaluation/evalresult/evalresult.go @@ -13,54 +13,54 @@ package evalresult import ( "context" + "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" - "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/epochtime" "trpc.group/trpc-go/trpc-agent-go/evaluation/status" ) // EvalSetResult represents the evaluation result for an entire eval set. 
-// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type EvalSetResult struct { // EvalSetResultID uniquely identifies this result. - EvalSetResultID string `json:"eval_set_result_id,omitempty"` + EvalSetResultID string `json:"evalSetResultId,omitempty"` // EvalSetResultName is the name of this result. - EvalSetResultName string `json:"eval_set_result_name,omitempty"` + EvalSetResultName string `json:"evalSetResultName,omitempty"` // EvalSetID identifies the eval set. - EvalSetID string `json:"eval_set_id,omitempty"` + EvalSetID string `json:"evalSetId,omitempty"` // EvalCaseResults contains results for each eval case. - EvalCaseResults []*EvalCaseResult `json:"eval_case_results,omitempty"` + EvalCaseResults []*EvalCaseResult `json:"evalCaseResults,omitempty"` // CreationTimestamp when this result was created. - CreationTimestamp *epochtime.EpochTime `json:"creation_timestamp,omitempty"` + CreationTimestamp *epochtime.EpochTime `json:"creationTimestamp,omitempty"` } // EvalCaseResult represents the result of a single evaluation case. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type EvalCaseResult struct { // EvalSetID identifies the eval set. - EvalSetID string `json:"eval_set_id,omitempty"` + EvalSetID string `json:"evalSetId,omitempty"` // EvalID identifies the eval case. - EvalID string `json:"eval_id,omitempty"` + EvalID string `json:"evalId,omitempty"` // FinalEvalStatus is the final eval status for this eval case. - FinalEvalStatus status.EvalStatus `json:"final_eval_status,omitempty"` + FinalEvalStatus status.EvalStatus `json:"finalEvalStatus,omitempty"` // OverallEvalMetricResults contains overall result for each metric for the entire eval case. 
- OverallEvalMetricResults []*EvalMetricResult `json:"overall_eval_metric_results,omitempty"` + OverallEvalMetricResults []*EvalMetricResult `json:"overallEvalMetricResults,omitempty"` // EvalMetricResultPerInvocation contains result for each metric on a per invocation basis. - EvalMetricResultPerInvocation []*EvalMetricResultPerInvocation `json:"eval_metric_result_per_invocation,omitempty"` + EvalMetricResultPerInvocation []*EvalMetricResultPerInvocation `json:"evalMetricResultPerInvocation,omitempty"` // SessionID is the session id of the session generated as result of inferencing stage of the eval. - SessionID string `json:"session_id,omitempty"` + SessionID string `json:"sessionId,omitempty"` // UserID is the user id used during inferencing stage of the eval. - UserID string `json:"user_id,omitempty"` + UserID string `json:"userId,omitempty"` } // EvalMetricResult represents the result of a single metric evaluation. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type EvalMetricResult struct { // MetricName identifies the metric. - MetricName string `json:"metric_name,omitempty"` + MetricName string `json:"metricName,omitempty"` // Score obtained for this metric. Score float64 `json:"score,omitempty"` // EvalStatus of this metric evaluation. - EvalStatus status.EvalStatus `json:"eval_status,omitempty"` + EvalStatus status.EvalStatus `json:"evalStatus,omitempty"` // Threshold that was used. Threshold float64 `json:"threshold,omitempty"` // Details contains additional metric-specific information. @@ -68,14 +68,14 @@ type EvalMetricResult struct { } // EvalMetricResultPerInvocation represents metric results for a single invocation. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. 
+// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type EvalMetricResultPerInvocation struct { // ActualInvocation is the actual invocation, captured from agent run. - ActualInvocation *evalset.Invocation `json:"actual_invocation,omitempty"` + ActualInvocation *evalset.Invocation `json:"actualInvocation,omitempty"` // ExpectedInvocation is the expected invocation. - ExpectedInvocation *evalset.Invocation `json:"expected_invocation,omitempty"` + ExpectedInvocation *evalset.Invocation `json:"expectedInvocation,omitempty"` // EvalMetricResults contains results for each metric for this invocation. - EvalMetricResults []*EvalMetricResult `json:"eval_metric_results,omitempty"` + EvalMetricResults []*EvalMetricResult `json:"evalMetricResults,omitempty"` } // Manager defines the interface for managing evaluation results. diff --git a/evaluation/evalresult/evalresult_test.go b/evaluation/evalresult/evalresult_test.go index fe9d77b63..79b54e8af 100644 --- a/evaluation/evalresult/evalresult_test.go +++ b/evaluation/evalresult/evalresult_test.go @@ -20,30 +20,30 @@ import ( func TestEvalSetResultJSONRoundTrip(t *testing.T) { const raw = `{ - "eval_set_result_id": "result-1", - "eval_set_result_name": "result-name", - "eval_set_id": "greeting-set", - "eval_case_results": [ + "evalSetResultId": "result-1", + "evalSetResultName": "result-name", + "evalSetId": "greeting-set", + "evalCaseResults": [ { - "eval_set_id": "greeting-set", - "eval_id": "case-1", - "final_eval_status": 1, - "overall_eval_metric_results": [ + "evalSetId": "greeting-set", + "evalId": "case-1", + "finalEvalStatus": 1, + "overallEvalMetricResults": [ { - "metric_name": "tool_trajectory_avg_score", + "metricName": "tool_trajectory_avg_score", "score": 0.9, - "eval_status": 1, + "evalStatus": 1, "threshold": 0.8, "details": { "comment": "trajectory matched" } } ], - "eval_metric_result_per_invocation": [ + "evalMetricResultPerInvocation": [ { - 
"actual_invocation": { - "invocation_id": "invocation-actual", - "user_content": { + "actualInvocation": { + "invocationId": "invocation-actual", + "userContent": { "role": "user", "parts": [ { @@ -51,7 +51,7 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { } ] }, - "final_response": { + "finalResponse": { "role": "assistant", "parts": [ { @@ -59,8 +59,8 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { } ] }, - "intermediate_data": { - "tool_uses": [ + "intermediateData": { + "toolUses": [ { "id": "tool-call-1", "name": "calculator", @@ -71,7 +71,7 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { } } ], - "intermediate_responses": [ + "intermediateResponses": [ [ "assistant", [ @@ -82,11 +82,11 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { ] ] }, - "creation_timestamp": 1700000000 + "creationTimestamp": 1700000000 }, - "expected_invocation": { - "invocation_id": "invocation-expected", - "user_content": { + "expectedInvocation": { + "invocationId": "invocation-expected", + "userContent": { "role": "user", "parts": [ { @@ -94,7 +94,7 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { } ] }, - "final_response": { + "finalResponse": { "role": "assistant", "parts": [ { @@ -102,8 +102,8 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { } ] }, - "intermediate_data": { - "tool_uses": [ + "intermediateData": { + "toolUses": [ { "name": "calculator", "args": { @@ -113,7 +113,7 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { } } ], - "intermediate_responses": [ + "intermediateResponses": [ [ "assistant", [ @@ -124,13 +124,13 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { ] ] }, - "creation_timestamp": 1700000000 + "creationTimestamp": 1700000000 }, - "eval_metric_results": [ + "evalMetricResults": [ { - "metric_name": "tool_trajectory_avg_score", + "metricName": "tool_trajectory_avg_score", "score": 0.9, - "eval_status": 1, + "evalStatus": 1, "threshold": 0.8, "details": { "comment": "per invocation matched" @@ -139,11 
+139,11 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) { ] } ], - "session_id": "session-1", - "user_id": "user-1" + "sessionId": "session-1", + "userId": "user-1" } ], - "creation_timestamp": 1700000000 + "creationTimestamp": 1700000000 }` var result EvalSetResult diff --git a/evaluation/evalresult/inmemory/inmemory.go b/evaluation/evalresult/inmemory/inmemory.go index 3e0fc82d6..22860625a 100644 --- a/evaluation/evalresult/inmemory/inmemory.go +++ b/evaluation/evalresult/inmemory/inmemory.go @@ -19,9 +19,9 @@ import ( "time" "github.com/google/uuid" + "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/clone" - "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/epochtime" ) // manager implements evalresult.Manager backed by in-memory. diff --git a/evaluation/evalresult/local/local.go b/evaluation/evalresult/local/local.go index ce8483d31..9f87dd7a1 100644 --- a/evaluation/evalresult/local/local.go +++ b/evaluation/evalresult/local/local.go @@ -21,9 +21,9 @@ import ( "time" "github.com/google/uuid" + "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/clone" - "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/epochtime" ) const ( diff --git a/evaluation/evalset/evalcase.go b/evaluation/evalset/evalcase.go index 91f50fc55..577524925 100644 --- a/evaluation/evalset/evalcase.go +++ b/evaluation/evalset/evalcase.go @@ -11,57 +11,57 @@ package evalset import ( "google.golang.org/genai" - "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/epochtime" + "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" ) // EvalCase represents a single evaluation case. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. 
+// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type EvalCase struct { // EvalID uniquely identifies this evaluation case. - EvalID string `json:"eval_id,omitempty"` + EvalID string `json:"evalId,omitempty"` // Conversation contains the sequence of invocations. Conversation []*Invocation `json:"conversation,omitempty"` // SessionInput contains initialization data for the session. - SessionInput *SessionInput `json:"session_input,omitempty"` + SessionInput *SessionInput `json:"sessionInput,omitempty"` // CreationTimestamp when this eval case was created. - CreationTimestamp *epochtime.EpochTime `json:"creation_timestamp,omitempty"` + CreationTimestamp *epochtime.EpochTime `json:"creationTimestamp,omitempty"` } // Invocation represents a single invocation in a conversation. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type Invocation struct { // InvocationID uniquely identifies this invocation. - InvocationID string `json:"invocation_id,omitempty"` + InvocationID string `json:"invocationId,omitempty"` // UserContent represents the user's input. - UserContent *genai.Content `json:"user_content,omitempty"` + UserContent *genai.Content `json:"userContent,omitempty"` // FinalResponse represents the agent's final response. - FinalResponse *genai.Content `json:"final_response,omitempty"` + FinalResponse *genai.Content `json:"finalResponse,omitempty"` // IntermediateData contains intermediate steps during execution. - IntermediateData *IntermediateData `json:"intermediate_data,omitempty"` + IntermediateData *IntermediateData `json:"intermediateData,omitempty"` // CreationTimestamp when this invocation was created. 
- CreationTimestamp *epochtime.EpochTime `json:"creation_timestamp,omitempty"` + CreationTimestamp *epochtime.EpochTime `json:"creationTimestamp,omitempty"` } // IntermediateData contains intermediate execution data. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type IntermediateData struct { // ToolUses represents tool calls made during execution. - ToolUses []*genai.FunctionCall `json:"tool_uses,omitempty"` + ToolUses []*genai.FunctionCall `json:"toolUses,omitempty"` // ToolResponses represents tool responses made during execution. - ToolResponses []*genai.FunctionResponse `json:"tool_responses,omitempty"` + ToolResponses []*genai.FunctionResponse `json:"toolResponses,omitempty"` // IntermediateResponses represents intermediate responses, including text responses and tool responses. // For each intermediate response, the first element is the author string, // and the second element is the genai.Part slice. - IntermediateResponses [][]any `json:"intermediate_responses,omitempty"` + IntermediateResponses [][]any `json:"intermediateResponses,omitempty"` } // SessionInput represents values that help initialize a session. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type SessionInput struct { // AppName identifies the app. - AppName string `json:"app_name,omitempty"` + AppName string `json:"appName,omitempty"` // UserID identifies the user. - UserID string `json:"user_id,omitempty"` + UserID string `json:"userId,omitempty"` // State contains the initial state of the session. 
State map[string]any `json:"state,omitempty"` } diff --git a/evaluation/evalset/evalset.go b/evaluation/evalset/evalset.go index 37bda3433..707d00ff6 100644 --- a/evaluation/evalset/evalset.go +++ b/evaluation/evalset/evalset.go @@ -13,22 +13,22 @@ package evalset import ( "context" - "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/epochtime" + "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" ) // EvalSet represents a collection of evaluation cases. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type EvalSet struct { // EvalSetID uniquely identifies this evaluation set. - EvalSetID string `json:"eval_set_id,omitempty"` + EvalSetID string `json:"evalSetId,omitempty"` // Name of the evaluation set. Name string `json:"name,omitempty"` // Description of the evaluation set. Description string `json:"description,omitempty"` // EvalCases contains all the evaluation cases. - EvalCases []*EvalCase `json:"eval_cases,omitempty"` + EvalCases []*EvalCase `json:"evalCases,omitempty"` // CreationTimestamp when this eval set was created. - CreationTimestamp *epochtime.EpochTime `json:"creation_timestamp,omitempty"` + CreationTimestamp *epochtime.EpochTime `json:"creationTimestamp,omitempty"` } // Manager defines the interface that an evaluation set manager must satisfy. 
diff --git a/evaluation/evalset/evalset_test.go b/evaluation/evalset/evalset_test.go index 980897a9f..88e28b949 100644 --- a/evaluation/evalset/evalset_test.go +++ b/evaluation/evalset/evalset_test.go @@ -19,16 +19,16 @@ import ( func TestEvalSetJSONRoundTrip(t *testing.T) { jsonData := `{ - "eval_set_id": "test-set", + "evalSetId": "test-set", "name": "Test Set", "description": "Complete eval set JSON for testing.", - "eval_cases": [ + "evalCases": [ { - "eval_id": "case-42", + "evalId": "case-42", "conversation": [ { - "invocation_id": "invoke-1", - "user_content": { + "invocationId": "invoke-1", + "userContent": { "role": "user", "parts": [ { @@ -36,7 +36,7 @@ func TestEvalSetJSONRoundTrip(t *testing.T) { } ] }, - "final_response": { + "finalResponse": { "role": "assistant", "parts": [ { @@ -44,8 +44,8 @@ func TestEvalSetJSONRoundTrip(t *testing.T) { } ] }, - "intermediate_data": { - "tool_uses": [ + "intermediateData": { + "toolUses": [ { "name": "calculator", "args": { @@ -55,7 +55,7 @@ func TestEvalSetJSONRoundTrip(t *testing.T) { } } ], - "tool_responses": [ + "toolResponses": [ { "name": "calculator", "response": { @@ -63,7 +63,7 @@ func TestEvalSetJSONRoundTrip(t *testing.T) { } } ], - "intermediate_responses": [ + "intermediateResponses": [ [ "assistant", [ @@ -74,21 +74,21 @@ func TestEvalSetJSONRoundTrip(t *testing.T) { ] ] }, - "creation_timestamp": 1700000100 + "creationTimestamp": 1700000100 } ], - "session_input": { - "app_name": "demo-app", - "user_id": "user-42", + "sessionInput": { + "appName": "demo-app", + "userId": "user-42", "state": { "language": "en", "isPremium": true } }, - "creation_timestamp": 1700000200 + "creationTimestamp": 1700000200 } ], - "creation_timestamp": 1700000000 + "creationTimestamp": 1700000000 }` var evalSet EvalSet diff --git a/evaluation/evalset/inmemory/inmemory.go b/evaluation/evalset/inmemory/inmemory.go index de207528b..ddcfef080 100644 --- a/evaluation/evalset/inmemory/inmemory.go +++ 
b/evaluation/evalset/inmemory/inmemory.go @@ -18,9 +18,9 @@ import ( "sync" "time" + "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/clone" - "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/epochtime" ) // Manager implements the evalset.Manager interface using in-memory manager. diff --git a/evaluation/evalset/inmemory/inmemory_test.go b/evaluation/evalset/inmemory/inmemory_test.go index 51009c2cd..d2fc12e50 100644 --- a/evaluation/evalset/inmemory/inmemory_test.go +++ b/evaluation/evalset/inmemory/inmemory_test.go @@ -15,8 +15,8 @@ import ( "time" "github.com/stretchr/testify/assert" + "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" - "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/epochtime" ) func TestManager(t *testing.T) { diff --git a/evaluation/evalset/local/local.go b/evaluation/evalset/local/local.go index b2e0e95ed..f8569a0de 100644 --- a/evaluation/evalset/local/local.go +++ b/evaluation/evalset/local/local.go @@ -20,9 +20,9 @@ import ( "sync" "time" + "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/clone" - "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/epochtime" ) const ( diff --git a/evaluation/evaluation.go b/evaluation/evaluation.go index ad1de6616..cd69416fc 100644 --- a/evaluation/evaluation.go +++ b/evaluation/evaluation.go @@ -80,19 +80,19 @@ type agentEvaluator struct { // EvaluationResult contains the aggregated outcome of running an evaluation across multiple runs. type EvaluationResult struct { - AppName string // AppName identifies the agent being evaluated. - EvalSetID string // EvalSetID identifies the evaluation set used in this run. 
- OverallStatus status.EvalStatus // OverallStatus summarizes the aggregated evaluation status across cases. - ExecutionTime time.Duration // ExecutionTime records the total latency for the evaluation run. - EvalCases []*EvaluationCaseResult // EvalCases contains aggregated results for each evaluation case. + AppName string `json:"appName"` // AppName identifies the agent being evaluated. + EvalSetID string `json:"evalSetId"` // EvalSetID identifies the evaluation set used in this run. + OverallStatus status.EvalStatus `json:"overallStatus"` // OverallStatus summarizes the aggregated evaluation status across cases. + ExecutionTime time.Duration `json:"executionTime"` // ExecutionTime records the total latency for the evaluation run. + EvalCases []*EvaluationCaseResult `json:"evalCases"` // EvalCases contains aggregated results for each evaluation case. } // EvaluationCaseResult aggregates the outcome of a single eval case across multiple runs. type EvaluationCaseResult struct { - EvalCaseID string // EvalCaseID identifies the evaluation case. - OverallStatus status.EvalStatus // OverallStatus summarizes the overall status of case across runs. - EvalCaseResults []*evalresult.EvalCaseResult // EvalCaseResults stores the per-run results for this case. - MetricResults []*evalresult.EvalMetricResult // MetricResults lists aggregated metric outcomes across runs. + EvalCaseID string `json:"evalId"` // EvalCaseID identifies the evaluation case. + OverallStatus status.EvalStatus `json:"overallStatus"` // OverallStatus summarizes the overall status of case across runs. + EvalCaseResults []*evalresult.EvalCaseResult `json:"evalCaseResults"` // EvalCaseResults stores the per-run results for this case. + MetricResults []*evalresult.EvalMetricResult `json:"metricsResults"` // MetricResults lists aggregated metric outcomes across runs. } // Evaluate evaluates agent against the specified eval set across multiple runs. 
diff --git a/evaluation/evaluation_test.go b/evaluation/evaluation_test.go index b1032168f..802f13cf8 100644 --- a/evaluation/evaluation_test.go +++ b/evaluation/evaluation_test.go @@ -15,7 +15,6 @@ import ( "testing" "github.com/stretchr/testify/assert" - "trpc.group/trpc-go/trpc-agent-go/agent" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" evalresultinmemory "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult/inmemory" diff --git a/evaluation/evaluator/evaluator.go b/evaluation/evaluator/evaluator.go index 117285fba..db330cbb3 100644 --- a/evaluation/evaluator/evaluator.go +++ b/evaluation/evaluator/evaluator.go @@ -30,7 +30,7 @@ type Evaluator interface { } // EvaluateResult represents the aggregated outcome of running an evaluator over a set of invocations. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web; its JSON field names remain in snake_case. type EvaluateResult struct { // OverallScore is the overall score for this evaluation. OverallScore float64 `json:"overall_score,omitempty"` @@ -41,7 +41,7 @@ type EvaluateResult struct { } // PerInvocationResult represents the evaluation result for a single invocation. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web; its JSON field names remain in snake_case. type PerInvocationResult struct { // ActualInvocation is the invocation generated by the agent. 
ActualInvocation *evalset.Invocation `json:"actual_invocation,omitempty"` diff --git a/evaluation/evaluator/registry/registry.go b/evaluation/evaluator/registry/registry.go index b36dfbfbb..026f20493 100644 --- a/evaluation/evaluator/registry/registry.go +++ b/evaluation/evaluator/registry/registry.go @@ -14,6 +14,7 @@ import ( "errors" "fmt" "os" + "sort" "sync" "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator" @@ -26,6 +27,8 @@ type Registry interface { Register(name string, e evaluator.Evaluator) error // Get retrieves an evaluator by name. Get(name string) (evaluator.Evaluator, error) + // List returns the names of all registered evaluators. + List() []string } // registry is the default implementation of Registry. @@ -72,3 +75,15 @@ func (r *registry) Get(name string) (evaluator.Evaluator, error) { } return nil, fmt.Errorf("get evaluator %s: %w", name, os.ErrNotExist) } + +// List returns the names of all registered evaluators sorted lexicographically. +func (r *registry) List() []string { + r.mu.RLock() + defer r.mu.RUnlock() + names := make([]string, 0, len(r.evaluators)) + for name := range r.evaluators { + names = append(names, name) + } + sort.Strings(names) + return names +} diff --git a/evaluation/go.mod b/evaluation/go.mod index 5334f731d..8653e219d 100644 --- a/evaluation/go.mod +++ b/evaluation/go.mod @@ -8,7 +8,7 @@ require ( github.com/google/uuid v1.6.0 github.com/stretchr/testify v1.10.0 google.golang.org/genai v1.29.0 - trpc.group/trpc-go/trpc-agent-go v0.2.2 + trpc.group/trpc-go/trpc-agent-go v0.0.0-00010101000000-000000000000 ) require ( diff --git a/evaluation/go.sum b/evaluation/go.sum index 0f334ae4e..9c8206ce8 100644 --- a/evaluation/go.sum +++ b/evaluation/go.sum @@ -66,8 +66,8 @@ github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuE github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/openai/openai-go v1.12.0 h1:NBQCnXzqOTv5wsgNC36PrFEiskGfO5wccfCWDo9S1U0= 
github.com/openai/openai-go v1.12.0/go.mod h1:g461MYGXEXBVdV5SaR/5tNzNbSfwTBBefwc+LlDCK0Y= -github.com/panjf2000/ants/v2 v2.9.0 h1:SztCLkVxBRigbg+vt0S5QvF5vxAbxbKt09/YfAJ0tEo= -github.com/panjf2000/ants/v2 v2.9.0/go.mod h1:7ZxyxsqE4vvW0M7LSD8aI3cKwgFhBHbxnlN8mDqHa1I= +github.com/panjf2000/ants/v2 v2.10.0 h1:zhRg1pQUtkyRiOFo2Sbqwjp0GfBNo9cUY2/Grpx1p+8= +github.com/panjf2000/ants/v2 v2.10.0/go.mod h1:7ZxyxsqE4vvW0M7LSD8aI3cKwgFhBHbxnlN8mDqHa1I= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= diff --git a/evaluation/metric/metric.go b/evaluation/metric/metric.go index 3073759b8..e8fb4beb2 100644 --- a/evaluation/metric/metric.go +++ b/evaluation/metric/metric.go @@ -13,10 +13,10 @@ package metric import "context" // EvalMetric represents a metric used to evaluate a particular aspect of an eval case. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web, with field names in camel to align with the JSON format. type EvalMetric struct { // MetricName identifies the metric. - MetricName string `json:"metric_name,omitempty"` + MetricName string `json:"metricName,omitempty"` // Threshold value for this metric. 
Threshold float64 `json:"threshold,omitempty"` } diff --git a/evaluation/metric/metric_test.go b/evaluation/metric/metric_test.go index 27bbc8031..279b05cca 100644 --- a/evaluation/metric/metric_test.go +++ b/evaluation/metric/metric_test.go @@ -24,7 +24,7 @@ func TestEvalMetricJSONMarshalling(t *testing.T) { data, err := json.Marshal(metric) assert.NoError(t, err) - assert.JSONEq(t, `{"metric_name":"accuracy","threshold":0.8}`, string(data)) + assert.JSONEq(t, `{"metricName":"accuracy","threshold":0.8}`, string(data)) var decoded EvalMetric err = json.Unmarshal(data, &decoded) diff --git a/evaluation/service/local/local.go b/evaluation/service/local/local.go index 52eeeb3e0..70cb015ff 100644 --- a/evaluation/service/local/local.go +++ b/evaluation/service/local/local.go @@ -14,14 +14,15 @@ import ( "context" "errors" "fmt" + "os" "slices" "time" + "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator" "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/registry" - "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/epochtime" istatus "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/status" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" "trpc.group/trpc-go/trpc-agent-go/evaluation/service" @@ -193,6 +194,9 @@ func (s *local) evaluatePerCase(ctx context.Context, inferenceResult *service.In for _, evalMetric := range evaluateConfig.EvalMetrics { result, err := s.evaluateMetric(ctx, evalMetric, inferenceResult.Inferences, evalCase.Conversation) if err != nil { + if errors.Is(err, os.ErrNotExist) { + continue + } return nil, fmt.Errorf("run evaluation for metric %s: %w", evalMetric.MetricName, err) } overallMetricResults = append(overallMetricResults, &evalresult.EvalMetricResult{ diff --git a/evaluation/service/local/local_test.go b/evaluation/service/local/local_test.go index 
93542d866..93b738719 100644 --- a/evaluation/service/local/local_test.go +++ b/evaluation/service/local/local_test.go @@ -376,18 +376,21 @@ func TestLocalEvaluatePerCaseErrors(t *testing.T) { } tests := []struct { - name string - setup func(t *testing.T) (*local, *service.InferenceResult, *service.EvaluateConfig) + name string + expectErr bool + setup func(t *testing.T) (*local, *service.InferenceResult, *service.EvaluateConfig) }{ { - name: "nil inference result", + name: "nil inference result", + expectErr: true, setup: func(t *testing.T) (*local, *service.InferenceResult, *service.EvaluateConfig) { svc, _, _ := prepare(t) return svc, nil, &service.EvaluateConfig{} }, }, { - name: "nil evaluate config", + name: "nil evaluate config", + expectErr: true, setup: func(t *testing.T) (*local, *service.InferenceResult, *service.EvaluateConfig) { svc, _, _ := prepare(t) inference := makeInferenceResult(appName, evalSetID, "case", "session", nil) @@ -395,7 +398,8 @@ func TestLocalEvaluatePerCaseErrors(t *testing.T) { }, }, { - name: "missing eval case", + name: "missing eval case", + expectErr: true, setup: func(t *testing.T) (*local, *service.InferenceResult, *service.EvaluateConfig) { svc, _, _ := prepare(t) inference := makeInferenceResult(appName, evalSetID, "missing", "session", []*evalset.Invocation{}) @@ -404,7 +408,8 @@ func TestLocalEvaluatePerCaseErrors(t *testing.T) { }, }, { - name: "invalid eval case", + name: "invalid eval case", + expectErr: true, setup: func(t *testing.T) (*local, *service.InferenceResult, *service.EvaluateConfig) { svc, mgr, _ := prepare(t) _, err := mgr.Create(ctx, appName, evalSetID) @@ -422,7 +427,8 @@ func TestLocalEvaluatePerCaseErrors(t *testing.T) { }, }, { - name: "mismatched inference count", + name: "mismatched inference count", + expectErr: true, setup: func(t *testing.T) (*local, *service.InferenceResult, *service.EvaluateConfig) { svc, mgr, _ := prepare(t) _, err := mgr.Create(ctx, appName, evalSetID) @@ -434,7 +440,8 @@ func 
TestLocalEvaluatePerCaseErrors(t *testing.T) { }, }, { - name: "missing evaluator", + name: "missing evaluator", + expectErr: false, setup: func(t *testing.T) (*local, *service.InferenceResult, *service.EvaluateConfig) { svc, mgr, _ := prepare(t) _, err := mgr.Create(ctx, appName, evalSetID) @@ -447,7 +454,8 @@ func TestLocalEvaluatePerCaseErrors(t *testing.T) { }, }, { - name: "per invocation mismatch", + name: "per invocation mismatch", + expectErr: true, setup: func(t *testing.T) (*local, *service.InferenceResult, *service.EvaluateConfig) { svc, mgr, reg := prepare(t) _, err := mgr.Create(ctx, appName, evalSetID) @@ -470,7 +478,8 @@ func TestLocalEvaluatePerCaseErrors(t *testing.T) { }, }, { - name: "summarize failure", + name: "summarize failure", + expectErr: true, setup: func(t *testing.T) (*local, *service.InferenceResult, *service.EvaluateConfig) { svc, mgr, reg := prepare(t) _, err := mgr.Create(ctx, appName, evalSetID) @@ -493,7 +502,8 @@ func TestLocalEvaluatePerCaseErrors(t *testing.T) { }, }, { - name: "evaluator error", + name: "evaluator error", + expectErr: true, setup: func(t *testing.T) (*local, *service.InferenceResult, *service.EvaluateConfig) { svc, mgr, reg := prepare(t) _, err := mgr.Create(ctx, appName, evalSetID) @@ -514,7 +524,11 @@ func TestLocalEvaluatePerCaseErrors(t *testing.T) { t.Run(tc.name, func(t *testing.T) { svc, inference, config := tc.setup(t) _, err := svc.evaluatePerCase(ctx, inference, config) - assert.Error(t, err) + if tc.expectErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } }) } } diff --git a/evaluation/service/service.go b/evaluation/service/service.go index 07487fc04..92e0f5170 100644 --- a/evaluation/service/service.go +++ b/evaluation/service/service.go @@ -30,52 +30,52 @@ type Service interface { } // InferenceRequest represents a request for running the agent inference on an eval set. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. 
+// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type InferenceRequest struct { // AppName is the name of the app. - AppName string `json:"app_name,omitempty"` + AppName string `json:"appName,omitempty"` // EvalSetID is the ID of the eval set. - EvalSetID string `json:"eval_set_id,omitempty"` + EvalSetID string `json:"evalSetId,omitempty"` // EvalCaseIDs are the IDs of eval cases to process. // If not specified, all eval cases in the eval set will be processed. - EvalCaseIDs []string `json:"eval_case_ids,omitempty"` + EvalCaseIDs []string `json:"evalCaseIds,omitempty"` } // InferenceResult contains the inference results for a single eval case. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type InferenceResult struct { // AppName is the name of the app. - AppName string `json:"app_name,omitempty"` + AppName string `json:"appName,omitempty"` // EvalSetID is the ID of the eval set. - EvalSetID string `json:"eval_set_id,omitempty"` + EvalSetID string `json:"evalSetId,omitempty"` // EvalCaseID is the ID of the eval case. - EvalCaseID string `json:"eval_case_id,omitempty"` + EvalCaseID string `json:"evalCaseId,omitempty"` // Inferences are the inference results. Inferences []*evalset.Invocation `json:"inferences,omitempty"` // SessionID is the ID of the inference session. - SessionID string `json:"session_id,omitempty"` + SessionID string `json:"sessionId,omitempty"` // Status is the status of the inference. Status status.EvalStatus `json:"status,omitempty"` // ErrorMessage contains the error message if inference failed. - ErrorMessage string `json:"error_message,omitempty"` + ErrorMessage string `json:"errorMessage,omitempty"` } // EvaluateRequest represents a request for running the evaluation on the inference results.
-// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type EvaluateRequest struct { // AppName is the name of the app. - AppName string `json:"app_name,omitempty"` + AppName string `json:"appName,omitempty"` // EvalSetID is the ID of the eval set. - EvalSetID string `json:"eval_set_id,omitempty"` + EvalSetID string `json:"evalSetId,omitempty"` // InferenceResults are the inference results to be evaluated. - InferenceResults []*InferenceResult `json:"inference_results,omitempty"` + InferenceResults []*InferenceResult `json:"inferenceResults,omitempty"` // EvaluateConfig contains the evaluation configuration used during evaluation. - EvaluateConfig *EvaluateConfig `json:"evaluate_config,omitempty"` + EvaluateConfig *EvaluateConfig `json:"evaluateConfig,omitempty"` } // EvaluateConfig contains evaluation configuration used during evaluation. -// It mirrors the schema used by ADK Web, with field names in snake_case to align with the JSON format. +// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format. type EvaluateConfig struct { // EvalMetrics contains the metrics to be evaluated. - EvalMetrics []*metric.EvalMetric `json:"eval_metrics,omitempty"` + EvalMetrics []*metric.EvalMetric `json:"evalMetrics,omitempty"` } diff --git a/examples/evaluation/debug/README.md b/examples/evaluation/debug/README.md new file mode 100644 index 000000000..2f7d5c651 --- /dev/null +++ b/examples/evaluation/debug/README.md @@ -0,0 +1,67 @@ +# Debug + Evaluation Server Example + +This sample pairs the trpc-agent-go debug server with the evaluation pipeline. +It lets you collect real conversations through ADK Web, convert them into eval +sets, persist metric definitions, and run scoring jobs directly from the UI.
+ +## Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `OPENAI_API_KEY` | API key for the OpenAI-compatible backend (required) | `` | +| `OPENAI_BASE_URL` | Base URL for the LLM endpoint | `https://api.openai.com/v1` | + +## Command Line Flags + +| Flag | Description | Default | +|------|-------------|---------| +| `-model` | Model identifier used by the demo agent | `deepseek-chat` | +| `-addr` | HTTP listen address | `:8080` | +| `-app` | Application name exposed to ADK Web | `evaluation-assistant` | +| `-data-dir` | Directory containing eval set JSON + metric configs | `./data` | +| `-output-dir` | Directory where eval results are persisted | `./output` | + +## Run the Server + +```bash +cd examples/evaluation/debug +OPENAI_API_KEY=sk-your-key \ +go run . \ + -model deepseek-chat \ + -addr 127.0.0.1:8080 \ + -app assistant \ + -data-dir ./data \ + -output-dir ./output +``` + +The process keeps running until interrupted. Sessions themselves remain +in-memory, but eval sets, metric definitions, and eval results live under the +directories specified by `-data-dir` and `-output-dir`. + +## Use with ADK Web + +1. Clone [ADK Web](https://github.com/google/adk-web), install dependencies, and + start it pointing to `http://127.0.0.1:8080`. +2. Create a chat session in the UI under the `-app` name and talk to the agent. + Events are streamed and stored verbatim. +3. Convert a completed session into an eval case via the UI (or the REST API). + The server stores the resulting `*.evalset.json` and `*.metrics.json` inside + `-data-dir`. +4. Run evaluations from the Eval tab or call `/apps/{app}/eval-sets/{id}/run`. + Metrics come from the stored configs; overriding them in the request also + persists the new values. +5. Download the produced eval results either from the UI or under + `-output-dir/{app}`.
+ +## Data Layout + +``` +data/ +└── assistant/ + ├── math-demo.evalset.json + └── math-demo.metrics.json + +output/ +└── assistant/ + └── assistant_math-demo_.evalset_result.json +``` diff --git a/examples/evaluation/debug/agent.go b/examples/evaluation/debug/agent.go new file mode 100644 index 000000000..25f6d0387 --- /dev/null +++ b/examples/evaluation/debug/agent.go @@ -0,0 +1,118 @@ +package main + +import ( + "context" + "strings" + "time" + + "trpc.group/trpc-go/trpc-agent-go/agent" + "trpc.group/trpc-go/trpc-agent-go/agent/llmagent" + "trpc.group/trpc-go/trpc-agent-go/model" + "trpc.group/trpc-go/trpc-agent-go/model/openai" + "trpc.group/trpc-go/trpc-agent-go/tool" + "trpc.group/trpc-go/trpc-agent-go/tool/function" +) + +const ( + calculatorToolName = "calculator" + timeToolName = "current_time" +) + +// newDemoAgent wires a calculator and a clock tool into a single LLM agent. +func newDemoAgent(agentName, modelName string, stream bool) agent.Agent { + calculatorTool := function.NewFunctionTool( + calculate, + function.WithName(calculatorToolName), + function.WithDescription("Perform arithmetic operations including add, subtract, multiply, and divide."), + ) + timeTool := function.NewFunctionTool( + getCurrentTime, + function.WithName(timeToolName), + function.WithDescription("Return the current date and time for the requested timezone."), + ) + cfg := model.GenerationConfig{ + MaxTokens: intPtr(1024), + Temperature: floatPtr(0.3), + Stream: stream, + } + return llmagent.New( + agentName, + llmagent.WithModel(openai.New(modelName)), + llmagent.WithInstruction("Use the calculator tool for math related questions and the current_time tool for timezone lookups."), + llmagent.WithDescription("Demo agent used by the debug+evaluation example."), + llmagent.WithTools([]tool.Tool{calculatorTool, timeTool}), + llmagent.WithGenerationConfig(cfg), + ) +} + +type calculatorArgs struct { + Operation string `json:"operation"` + A float64 `json:"a"` + B float64 `json:"b"` 
// calculatorArgs is the JSON payload decoded for a calculator tool call.
type calculatorArgs struct {
	// Operation selects the arithmetic operation; calculate accepts
	// "add"/"+", "subtract"/"-", "multiply"/"*" and "divide"/"/",
	// matched case-insensitively.
	Operation string `json:"operation"`
	// A is the left-hand operand.
	A float64 `json:"a"`
	// B is the right-hand operand.
	B float64 `json:"b"`
}

// calculatorResult echoes the request and carries the computed value.
type calculatorResult struct {
	Operation string  `json:"operation"`
	A         float64 `json:"a"`
	B         float64 `json:"b"`
	Result    float64 `json:"result"`
}

// timeArgs is the JSON payload decoded for a current_time tool call.
type timeArgs struct {
	Timezone string `json:"timezone"`
}

// timeResult is the JSON payload returned by the current_time tool.
type timeResult struct {
	Timezone string `json:"timezone"`
	Time     string `json:"time"`
	Date     string `json:"date"`
	Weekday  string `json:"weekday"`
}

// calcError is a string-backed error type. It lets calculate report failures
// using only the packages this file already imports.
type calcError string

// Error implements the error interface.
func (e calcError) Error() string { return string(e) }

// errDivideByZero is returned by calculate when a division has a zero divisor.
const errDivideByZero calcError = "division by zero"

// calculate executes a math operation requested by the agent.
//
// It returns the operands, the operation as given, and the exact arithmetic
// result. A division by zero or an unrecognised operation yields a zero
// calculatorResult and a non-nil error rather than a silently wrong number.
func calculate(_ context.Context, args calculatorArgs) (calculatorResult, error) {
	var result float64
	switch strings.ToLower(args.Operation) {
	case "add", "+":
		result = args.A + args.B
	case "subtract", "-":
		result = args.A - args.B
	case "multiply", "*":
		result = args.A * args.B
	case "divide", "/":
		if args.B == 0 {
			return calculatorResult{}, errDivideByZero
		}
		result = args.A / args.B
	default:
		return calculatorResult{}, calcError("unsupported operation: " + args.Operation)
	}
	return calculatorResult{
		Operation: args.Operation,
		A:         args.A,
		B:         args.B,
		Result:    result,
	}, nil
}

// getCurrentTime returns the local time for the requested timezone.
+func getCurrentTime(_ context.Context, args timeArgs) (timeResult, error) { + loc := time.Local + if args.Timezone != "" { + if tz, err := time.LoadLocation(args.Timezone); err == nil { + loc = tz + } + } + now := time.Now().In(loc) + return timeResult{ + Timezone: loc.String(), + Time: now.Format("15:04:05"), + Date: now.Format("2006-01-02"), + Weekday: now.Weekday().String(), + }, nil +} + +func floatPtr(val float64) *float64 { + return &val +} + +func intPtr(val int) *int { + return &val +} diff --git a/examples/evaluation/debug/main.go b/examples/evaluation/debug/main.go new file mode 100644 index 000000000..206578ad4 --- /dev/null +++ b/examples/evaluation/debug/main.go @@ -0,0 +1,48 @@ +package main + +import ( + "flag" + "net/http" + + "trpc.group/trpc-go/trpc-agent-go/agent" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" + evalresultlocal "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult/local" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + evalsetlocal "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset/local" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + metriclocal "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/local" + "trpc.group/trpc-go/trpc-agent-go/log" + "trpc.group/trpc-go/trpc-agent-go/server/debug" +) + +const ( + defaultListenAddr = ":8080" + defaultAppName = "evaluation-assistant" +) + +func main() { + modelName := flag.String("model", "deepseek-chat", "Name of the model to use.") + addr := flag.String("addr", defaultListenAddr, "Listen address.") + appName := flag.String("app", defaultAppName, "App name registered in the debug server.") + dataDir := flag.String("data-dir", "./data", "Directory where eval sets and metric configs are stored.") + outputDir := flag.String("output-dir", "./output", "Directory where eval results are stored.") + flag.Parse() + + agents := map[string]agent.Agent{ + *appName: newDemoAgent(*appName, *modelName, true), + } + evalSetManager := 
evalsetlocal.New(evalset.WithBaseDir(*dataDir)) + evalResultManager := evalresultlocal.New(evalresult.WithBaseDir(*outputDir)) + metricManager := metriclocal.New(metric.WithBaseDir(*dataDir)) + server := debug.New( + agents, + debug.WithEvalSetManager(evalSetManager), + debug.WithEvalResultManager(evalResultManager), + debug.WithMetricManager(metricManager), + ) + + log.Infof("debug+evaluation server listening on %s (app=%s, model=%s)", *addr, *appName, *modelName) + if err := http.ListenAndServe(*addr, server.Handler()); err != nil { + log.Fatalf("server error: %v", err) + } +} diff --git a/examples/evaluation/go.mod b/examples/evaluation/go.mod index b079f9fbf..8e24d2a3d 100644 --- a/examples/evaluation/go.mod +++ b/examples/evaluation/go.mod @@ -5,12 +5,14 @@ go 1.24.4 replace ( trpc.group/trpc-go/trpc-agent-go => ../../ trpc.group/trpc-go/trpc-agent-go/evaluation => ../../evaluation + trpc.group/trpc-go/trpc-agent-go/server/debug => ../../server/debug ) require ( - google.golang.org/genai v1.31.0 + google.golang.org/genai v1.33.0 trpc.group/trpc-go/trpc-agent-go v0.2.2 trpc.group/trpc-go/trpc-agent-go/evaluation v0.0.0-00010101000000-000000000000 + trpc.group/trpc-go/trpc-agent-go/server/debug v0.0.0-00010101000000-000000000000 ) require ( @@ -19,37 +21,40 @@ require ( cloud.google.com/go/compute/metadata v0.5.0 // indirect github.com/bmatcuk/doublestar/v4 v4.9.1 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect - github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect - github.com/google/go-cmp v0.6.0 // indirect + github.com/google/go-cmp v0.7.0 // indirect github.com/google/s2a-go v0.1.8 // indirect github.com/google/uuid v1.6.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect + github.com/gorilla/mux v1.8.1 // indirect github.com/gorilla/websocket v1.5.3 // 
indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect github.com/openai/openai-go v1.12.0 // indirect github.com/panjf2000/ants/v2 v2.10.0 // indirect + github.com/rs/cors v1.11.1 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/tidwall/gjson v1.14.4 // indirect github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.1 // indirect github.com/tidwall/sjson v1.2.5 // indirect go.opencensus.io v0.24.0 // indirect - go.opentelemetry.io/otel v1.29.0 // indirect + go.opentelemetry.io/auto/sdk v1.1.0 // indirect + go.opentelemetry.io/otel v1.38.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.29.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0 // indirect - go.opentelemetry.io/otel/metric v1.29.0 // indirect - go.opentelemetry.io/otel/sdk v1.29.0 // indirect - go.opentelemetry.io/otel/trace v1.29.0 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/otel/sdk v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect go.opentelemetry.io/proto/otlp v1.3.1 // indirect go.uber.org/multierr v1.10.0 // indirect go.uber.org/zap v1.27.0 // indirect golang.org/x/crypto v0.32.0 // indirect golang.org/x/net v0.34.0 // indirect golang.org/x/sync v0.10.0 // indirect - golang.org/x/sys v0.30.0 // indirect + golang.org/x/sys v0.35.0 // indirect golang.org/x/text v0.21.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect diff --git a/examples/evaluation/go.sum b/examples/evaluation/go.sum index 6c9098e37..7d199233e 100644 --- a/examples/evaluation/go.sum +++ b/examples/evaluation/go.sum @@ -21,8 +21,8 @@ github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.m 
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= @@ -45,8 +45,8 @@ github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/s2a-go v0.1.8 h1:zZDs9gcbt9ZPLV0ndSyQk6Kacx2g/X+SKYovpnz3SMM= github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -54,6 +54,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod 
h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gTgghdIA6Stxb52D5RnLI1SLyw= github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA= +github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= +github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 h1:asbCHRVmodnJTuQ3qamDwqVOIjwqUPTYmYuemVOx+Ys= @@ -67,6 +69,8 @@ github.com/panjf2000/ants/v2 v2.10.0/go.mod h1:7ZxyxsqE4vvW0M7LSD8aI3cKwgFhBHbxn github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/rs/cors v1.11.1 h1:eU3gRzXLRK57F5rKMGMZURNdIG4EoAmX8k94r9wXWHA= +github.com/rs/cors v1.11.1/go.mod h1:XyqrcTp5zjWr1wsJ8PIRZssZ8b/WMcMf71DJnit4EMU= github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -76,8 +80,8 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 
-github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM= github.com/tidwall/gjson v1.14.4/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= @@ -90,22 +94,24 @@ github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= -go.opentelemetry.io/otel v1.29.0 h1:PdomN/Al4q/lN6iBJEN3AwPvUiHPMlt93c8bqTG5Llw= -go.opentelemetry.io/otel v1.29.0/go.mod h1:N/WtXPs1CNCUEx+Agz5uouwCba+i+bJGFicT8SR4NP8= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0 h1:dIIDULZJpgdiHz5tXrTgKIMLkus6jEFa7x5SOKcyR7E= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0/go.mod h1:jlRVBe7+Z1wyxFSUs48L6OBQZ5JwH2Hg/Vbl+t9rAgI= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.29.0 h1:nSiV3s7wiCam610XcLbYOmMfJxB9gO4uK3Xgv5gmTgg= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.29.0/go.mod h1:hKn/e/Nmd19/x1gvIHwtOwVWM+VhuITSWip3JUDghj0= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0 h1:JAv0Jwtl01UFiyWZEMiJZBiTlv5A50zNs8lsthXqIio= 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0/go.mod h1:QNKLmUEAq2QUbPQUfvw4fmv0bgbK7UlOSFCnXyfvSNc= -go.opentelemetry.io/otel/metric v1.29.0 h1:vPf/HFWTNkPu1aYeIsc98l4ktOQaL6LeSoeV2g+8YLc= -go.opentelemetry.io/otel/metric v1.29.0/go.mod h1:auu/QWieFVWx+DmQOUMgj0F8LHWdgalxXqvp7BII/W8= -go.opentelemetry.io/otel/sdk v1.29.0 h1:vkqKjk7gwhS8VaWb0POZKmIEDimRCMsopNYnriHyryo= -go.opentelemetry.io/otel/sdk v1.29.0/go.mod h1:pM8Dx5WKnvxLCb+8lG1PRNIDxu9g9b9g59Qr7hfAAok= -go.opentelemetry.io/otel/sdk/metric v1.29.0 h1:K2CfmJohnRgvZ9UAj2/FhIf/okdWcNdBwe1m8xFXiSY= -go.opentelemetry.io/otel/sdk/metric v1.29.0/go.mod h1:6zZLdCl2fkauYoZIOn/soQIDSWFmNSRcICarHfuhNJQ= -go.opentelemetry.io/otel/trace v1.29.0 h1:J/8ZNK4XgR7a21DZUAsbF8pZ5Jcw1VhACmnYt39JTi4= -go.opentelemetry.io/otel/trace v1.29.0/go.mod h1:eHl3w0sp3paPkYstJOmAimxhiFXPg+MMTlEh3nsQgWQ= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= @@ -141,8 +147,8 @@ golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys 
v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= -golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= @@ -155,8 +161,8 @@ golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBn golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/genai v1.31.0 h1:R7xDt/Dosz11vcXbZ4IgisGnzUGGau2PZOIOAnXsYjw= -google.golang.org/genai v1.31.0/go.mod h1:7pAilaICJlQBonjKKJNhftDFv3SREhZcTe9F6nRcjbg= +google.golang.org/genai v1.33.0 h1:DExzJZbSbxSRmwX2gCsZ+V9vb6rjdmsOAy47ASBgKvg= +google.golang.org/genai v1.33.0/go.mod h1:7pAilaICJlQBonjKKJNhftDFv3SREhZcTe9F6nRcjbg= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= diff --git a/go.mod b/go.mod index a20b83452..9e59c4a21 100644 --- 
a/go.mod +++ b/go.mod @@ -8,12 +8,10 @@ require ( github.com/gonfva/docxlib v0.0.0-20210517191039-d8f39cecf1ad github.com/google/go-cmp v0.6.0 github.com/google/uuid v1.6.0 - github.com/gorilla/mux v1.8.1 github.com/hashicorp/go-multierror v1.1.1 github.com/mattn/go-sqlite3 v1.14.32 github.com/openai/openai-go v1.12.0 - github.com/panjf2000/ants/v2 v2.9.0 - github.com/rs/cors v1.11.1 + github.com/panjf2000/ants/v2 v2.10.0 github.com/spaolacci/murmur3 v1.1.0 github.com/stretchr/testify v1.10.0 github.com/tencentyun/cos-go-sdk-v5 v0.7.69 diff --git a/go.sum b/go.sum index 66d69cf63..79dce7615 100644 --- a/go.sum +++ b/go.sum @@ -40,8 +40,6 @@ github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= -github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 h1:asbCHRVmodnJTuQ3qamDwqVOIjwqUPTYmYuemVOx+Ys= github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0/go.mod h1:ggCgvZ2r7uOoQjOyu2Y1NhHmEPPzzuhWgcza5M1Ji1I= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -82,16 +80,14 @@ github.com/mozillazg/go-httpheader v0.2.1 h1:geV7TrjbL8KXSyvghnFm+NyTux/hxwueTSr github.com/mozillazg/go-httpheader v0.2.1/go.mod h1:jJ8xECTlalr6ValeXYdOF8fFUISeBAdw6E61aqQma60= github.com/openai/openai-go v1.12.0 h1:NBQCnXzqOTv5wsgNC36PrFEiskGfO5wccfCWDo9S1U0= github.com/openai/openai-go v1.12.0/go.mod h1:g461MYGXEXBVdV5SaR/5tNzNbSfwTBBefwc+LlDCK0Y= -github.com/panjf2000/ants/v2 v2.9.0 h1:SztCLkVxBRigbg+vt0S5QvF5vxAbxbKt09/YfAJ0tEo= -github.com/panjf2000/ants/v2 v2.9.0/go.mod h1:7ZxyxsqE4vvW0M7LSD8aI3cKwgFhBHbxnlN8mDqHa1I= 
+github.com/panjf2000/ants/v2 v2.10.0 h1:zhRg1pQUtkyRiOFo2Sbqwjp0GfBNo9cUY2/Grpx1p+8= +github.com/panjf2000/ants/v2 v2.10.0/go.mod h1:7ZxyxsqE4vvW0M7LSD8aI3cKwgFhBHbxnlN8mDqHa1I= github.com/perimeterx/marshmallow v1.1.5 h1:a2LALqQ1BlHM8PZblsDdidgv1mWi1DgC2UmX50IvK2s= github.com/perimeterx/marshmallow v1.1.5/go.mod h1:dsXbUu8CRzfYP5a87xpp0xq9S3u0Vchtcl8we9tYaXw= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= -github.com/rs/cors v1.11.1 h1:eU3gRzXLRK57F5rKMGMZURNdIG4EoAmX8k94r9wXWHA= -github.com/rs/cors v1.11.1/go.mod h1:XyqrcTp5zjWr1wsJ8PIRZssZ8b/WMcMf71DJnit4EMU= github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529/go.mod h1:qe5TWALJ8/a1Lqznoc5BDHpYX/8HU60Hm2AwRmqzxqA= github.com/segmentio/asm v1.2.0 h1:9BQrFxC+YOHJlTlHGkTrFWf59nbL3XnCoFLTwDCI7ys= github.com/segmentio/asm v1.2.0/go.mod h1:BqMnlJP91P8d+4ibuonYZw9mfnzI9HfxselHZr5aAcs= diff --git a/server/debug/go.mod b/server/debug/go.mod new file mode 100644 index 000000000..5df137ef3 --- /dev/null +++ b/server/debug/go.mod @@ -0,0 +1,66 @@ +module trpc.group/trpc-go/trpc-agent-go/server/debug + +go 1.24.4 + +replace ( + trpc.group/trpc-go/trpc-agent-go => ../../ + trpc.group/trpc-go/trpc-agent-go/evaluation => ../../evaluation +) + +require ( + github.com/gorilla/mux v1.8.1 + github.com/rs/cors v1.11.1 + github.com/stretchr/testify v1.11.1 + go.opentelemetry.io/otel v1.38.0 + go.opentelemetry.io/otel/sdk v1.38.0 + go.opentelemetry.io/otel/trace v1.38.0 + google.golang.org/genai v1.33.0 + trpc.group/trpc-go/trpc-agent-go v0.0.0-00010101000000-000000000000 + trpc.group/trpc-go/trpc-agent-go/evaluation v0.0.0-00010101000000-000000000000 +) + +require ( + cloud.google.com/go v0.116.0 // indirect + 
cloud.google.com/go/auth v0.9.3 // indirect + cloud.google.com/go/compute/metadata v0.5.0 // indirect + github.com/bmatcuk/doublestar/v4 v4.9.1 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/google/go-cmp v0.7.0 // indirect + github.com/google/s2a-go v0.1.8 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect + github.com/gorilla/websocket v1.5.3 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect + github.com/openai/openai-go v1.12.0 // indirect + github.com/panjf2000/ants/v2 v2.10.0 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/spaolacci/murmur3 v1.1.0 // indirect + github.com/tidwall/gjson v1.14.4 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.1 // indirect + github.com/tidwall/sjson v1.2.5 // indirect + go.opencensus.io v0.24.0 // indirect + go.opentelemetry.io/auto/sdk v1.1.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.29.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/proto/otlp v1.3.1 // indirect + go.uber.org/multierr v1.10.0 // indirect + go.uber.org/zap v1.27.0 // indirect + golang.org/x/crypto v0.32.0 // indirect + golang.org/x/net v0.34.0 // indirect + golang.org/x/sync v0.10.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.21.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 // indirect + google.golang.org/genproto/googleapis/rpc 
v0.0.0-20240903143218-8af14fe29dc1 // indirect + google.golang.org/grpc v1.66.2 // indirect + google.golang.org/protobuf v1.34.2 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + trpc.group/trpc-go/trpc-a2a-go v0.2.5-0.20251023030722-7f02b57fd14a // indirect +) diff --git a/server/debug/go.sum b/server/debug/go.sum new file mode 100644 index 000000000..a73060dc5 --- /dev/null +++ b/server/debug/go.sum @@ -0,0 +1,206 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.116.0 h1:B3fRrSDkLRt5qSHWe40ERJvhvnQwdZiHu0bJOpldweE= +cloud.google.com/go v0.116.0/go.mod h1:cEPSRWPzZEswwdr9BxE6ChEn01dWlTaF05LiC2Xs70U= +cloud.google.com/go/auth v0.9.3 h1:VOEUIAADkkLtyfr3BLa3R8Ed/j6w1jTBmARx+wb5w5U= +cloud.google.com/go/auth v0.9.3/go.mod h1:7z6VY+7h3KUdRov5F1i8NDP5ZzWKYmEPO842BgCsmTk= +cloud.google.com/go/compute/metadata v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY= +cloud.google.com/go/compute/metadata v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/bmatcuk/doublestar/v4 v4.9.1 h1:X8jg9rRZmJd4yRy7ZeNDRnM+T3ZfHv15JiBJ/avrEXE= +github.com/bmatcuk/doublestar/v4 v4.9.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= 
+github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/s2a-go v0.1.8 h1:zZDs9gcbt9ZPLV0ndSyQk6Kacx2g/X+SKYovpnz3SMM= +github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA= +github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gTgghdIA6Stxb52D5RnLI1SLyw= +github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA= +github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= +github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= +github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= 
+github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 h1:asbCHRVmodnJTuQ3qamDwqVOIjwqUPTYmYuemVOx+Ys= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0/go.mod h1:ggCgvZ2r7uOoQjOyu2Y1NhHmEPPzzuhWgcza5M1Ji1I= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs= +github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= +github.com/openai/openai-go v1.12.0 h1:NBQCnXzqOTv5wsgNC36PrFEiskGfO5wccfCWDo9S1U0= +github.com/openai/openai-go v1.12.0/go.mod h1:g461MYGXEXBVdV5SaR/5tNzNbSfwTBBefwc+LlDCK0Y= +github.com/panjf2000/ants/v2 v2.10.0 h1:zhRg1pQUtkyRiOFo2Sbqwjp0GfBNo9cUY2/Grpx1p+8= +github.com/panjf2000/ants/v2 v2.10.0/go.mod h1:7ZxyxsqE4vvW0M7LSD8aI3cKwgFhBHbxnlN8mDqHa1I= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/rs/cors v1.11.1 h1:eU3gRzXLRK57F5rKMGMZURNdIG4EoAmX8k94r9wXWHA= +github.com/rs/cors v1.11.1/go.mod h1:XyqrcTp5zjWr1wsJ8PIRZssZ8b/WMcMf71DJnit4EMU= +github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= +github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= 
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM= +github.com/tidwall/gjson v1.14.4/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= +go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= 
+go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0 h1:dIIDULZJpgdiHz5tXrTgKIMLkus6jEFa7x5SOKcyR7E= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0/go.mod h1:jlRVBe7+Z1wyxFSUs48L6OBQZ5JwH2Hg/Vbl+t9rAgI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.29.0 h1:nSiV3s7wiCam610XcLbYOmMfJxB9gO4uK3Xgv5gmTgg= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.29.0/go.mod h1:hKn/e/Nmd19/x1gvIHwtOwVWM+VhuITSWip3JUDghj0= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0 h1:JAv0Jwtl01UFiyWZEMiJZBiTlv5A50zNs8lsthXqIio= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0/go.mod h1:QNKLmUEAq2QUbPQUfvw4fmv0bgbK7UlOSFCnXyfvSNc= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= +go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.10.0 
h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ= +go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc= +golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0= +golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod 
h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools 
v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/genai v1.33.0 h1:DExzJZbSbxSRmwX2gCsZ+V9vb6rjdmsOAy47ASBgKvg= +google.golang.org/genai v1.33.0/go.mod h1:7pAilaICJlQBonjKKJNhftDFv3SREhZcTe9F6nRcjbg= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 h1:hjSy6tcFQZ171igDaN5QHOw2n6vx40juYbC/x67CEhc= +google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:qpvKtACPCQhAdu3PyQgV4l3LMXZEtft7y8QcarRsp9I= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 h1:pPJltXNxVzT4pK9yD8vR9X75DaWYYmLGMsEvBfFQZzQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= +google.golang.org/grpc v1.66.2 h1:3QdXkuq3Bkh7w+ywLdLvM56cmGvQHUMZpiCzt6Rqaoo= +google.golang.org/grpc v1.66.2/go.mod 
h1:s3/l6xSSCURdVfAnL+TqCNMyTDAGN6+lZeVxnZR128Y= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 
+trpc.group/trpc-go/trpc-a2a-go v0.2.5-0.20251023030722-7f02b57fd14a h1:dOon6HF2sPRFnhCLEiAeKPc21JHL2eX7UBWjIR8PLaY= +trpc.group/trpc-go/trpc-a2a-go v0.2.5-0.20251023030722-7f02b57fd14a/go.mod h1:Gtytau9Uoc3oPo/dpHvKit+tQn9Qlk5XFG1RiZTGqfk= diff --git a/server/debug/internal/schema/schema.go b/server/debug/internal/schema/schema.go index 74f07a9d6..604d9c8b9 100644 --- a/server/debug/internal/schema/schema.go +++ b/server/debug/internal/schema/schema.go @@ -12,6 +12,12 @@ // packages. They only exist to facilitate request/response marshalling. package schema +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" +) + // ADKSession mirrors the structure expected by ADK Web UI for a session. // Field names follow the camel-case convention required by the UI. type ADKSession struct { @@ -87,3 +93,73 @@ type AgentRunRequest struct { type TraceLLMRequest struct { Contents []Content `json:"contents"` } + +// ListEvalSetsResponse wraps eval set IDs for object-style endpoints. +type ListEvalSetsResponse struct { + EvalSetIds []string `json:"evalSetIds"` +} + +// CreateEvalSetRequest is the request to create an eval set by JSON body. +type CreateEvalSetRequest struct { + EvalSet evalset.EvalSet `json:"evalSet"` +} + +// AddSessionToEvalSetRequest is the request to add a session as an eval case. +type AddSessionToEvalSetRequest struct { + EvalId string `json:"evalId"` + SessionId string `json:"sessionId"` + UserId string `json:"userId"` +} + +// RunEvalRequest is the request to run evaluation for an eval set. +type RunEvalRequest struct { + EvalCaseIds []string `json:"evalCaseIds"` + EvalMetrics []metric.EvalMetric `json:"evalMetrics"` +} + +// RunEvalResult is a per-case result for run_eval endpoints. 
+type RunEvalResult struct { + EvalSetFile string `json:"evalSetFile,omitempty"` + EvalSetId string `json:"evalSetId"` + EvalId string `json:"evalId"` + FinalEvalStatus int `json:"finalEvalStatus"` + OverallEvalMetricResults []*evalresult.EvalMetricResult `json:"overallEvalMetricResults"` + EvalMetricResultPerInvocation []*evalresult.EvalMetricResultPerInvocation `json:"evalMetricResultPerInvocation"` + UserId string `json:"userId"` + SessionId string `json:"sessionId"` +} + +// RunEvalResponse wraps run eval results for object-style endpoints. +type RunEvalResponse struct { + RunEvalResults []*RunEvalResult `json:"runEvalResults"` +} + +// ListEvalResultsResponse wraps eval result IDs for object-style endpoints. +type ListEvalResultsResponse struct { + EvalResultIds []string `json:"evalResultIds"` +} + +// MetricInterval describes a numeric interval allowed for metric values. +type MetricInterval struct { + MinValue float64 `json:"minValue"` + OpenAtMin bool `json:"openAtMin"` + MaxValue float64 `json:"maxValue"` + OpenAtMax bool `json:"openAtMax"` +} + +// MetricValueInfo provides metadata about the metric value type. +type MetricValueInfo struct { + Interval *MetricInterval `json:"interval,omitempty"` +} + +// MetricInfo describes a registered metric. +type MetricInfo struct { + MetricName string `json:"metricName"` + Description string `json:"description,omitempty"` + MetricValueInfo *MetricValueInfo `json:"metricValueInfo,omitempty"` +} + +// ListMetricsInfoResponse wraps metric metadata for object-style responses. 
+type ListMetricsInfoResponse struct { + MetricsInfo []*MetricInfo `json:"metricsInfo"` +} diff --git a/server/debug/server.go b/server/debug/server.go index c1a0fdef0..31b6f2c72 100644 --- a/server/debug/server.go +++ b/server/debug/server.go @@ -13,18 +13,32 @@ package debug import ( "context" "encoding/json" + "errors" "fmt" "net/http" + "os" + "sort" "strings" "sync" + "time" "github.com/gorilla/mux" "github.com/rs/cors" "go.opentelemetry.io/otel/attribute" sdktrace "go.opentelemetry.io/otel/sdk/trace" "go.opentelemetry.io/otel/trace/noop" - + "google.golang.org/genai" "trpc.group/trpc-go/trpc-agent-go/agent" + "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" + evalresultinmemory "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult/inmemory" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + evalsetinmemory "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset/inmemory" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/registry" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + metricinmemory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/inmemory" + "trpc.group/trpc-go/trpc-agent-go/evaluation/service" + evalservice "trpc.group/trpc-go/trpc-agent-go/evaluation/service/local" "trpc.group/trpc-go/trpc-agent-go/event" "trpc.group/trpc-go/trpc-agent-go/graph" itelemetry "trpc.group/trpc-go/trpc-agent-go/internal/telemetry" @@ -49,6 +63,11 @@ type Server struct { sessionSvc session.Service runnerOpts []runner.Option // Extra options applied when creating a runner. + evalSetManager evalset.Manager // evalSetManager is the manager for evaluation sets. + evalResultManager evalresult.Manager // evalResultManager is the manager for evaluation results. + metricManager metric.Manager // metricManager persists configured eval metrics per eval set. + metricRegistry registry.Registry // metricRegistry exposes the available evaluation metrics. 
+ traces map[string]attribute.Set // key: event_id memoryExporter *inMemoryExporter } @@ -68,16 +87,61 @@ func WithRunnerOptions(opts ...runner.Option) Option { return func(s *Server) { s.runnerOpts = append(s.runnerOpts, opts...) } } +// WithEvalSetManager overrides the default eval set manager. +func WithEvalSetManager(m evalset.Manager) Option { + return func(s *Server) { + if m != nil { + s.evalSetManager = m + } + } +} + +// WithEvalResultManager overrides the default eval result manager. +func WithEvalResultManager(m evalresult.Manager) Option { + return func(s *Server) { + if m != nil { + s.evalResultManager = m + } + } +} + +// WithMetricManager overrides the default eval metric manager used for persistence. +func WithMetricManager(m metric.Manager) Option { + return func(s *Server) { + if m != nil { + s.metricManager = m + } + } +} + +// WithMetricRegistry overrides the default evaluator registry used to describe metrics. +func WithMetricRegistry(reg registry.Registry) Option { + return func(s *Server) { + if reg != nil { + s.metricRegistry = reg + } + } +} + +// WithEvaluatorRegistry is kept for backward compatibility. Use WithMetricRegistry instead. +func WithEvaluatorRegistry(reg registry.Registry) Option { + return WithMetricRegistry(reg) +} + // New creates a new CLI HTTP server with explicit agent registration. The // behaviour can be tweaked via functional options. 
func New(agents map[string]agent.Agent, opts ...Option) *Server { s := &Server{ - agents: agents, - router: mux.NewRouter(), - runners: make(map[string]runner.Runner), - traces: make(map[string]attribute.Set), - memoryExporter: newInMemoryExporter(), - sessionSvc: sessioninmemory.NewSessionService(), + agents: agents, + router: mux.NewRouter(), + runners: make(map[string]runner.Runner), + traces: make(map[string]attribute.Set), + memoryExporter: newInMemoryExporter(), + sessionSvc: sessioninmemory.NewSessionService(), + evalSetManager: evalsetinmemory.New(), + evalResultManager: evalresultinmemory.New(), + metricManager: metricinmemory.New(), + metricRegistry: registry.New(), } // Apply user-provided options. @@ -89,7 +153,7 @@ func New(agents map[string]agent.Agent, opts ...Option) *Server { c := cors.New(cors.Options{ AllowedOrigins: []string{"*"}, AllowCredentials: true, - AllowedMethods: []string{"GET", "POST", "OPTIONS"}, + AllowedMethods: []string{"GET", "POST", "PUT", "DELETE", "OPTIONS"}, AllowedHeaders: []string{"*"}, ExposedHeaders: []string{"Content-Length", "Content-Type"}, }) @@ -229,6 +293,37 @@ func (s *Server) registerRoutes() { s.router.HandleFunc("/apps/{appName}/users/{userId}/sessions/{sessionId}", s.handleGetSession).Methods(http.MethodGet) + // Evaluation APIs. + s.router.HandleFunc("/apps/{appName}/eval-sets", s.handleCreateEvalSet).Methods(http.MethodPost) + s.router.HandleFunc("/apps/{appName}/eval_sets/{evalSetId}", s.handleCreateEvalSetLegacy).Methods(http.MethodPost) + s.router.HandleFunc("/apps/{appName}/eval-sets", s.handleListEvalSets).Methods(http.MethodGet) + s.router.HandleFunc("/apps/{appName}/eval_sets", s.handleListEvalSetsLegacy).Methods(http.MethodGet) + s.router.HandleFunc("/apps/{appName}/eval-sets/{evalSetId}/add-session", s.handleAddSessionToEvalSet). + Methods(http.MethodPost) + s.router.HandleFunc("/apps/{appName}/eval_sets/{evalSetId}/add_session", s.handleAddSessionToEvalSet). 
+ Methods(http.MethodPost) + s.router.HandleFunc("/apps/{appName}/eval_sets/{evalSetId}/evals", s.handleListEvalsInSet). + Methods(http.MethodGet) + s.router.HandleFunc("/apps/{appName}/eval-sets/{evalSetId}/eval-cases/{evalCaseId}", s.handleGetEvalCase). + Methods(http.MethodGet) + s.router.HandleFunc("/apps/{appName}/eval_sets/{evalSetId}/evals/{evalCaseId}", s.handleGetEvalCase). + Methods(http.MethodGet) + s.router.HandleFunc("/apps/{appName}/eval-sets/{evalSetId}/eval-cases/{evalCaseId}", s.handleUpdateEvalCase). + Methods(http.MethodPut) + s.router.HandleFunc("/apps/{appName}/eval_sets/{evalSetId}/evals/{evalCaseId}", s.handleUpdateEvalCase). + Methods(http.MethodPut) + s.router.HandleFunc("/apps/{appName}/eval-sets/{evalSetId}/eval-cases/{evalCaseId}", s.handleDeleteEvalCase). + Methods(http.MethodDelete) + s.router.HandleFunc("/apps/{appName}/eval_sets/{evalSetId}/evals/{evalCaseId}", s.handleDeleteEvalCase). + Methods(http.MethodDelete) + s.router.HandleFunc("/apps/{appName}/eval_sets/{evalSetId}/run_eval", s.handleRunEvalLegacy).Methods(http.MethodPost) + s.router.HandleFunc("/apps/{appName}/eval-sets/{evalSetId}/run", s.handleRunEval).Methods(http.MethodPost) + s.router.HandleFunc("/apps/{appName}/eval_results/{evalResultId}", s.handleGetEvalResultLegacy).Methods(http.MethodGet) + s.router.HandleFunc("/apps/{appName}/eval-results/{evalResultId}", s.handleGetEvalResult).Methods(http.MethodGet) + s.router.HandleFunc("/apps/{appName}/eval_results", s.handleListEvalResultsLegacy).Methods(http.MethodGet) + s.router.HandleFunc("/apps/{appName}/eval-results", s.handleListEvalResults).Methods(http.MethodGet) + s.router.HandleFunc("/apps/{appName}/metrics-info", s.handleListMetricsInfo).Methods(http.MethodGet) + // Debug APIs s.router.HandleFunc("/debug/trace/{event_id}", s.handleEventTrace).Methods(http.MethodGet) @@ -245,6 +340,11 @@ func (s *Server) registerRoutes() { } s.router.HandleFunc("/run", preflight).Methods(http.MethodOptions) 
s.router.HandleFunc("/run_sse", preflight).Methods(http.MethodOptions) + s.router.HandleFunc("/apps/{appName}/eval-sets", preflight).Methods(http.MethodOptions) + s.router.HandleFunc("/apps/{appName}/eval_sets/{evalSetId}", preflight).Methods(http.MethodOptions) + s.router.HandleFunc("/apps/{appName}/eval-sets/{evalSetId}/add-session", preflight).Methods(http.MethodOptions) + s.router.HandleFunc("/apps/{appName}/eval_sets/{evalSetId}/add_session", preflight).Methods(http.MethodOptions) + s.router.HandleFunc("/apps/{appName}/eval_sets/{evalSetId}/run_eval", preflight).Methods(http.MethodOptions) } // ---- Handlers ----------------------------------------------------------- @@ -557,6 +657,513 @@ func (s *Server) handleRunSSE(w http.ResponseWriter, r *http.Request) { log.Infof("handleRunSSE finished for session %s", req.SessionID) } +// handleCreateEvalSet creates an eval set. +func (s *Server) handleCreateEvalSet(w http.ResponseWriter, r *http.Request) { + log.Infof("handleCreateEvalSet called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + var req schema.CreateEvalSetRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + defer r.Body.Close() + evalset, err := s.evalSetManager.Create(r.Context(), appName, req.EvalSet.EvalSetID) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + s.writeJSON(w, evalset) +} + +// handleCreateEvalSetLegacy creates an eval set. +func (s *Server) handleCreateEvalSetLegacy(w http.ResponseWriter, r *http.Request) { + log.Infof("handleCreateEvalSetLegacy called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + evalSetID := vars["evalSetId"] + evalset, err := s.evalSetManager.Create(r.Context(), appName, evalSetID) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + s.writeJSON(w, evalset) +} + +// handleListEvalSetsLegacy lists all eval sets. 
+func (s *Server) handleListEvalSetsLegacy(w http.ResponseWriter, r *http.Request) { + log.Infof("handleListEvalSets called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + ids, err := s.evalSetManager.List(r.Context(), appName) + if err != nil { + ids = []string{} + } + s.writeJSON(w, ids) +} + +// handleListEvalSets lists all eval sets. +func (s *Server) handleListEvalSets(w http.ResponseWriter, r *http.Request) { + log.Infof("handleListEvalSets called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + ids, err := s.evalSetManager.List(r.Context(), appName) + if err != nil { + ids = []string{} + } + s.writeJSON(w, &schema.ListEvalSetsResponse{EvalSetIds: ids}) +} + +// handleAddSessionToEvalSet adds a session to an eval set. +func (s *Server) handleAddSessionToEvalSet(w http.ResponseWriter, r *http.Request) { + log.Infof("handleAddSessionToEvalSet called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + evalSetID := vars["evalSetId"] + + var req schema.AddSessionToEvalSetRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + defer r.Body.Close() + + // Fetch session. + sess, err := s.sessionSvc.GetSession(r.Context(), session.Key{AppName: appName, UserID: req.UserId, SessionID: req.SessionId}) + if err != nil || sess == nil { + http.Error(w, "Session not found.", http.StatusBadRequest) + return + } + // Convert to eval invocations. 
+ invocations := s.convertSessionToEvalInvocations(sess) + initialState := map[string]any{} + newCase := &evalset.EvalCase{ + EvalID: req.EvalId, + Conversation: invocations, + SessionInput: &evalset.SessionInput{AppName: appName, UserID: req.UserId, State: initialState}, + CreationTimestamp: &epochtime.EpochTime{Time: time.Now()}, + } + if err := s.evalSetManager.AddCase(r.Context(), appName, evalSetID, newCase); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + w.WriteHeader(http.StatusOK) +} + +// handleListEvalsInSet lists all eval cases in an eval set. +func (s *Server) handleListEvalsInSet(w http.ResponseWriter, r *http.Request) { + log.Infof("handleListEvalsInSet called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + evalSetID := vars["evalSetId"] + + evalSet, err := s.evalSetManager.Get(r.Context(), appName, evalSetID) + if err != nil || evalSet == nil { + http.Error(w, fmt.Sprintf("Eval set `%s` not found.", evalSetID), http.StatusBadRequest) + return + } + ids := make([]string, 0, len(evalSet.EvalCases)) + for _, c := range evalSet.EvalCases { + ids = append(ids, c.EvalID) + } + sort.Strings(ids) + s.writeJSON(w, ids) +} + +// handleGetEvalCase gets a single eval case. +func (s *Server) handleGetEvalCase(w http.ResponseWriter, r *http.Request) { + log.Infof("handleGetEvalCase called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + evalSetID := vars["evalSetId"] + evalCaseID := vars["evalCaseId"] + + evalCase, err := s.evalSetManager.GetCase(r.Context(), appName, evalSetID, evalCaseID) + if err != nil || evalCase == nil { + http.Error(w, fmt.Sprintf("Eval set `%s` or Eval `%s` not found.", evalSetID, evalCaseID), http.StatusNotFound) + return + } + s.writeJSON(w, evalCase) +} + +// handleUpdateEvalCase updates a stored eval case. 
+func (s *Server) handleUpdateEvalCase(w http.ResponseWriter, r *http.Request) { + log.Infof("handleUpdateEvalCase called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + evalSetID := vars["evalSetId"] + evalCaseID := vars["evalCaseId"] + + var evalCase evalset.EvalCase + if err := json.NewDecoder(r.Body).Decode(&evalCase); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + defer r.Body.Close() + + if evalCase.EvalID != "" && evalCase.EvalID != evalCaseID { + http.Error(w, "Eval id in payload must match path parameter.", http.StatusBadRequest) + return + } + evalCase.EvalID = evalCaseID + if err := s.evalSetManager.UpdateCase(r.Context(), appName, evalSetID, &evalCase); err != nil { + http.Error(w, err.Error(), http.StatusNotFound) + return + } + w.WriteHeader(http.StatusOK) +} + +// handleDeleteEvalCase deletes an eval case. +func (s *Server) handleDeleteEvalCase(w http.ResponseWriter, r *http.Request) { + log.Infof("handleDeleteEvalCase called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + evalSetID := vars["evalSetId"] + evalCaseID := vars["evalCaseId"] + if err := s.evalSetManager.DeleteCase(r.Context(), appName, evalSetID, evalCaseID); err != nil { + http.Error(w, err.Error(), http.StatusNotFound) + return + } + w.WriteHeader(http.StatusOK) +} + +// handleRunEvalLegacy runs an eval given the details in the eval request. 
+func (s *Server) handleRunEvalLegacy(w http.ResponseWriter, r *http.Request) { + log.Infof("handleRunEvalLegacy called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + evalSetID := vars["evalSetId"] + + var req schema.RunEvalRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + defer r.Body.Close() + + caseIDs := req.EvalCaseIds + metricConfigs := req.EvalMetrics + + runner, err := s.getRunner(appName) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + serviceOpts := []service.Option{ + service.WithEvalSetManager(s.evalSetManager), + service.WithEvalResultManager(s.evalResultManager), + service.WithRegistry(s.metricRegistry), + } + evalService, err := evalservice.New(runner, serviceOpts...) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + inferenceRequest := &service.InferenceRequest{AppName: appName, EvalSetID: evalSetID, EvalCaseIDs: caseIDs} + inferenceResults, err := evalService.Inference(r.Context(), inferenceRequest) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + metrics, err := s.resolveEvalMetrics(r.Context(), appName, evalSetID, metricConfigs) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + evaluateRequest := &service.EvaluateRequest{AppName: appName, EvalSetID: evalSetID, InferenceResults: inferenceResults, EvaluateConfig: &service.EvaluateConfig{EvalMetrics: metrics}} + evalSetResult, err := evalService.Evaluate(r.Context(), evaluateRequest) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + runEvalResults := make([]*schema.RunEvalResult, 0, len(evalSetResult.EvalCaseResults)) + for _, result := range evalSetResult.EvalCaseResults { + runEvalResults = append(runEvalResults, &schema.RunEvalResult{ + EvalSetFile: evalSetID, + EvalSetId: evalSetID, + EvalId: 
result.EvalID, + FinalEvalStatus: int(result.FinalEvalStatus), + OverallEvalMetricResults: result.OverallEvalMetricResults, + EvalMetricResultPerInvocation: result.EvalMetricResultPerInvocation, + UserId: result.UserID, + SessionId: result.SessionID, + }) + } + s.writeJSON(w, runEvalResults) +} + +// handleRunEval runs an eval given the details in the eval request. +func (s *Server) handleRunEval(w http.ResponseWriter, r *http.Request) { + log.Infof("handleRunEval called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + evalSetID := vars["evalSetId"] + + var req schema.RunEvalRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + defer r.Body.Close() + + caseIDs := req.EvalCaseIds + metricConfigs := req.EvalMetrics + + runner, err := s.getRunner(appName) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + serviceOpts := []service.Option{ + service.WithEvalSetManager(s.evalSetManager), + service.WithEvalResultManager(s.evalResultManager), + service.WithRegistry(s.metricRegistry), + } + evalService, err := evalservice.New(runner, serviceOpts...) 
+ if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + inferenceRequest := &service.InferenceRequest{AppName: appName, EvalSetID: evalSetID, EvalCaseIDs: caseIDs} + inferenceResults, err := evalService.Inference(r.Context(), inferenceRequest) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + metrics, err := s.resolveEvalMetrics(r.Context(), appName, evalSetID, metricConfigs) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + evaluateRequest := &service.EvaluateRequest{AppName: appName, EvalSetID: evalSetID, InferenceResults: inferenceResults, EvaluateConfig: &service.EvaluateConfig{EvalMetrics: metrics}} + evalSetResult, err := evalService.Evaluate(r.Context(), evaluateRequest) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + runEvalResults := make([]*schema.RunEvalResult, 0, len(evalSetResult.EvalCaseResults)) + for _, result := range evalSetResult.EvalCaseResults { + runEvalResults = append(runEvalResults, &schema.RunEvalResult{ + EvalSetFile: evalSetID, + EvalSetId: evalSetID, + EvalId: result.EvalID, + FinalEvalStatus: int(result.FinalEvalStatus), + OverallEvalMetricResults: result.OverallEvalMetricResults, + EvalMetricResultPerInvocation: result.EvalMetricResultPerInvocation, + UserId: result.UserID, + SessionId: result.SessionID, + }) + } + s.writeJSON(w, &schema.RunEvalResponse{RunEvalResults: runEvalResults}) +} + +// handleGetEvalResultLegacy gets a full eval set result. 
+func (s *Server) handleGetEvalResultLegacy(w http.ResponseWriter, r *http.Request) { + log.Infof("handleGetEvalResultLegacy called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + evalResultID := vars["evalResultId"] + + evalResult, err := s.evalResultManager.Get(r.Context(), appName, evalResultID) + if err != nil || evalResult == nil { + http.Error(w, "Not Found", http.StatusNotFound) + return + } + s.writeJSON(w, evalResult) +} + +// handleGetEvalResult gets a full eval set result. +func (s *Server) handleGetEvalResult(w http.ResponseWriter, r *http.Request) { + log.Infof("handleGetEvalResult called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + evalResultID := vars["evalResultId"] + + evalResult, err := s.evalResultManager.Get(r.Context(), appName, evalResultID) + if err != nil || evalResult == nil { + http.Error(w, "Not Found", http.StatusNotFound) + return + } + s.writeJSON(w, evalResult) +} + +// handleListEvalResultsLegacy lists all eval result IDs for an app. +func (s *Server) handleListEvalResultsLegacy(w http.ResponseWriter, r *http.Request) { + log.Infof("handleListEvalResults called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + ids, err := s.evalResultManager.List(r.Context(), appName) + if err != nil { + ids = []string{} + } + s.writeJSON(w, ids) +} + +// handleListEvalResults lists all eval results for an app. +func (s *Server) handleListEvalResults(w http.ResponseWriter, r *http.Request) { + log.Infof("handleListEvalResults called: path=%s", r.URL.Path) + vars := mux.Vars(r) + appName := vars["appName"] + ids, err := s.evalResultManager.List(r.Context(), appName) + if err != nil { + ids = []string{} + } + s.writeJSON(w, &schema.ListEvalResultsResponse{EvalResultIds: ids}) +} + +// handleListMetricsInfo lists metadata for the registered evaluation metrics. 
+func (s *Server) handleListMetricsInfo(w http.ResponseWriter, r *http.Request) { + log.Infof("handleListMetricsInfo called: path=%s", r.URL.Path) + response := &schema.ListMetricsInfoResponse{MetricsInfo: s.buildMetricInfos()} + s.writeJSON(w, response) +} + +// buildMetricInfos collects metadata for the current evaluator registry. +func (s *Server) buildMetricInfos() []*schema.MetricInfo { + if s.metricRegistry == nil { + return []*schema.MetricInfo{} + } + names := s.metricRegistry.List() + infos := make([]*schema.MetricInfo, 0, len(names)) + for _, name := range names { + evaluator, err := s.metricRegistry.Get(name) + if err != nil { + log.Errorf("get evaluator %s: %v", name, err) + continue + } + info := &schema.MetricInfo{ + MetricName: evaluator.Name(), + Description: evaluator.Description(), + MetricValueInfo: &schema.MetricValueInfo{ + Interval: &schema.MetricInterval{ + MinValue: 0, + OpenAtMin: false, + MaxValue: 1, + OpenAtMax: false, + }, + }, + } + infos = append(infos, info) + } + return infos +} + +// resolveEvalMetrics returns the metric configuration for a run, optionally +// persisting values via the configured metric manager. 
+func (s *Server) resolveEvalMetrics(ctx context.Context, appName, evalSetID string, + configs []metric.EvalMetric) ([]*metric.EvalMetric, error) { + if len(configs) > 0 { + metrics := make([]*metric.EvalMetric, 0, len(configs)) + for _, cfg := range configs { + metricCopy := cfg + metrics = append(metrics, &metricCopy) + if s.metricManager == nil { + continue + } + if err := s.metricManager.Update(ctx, appName, evalSetID, &metricCopy); err != nil { + if errors.Is(err, os.ErrNotExist) { + if err := s.metricManager.Add(ctx, appName, evalSetID, &metricCopy); err != nil { + return nil, fmt.Errorf("store metric %s.%s.%s: %w", appName, evalSetID, + metricCopy.MetricName, err) + } + continue + } + return nil, fmt.Errorf("store metric %s.%s.%s: %w", appName, evalSetID, + metricCopy.MetricName, err) + } + } + return metrics, nil + } + if s.metricManager == nil { + return nil, errors.New("eval metrics not provided") + } + names, err := s.metricManager.List(ctx, appName, evalSetID) + if err != nil { + return nil, fmt.Errorf("list metrics for %s.%s: %w", appName, evalSetID, err) + } + if len(names) == 0 { + return nil, errors.New("no eval metrics configured for this eval set") + } + metrics := make([]*metric.EvalMetric, 0, len(names)) + for _, name := range names { + m, err := s.metricManager.Get(ctx, appName, evalSetID, name) + if err != nil { + return nil, fmt.Errorf("get metric %s for %s.%s: %w", name, appName, evalSetID, err) + } + metrics = append(metrics, m) + } + return metrics, nil +} + +// convertSessionToEvalInvocations builds eval invocations from a session's events. 
+func (s *Server) convertSessionToEvalInvocations(sess *session.Session) []*evalset.Invocation { + var invocations []*evalset.Invocation + if sess == nil { + return invocations + } + events := sess.GetEvents() + if len(events) == 0 { + return invocations + } + var cur *evalset.Invocation + for _, e := range events { + if e.Response == nil || len(e.Response.Choices) == 0 { + continue + } + // Start a new invocation on user message. + msg := e.Response.Choices[0].Message + if msg.Role == model.RoleUser { + // Flush previous. + if cur != nil { + invocations = append(invocations, cur) + } + cur = &evalset.Invocation{ + InvocationID: e.InvocationID, + UserContent: &genai.Content{ + Role: string(model.RoleUser), + Parts: []*genai.Part{{Text: msg.Content}}, + }, + CreationTimestamp: &epochtime.EpochTime{Time: e.Timestamp}, + IntermediateData: &evalset.IntermediateData{}, + } + continue + } + // If this is a final response, set finalResponse. + if e.IsFinalResponse() && cur != nil { + if msg.Content != "" { + cur.FinalResponse = &genai.Content{Role: string(msg.Role), Parts: []*genai.Part{{Text: msg.Content}}} + } + continue + } + // Capture tool calls as tool uses. + if e.IsToolCallResponse() && cur != nil { + for _, tc := range msg.ToolCalls { + if use := convertToolCallToFunctionCall(&tc); use != nil { + cur.IntermediateData.ToolUses = append(cur.IntermediateData.ToolUses, use) + } + } + } + } + if cur != nil { + invocations = append(invocations, cur) + } + return invocations +} + +// convertToolCallToFunctionCall converts model.ToolCall to genai.FunctionCall. 
+func convertToolCallToFunctionCall(tc *model.ToolCall) *genai.FunctionCall { + if tc == nil || tc.Function.Name == "" { + return nil + } + var args map[string]any + if len(tc.Function.Arguments) > 0 { + if err := json.Unmarshal(tc.Function.Arguments, &args); err != nil { + args = map[string]any{"raw": string(tc.Function.Arguments)} + } + } + return &genai.FunctionCall{ID: tc.ID, Name: tc.Function.Name, Args: args} +} + // convertSessionToADKFormat converts an internal session object to the // flattened structure the ADK Web UI expects. func convertSessionToADKFormat(s *session.Session) schema.ADKSession { diff --git a/server/debug/server_test.go b/server/debug/server_test.go index 06a004214..577968dca 100644 --- a/server/debug/server_test.go +++ b/server/debug/server_test.go @@ -14,6 +14,7 @@ import ( "context" "encoding/json" "errors" + "io" "net/http" "net/http/httptest" "strings" @@ -22,12 +23,18 @@ import ( "github.com/gorilla/mux" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.opentelemetry.io/otel/attribute" sdktrace "go.opentelemetry.io/otel/sdk/trace" "go.opentelemetry.io/otel/sdk/trace/tracetest" "go.opentelemetry.io/otel/trace" "trpc.group/trpc-go/trpc-agent-go/agent" "trpc.group/trpc-go/trpc-agent-go/agent/llmagent" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/status" "trpc.group/trpc-go/trpc-agent-go/event" "trpc.group/trpc-go/trpc-agent-go/graph" "trpc.group/trpc-go/trpc-agent-go/model" @@ -354,6 +361,348 @@ func TestConvertSessionToADKFormat(t *testing.T) { assert.Equal(t, 1, len(adkSession.State), "expected 1 state entry, got %d", len(adkSession.State)) } +func TestServer_convertSessionToEvalInvocations(t *testing.T) { + sess := &session.Session{} + sess.Events = append(sess.Events, + 
// TestServer_handleAddSessionToEvalSet calls handleAddSessionToEvalSet
// directly (bypassing the router) and verifies that the recorded session is
// stored as an eval case with the expected user content, final response and
// tool use.
func TestServer_handleAddSessionToEvalSet(t *testing.T) {
	appName := "assistant"
	srv := New(map[string]agent.Agent{
		appName: &mockAgent{name: appName},
	})

	// The eval set must exist before a case can be added to it.
	ctx := context.Background()
	_, err := srv.evalSetManager.Create(ctx, appName, "eval-1")
	require.NoError(t, err)

	// Record a user message, a tool call and a final answer in a session.
	sess := recordEvalSession(t, srv, appName, "user-1", "session-1")

	body := schema.AddSessionToEvalSetRequest{
		EvalId:    "case-1",
		SessionId: sess.ID,
		UserId:    sess.UserID,
	}
	payload, err := json.Marshal(body)
	require.NoError(t, err)

	req := httptest.NewRequest(http.MethodPost, "/apps/assistant/eval-sets/eval-1/add-session", bytes.NewReader(payload))
	req.Header.Set("Content-Type", "application/json")
	// The handler is invoked without the router, so the path variables are
	// injected by hand.
	req = mux.SetURLVars(req, map[string]string{
		"appName":   appName,
		"evalSetId": "eval-1",
	})
	w := httptest.NewRecorder()

	srv.handleAddSessionToEvalSet(w, req)
	assert.Equal(t, http.StatusOK, w.Code)

	// The stored case must mirror the recorded session.
	evalCase, err := srv.evalSetManager.GetCase(ctx, appName, "eval-1", "case-1")
	require.NoError(t, err)
	require.NotNil(t, evalCase)
	require.Len(t, evalCase.Conversation, 1)
	assert.Equal(t, "calc add 1 2", evalCase.Conversation[0].UserContent.Parts[0].Text)
	assert.Equal(t, "calc result: 3", evalCase.Conversation[0].FinalResponse.Parts[0].Text)
	require.NotNil(t, evalCase.Conversation[0].IntermediateData)
	require.Len(t, evalCase.Conversation[0].IntermediateData.ToolUses, 1)
	assert.Equal(t, "tool-call-1", evalCase.Conversation[0].IntermediateData.ToolUses[0].ID)
	assert.Equal(t, "user-1", evalCase.SessionInput.UserID)
}
"/apps/assistant/eval-sets/eval-http/add-session", addBody) + assert.Equal(t, http.StatusOK, resp.Code) + + resp = performJSONRequest(t, srv.Handler(), http.MethodGet, "/apps/assistant/eval_sets/eval-http/evals", nil) + var caseIDs []string + decodeBody(t, resp.Body, &caseIDs) + assert.Equal(t, []string{"case-http"}, caseIDs) + + resp = performJSONRequest(t, srv.Handler(), http.MethodGet, "/apps/assistant/eval-sets/eval-http/eval-cases/case-http", nil) + var fetchedCase evalset.EvalCase + decodeBody(t, resp.Body, &fetchedCase) + assert.Equal(t, "case-http", fetchedCase.EvalID) + + resp = performJSONRequest(t, srv.Handler(), http.MethodGet, "/apps/assistant/eval_sets/eval-http/evals/case-http", nil) + var legacyFetched evalset.EvalCase + decodeBody(t, resp.Body, &legacyFetched) + assert.Equal(t, "case-http", legacyFetched.EvalID) + + fetchedCase.SessionInput.State = map[string]any{"channel": "test"} + updateBody, err := json.Marshal(fetchedCase) + require.NoError(t, err) + resp = performRequest(t, srv.Handler(), http.MethodPut, "/apps/assistant/eval-sets/eval-http/eval-cases/case-http", bytes.NewReader(updateBody)) + assert.Equal(t, http.StatusOK, resp.Code) + + resp = performJSONRequest(t, srv.Handler(), http.MethodGet, "/apps/assistant/metrics-info", nil) + var metricsInfo schema.ListMetricsInfoResponse + decodeBody(t, resp.Body, &metricsInfo) + assert.NotEmpty(t, metricsInfo.MetricsInfo) + + runReq := schema.RunEvalRequest{ + EvalCaseIds: []string{"case-http"}, + EvalMetrics: []metric.EvalMetric{{MetricName: "fake_metric", Threshold: 0.5}}, + } + resp = performJSONRequest(t, srv.Handler(), http.MethodPost, "/apps/assistant/eval-sets/eval-http/run", runReq) + assert.Equal(t, http.StatusOK, resp.Code) + var runResp schema.RunEvalResponse + decodeBody(t, resp.Body, &runResp) + require.Len(t, runResp.RunEvalResults, 1) + assert.Equal(t, "case-http", runResp.RunEvalResults[0].EvalId) + + resp = performJSONRequest(t, srv.Handler(), http.MethodPost, 
"/apps/assistant/eval_sets/eval-http/run_eval", runReq) + assert.Equal(t, http.StatusOK, resp.Code) + var legacyRun []*schema.RunEvalResult + decodeBody(t, resp.Body, &legacyRun) + require.Len(t, legacyRun, 1) + + resp = performJSONRequest(t, srv.Handler(), http.MethodGet, "/apps/assistant/eval-results", nil) + var evalResultIDs struct { + EvalResultIds []string `json:"evalResultIds"` + } + decodeBody(t, resp.Body, &evalResultIDs) + require.NotEmpty(t, evalResultIDs.EvalResultIds) + + resp = performJSONRequest(t, srv.Handler(), http.MethodGet, "/apps/assistant/eval_results", nil) + var legacyResultIDs []string + decodeBody(t, resp.Body, &legacyResultIDs) + require.NotEmpty(t, legacyResultIDs) + + targetID := evalResultIDs.EvalResultIds[0] + path := "/apps/assistant/eval-results/" + targetID + resp = performJSONRequest(t, srv.Handler(), http.MethodGet, path, nil) + var evalResult evalresult.EvalSetResult + decodeBody(t, resp.Body, &evalResult) + assert.Equal(t, targetID, evalResult.EvalSetResultID) + + resp = performJSONRequest(t, srv.Handler(), http.MethodGet, "/apps/assistant/eval_results/"+targetID, nil) + var legacyEvalResult evalresult.EvalSetResult + decodeBody(t, resp.Body, &legacyEvalResult) + assert.Equal(t, targetID, legacyEvalResult.EvalSetResultID) + + resp = performRequest(t, srv.Handler(), http.MethodDelete, "/apps/assistant/eval-sets/eval-http/eval-cases/case-http", nil) + assert.Equal(t, http.StatusOK, resp.Code) + + resp = performJSONRequest(t, srv.Handler(), http.MethodGet, "/apps/assistant/eval_sets/eval-http/evals", nil) + decodeBody(t, resp.Body, &caseIDs) + assert.Empty(t, caseIDs) +} + +func recordEvalSession(t *testing.T, srv *Server, appName, userID, sessionID string) *session.Session { + ctx := context.Background() + sess, err := srv.sessionSvc.CreateSession(ctx, session.Key{ + AppName: appName, + UserID: userID, + SessionID: sessionID, + }, session.StateMap{}) + require.NoError(t, err) + + require.NoError(t, srv.sessionSvc.AppendEvent(ctx, 
sess, newUserMessageEvent("invocation-1", "calc add 1 2"))) + require.NoError(t, srv.sessionSvc.AppendEvent(ctx, sess, newToolCallEvent("invocation-1"))) + require.NoError(t, srv.sessionSvc.AppendEvent(ctx, sess, newAssistantFinalEvent("invocation-1", "calc result: 3"))) + return sess +} + +func newUserMessageEvent(invocationID, content string) *event.Event { + rsp := &model.Response{ + Choices: []model.Choice{{ + Message: model.Message{ + Role: model.RoleUser, + Content: content, + }, + }}, + Done: true, + } + return event.NewResponseEvent(invocationID, string(model.RoleUser), rsp) +} + +func newToolCallEvent(invocationID string) *event.Event { + args := json.RawMessage(`{"operation":"add","a":1,"b":2}`) + rsp := &model.Response{ + Choices: []model.Choice{{ + Message: model.Message{ + Role: model.RoleAssistant, + ToolCalls: []model.ToolCall{ + { + ID: "tool-call-1", + Function: model.FunctionDefinitionParam{ + Name: "calculator", + Arguments: args, + }, + }, + }, + }, + }}, + } + return event.NewResponseEvent(invocationID, string(model.RoleAssistant), rsp) +} + +func newAssistantFinalEvent(invocationID, content string) *event.Event { + rsp := &model.Response{ + Choices: []model.Choice{{ + Message: model.Message{ + Role: model.RoleAssistant, + Content: content, + }, + }}, + Done: true, + } + return event.NewResponseEvent(invocationID, string(model.RoleAssistant), rsp) +} + +type fakeEvalRunner struct { + events []*event.Event +} + +func (f *fakeEvalRunner) Run(ctx context.Context, userID, sessionID string, message model.Message, runOpts ...agent.RunOption) (<-chan *event.Event, error) { + ch := make(chan *event.Event, len(f.events)) + for _, evt := range f.events { + e := *evt + ch <- &e + } + close(ch) + return ch, nil +} + +func (f *fakeEvalRunner) Close() error { + return nil +} + +type fakeEvaluatorImpl struct { + name string +} + +func (f *fakeEvaluatorImpl) Name() string { return f.name } +func (f *fakeEvaluatorImpl) Description() string { return "fake 
evaluator" } +func (f *fakeEvaluatorImpl) Evaluate(ctx context.Context, actuals, expecteds []*evalset.Invocation, evalMetric *metric.EvalMetric) (*evaluator.EvaluateResult, error) { + result := &evaluator.EvaluateResult{ + OverallScore: 1, + OverallStatus: status.EvalStatusPassed, + PerInvocationResults: make([]evaluator.PerInvocationResult, len(actuals)), + } + for i := range actuals { + result.PerInvocationResults[i] = evaluator.PerInvocationResult{ + ActualInvocation: actuals[i], + ExpectedInvocation: expecteds[i], + Score: 1, + Status: status.EvalStatusPassed, + } + } + return result, nil +} + +func performJSONRequest(t *testing.T, handler http.Handler, method, path string, payload any) *httptest.ResponseRecorder { + var body io.Reader + if payload != nil { + data, err := json.Marshal(payload) + require.NoError(t, err) + body = bytes.NewReader(data) + } + return performRequest(t, handler, method, path, body) +} + +func performRequest(t *testing.T, handler http.Handler, method, path string, body io.Reader) *httptest.ResponseRecorder { + req := httptest.NewRequest(method, path, body) + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + resp := httptest.NewRecorder() + handler.ServeHTTP(resp, req) + return resp +} + +func decodeBody(t *testing.T, r io.Reader, v any) { + data, err := io.ReadAll(r) + require.NoError(t, err) + require.NoError(t, json.Unmarshal(data, v)) +} + +func newRunnerToolCallEvent(invocationID string) *event.Event { + args := json.RawMessage(`{"operation":"add","a":1,"b":2}`) + resp := &model.Response{ + Choices: []model.Choice{{ + Message: model.Message{ + Role: model.RoleAssistant, + ToolCalls: []model.ToolCall{{ + ID: "runner-tool", + Function: model.FunctionDefinitionParam{Name: "calculator", Arguments: args}, + }}, + }, + }}, + } + return &event.Event{Response: resp, InvocationID: invocationID} +} + +func newRunnerFinalEvent(invocationID, content string) *event.Event { + resp := &model.Response{ + Done: true, + 
Choices: []model.Choice{{ + Message: model.Message{ + Role: model.RoleAssistant, + Content: content, + }, + }}, + } + return &event.Event{Response: resp, InvocationID: invocationID} +} + // mockSessionService is a simple mock session service for testing. type mockSessionService struct { sessions map[string]*session.Session From 4029a2c07df170cbc2941e175555e524510fd458 Mon Sep 17 00:00:00 2001 From: hackerli Date: Thu, 20 Nov 2025 12:56:39 +0800 Subject: [PATCH 02/14] docs --- docs/mkdocs/en/evaluation.md | 33 +++ docs/mkdocs/zh/evaluation.md | 43 ++- server/debug/openapi.json | 494 ++++++++++++++++++++++++++++++++++- 3 files changed, 559 insertions(+), 11 deletions(-) diff --git a/docs/mkdocs/en/evaluation.md b/docs/mkdocs/en/evaluation.md index c1a180241..f7f1eb5e4 100644 --- a/docs/mkdocs/en/evaluation.md +++ b/docs/mkdocs/en/evaluation.md @@ -726,6 +726,39 @@ Because the Agent's execution process may be uncertain, `evaluation.WithNumRuns` ## Usage Guide +### Debug Server Integration + +Debug Server bundles evaluation management and run endpoints so you can drive visual evaluation flows from ADK Web/AG UI. 
+ +```go +import ( + "net/http" + + "trpc.group/trpc-go/trpc-agent-go/agent" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" + evalresultlocal "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult/local" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + evalsetlocal "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset/local" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + metriclocal "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/local" + debugserver "trpc.group/trpc-go/trpc-agent-go/server/debug" +) + +agents := map[string]agent.Agent{ + "math-app": myAgent, +} +srv := debugserver.New( + agents, + debugserver.WithEvalSetManager(evalsetlocal.New(evalset.WithBaseDir("./evaldata"))), + debugserver.WithEvalResultManager(evalresultlocal.New(evalresult.WithBaseDir("./evaldata"))), + debugserver.WithMetricManager(metriclocal.New(metric.WithBaseDir("./evaldata"))), +) +// Debug Server returns an http.Handler; register it to your HTTP server. +_ = http.ListenAndServe(":8000", srv.Handler()) +``` + +For a full example, see [examples/evaluation/debug](https://github.com/trpc-group/trpc-agent-go/tree/main/examples/evaluation/debug). + ### Local File Path There are three types of local files: diff --git a/docs/mkdocs/zh/evaluation.md b/docs/mkdocs/zh/evaluation.md index 9edd3656c..ffd9125db 100644 --- a/docs/mkdocs/zh/evaluation.md +++ b/docs/mkdocs/zh/evaluation.md @@ -6,16 +6,6 @@ Evaluation 提供完整的 Agent 评估框架,支持本地文件和内存两 本节介绍如何在本地文件系统 local 或内存 inmemory 模式下执行 Agent 评估流程。 -### Evaluation 搭配 Debug Server - -若需要在 ADK Web 中一边调试 Agent、一边把真实会话转成评估用例,可以直接复用 `examples/evaluation/debug` 示例: - -1. 通过 `debug.New` 启动调试服务器时,传入 `debug.WithEvalSetManager`、`debug.WithEvalResultManager`、`debug.WithMetricManager` 等选项,把评估集、指标以及评估结果落盘到指定目录。 -2. ADK Web 连接到该服务器后,聊天产生的 session 可在 UI 里使用 “Convert to Eval Case” 功能写入本地 `*.evalset.json`/`*.metrics.json`。 -3. 
UI 中的 Eval 标签页或 `run`/`run_eval` API 会自动读取这些配置,并在 `-output-dir` 写出 `*.evalset_result.json`,便于离线分析或版本管理。 - -完整流程(含命令行参数示例与数据目录结构)见 [examples/evaluation/debug](https://github.com/trpc-group/trpc-agent-go/tree/main/examples/evaluation/debug)。 - ### 本地文件系统 local local 在本地文件系统上维护评估集、评估指标和评估结果。 @@ -720,6 +710,39 @@ agentEvaluator, err := evaluation.New(appName, runner, evaluation.WithNumRuns(nu ## 使用指南 +### Debug Server 集成 + +Debug Server 集成了 Evaluation 管理与运行接口,提供了可视化评估能力。 + +```go +import ( + "net/http" + + "trpc.group/trpc-go/trpc-agent-go/agent" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult" + evalresultlocal "trpc.group/trpc-go/trpc-agent-go/evaluation/evalresult/local" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + evalsetlocal "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset/local" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + metriclocal "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/local" + debugserver "trpc.group/trpc-go/trpc-agent-go/server/debug" +) + +agents := map[string]agent.Agent{ + "math-app": myAgent, +} +srv := debugserver.New( + agents, + debugserver.WithEvalSetManager(evalsetlocal.New(evalset.WithBaseDir("./evaldata"))), + debugserver.WithEvalResultManager(evalresultlocal.New(evalresult.WithBaseDir("./evaldata"))), + debugserver.WithMetricManager(metriclocal.New(metric.WithBaseDir("./evaldata"))), +) +// Debug Server returns an http.Handler; register it to your HTTP server. 
+_ = http.ListenAndServe(":8000", srv.Handler()) +``` + +完整代码参见 [examples/evaluation/debug](https://github.com/trpc-group/trpc-agent-go/tree/main/examples/evaluation/debug)。 + ### 本地文件路径 本地文件有三种: diff --git a/server/debug/openapi.json b/server/debug/openapi.json index 8b15afe82..dfb546808 100644 --- a/server/debug/openapi.json +++ b/server/debug/openapi.json @@ -200,6 +200,228 @@ } } } + }, + "/apps/{appName}/eval-sets": { + "parameters": [ + { "$ref": "#/components/parameters/appName" } + ], + "post": { + "summary": "Create a new eval set.", + "operationId": "createEvalSet", + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/CreateEvalSetRequest" } + } + } + }, + "responses": { + "200": { + "description": "The created eval set.", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/EvalSet" } + } + } + }, + "400": { "description": "Failed to create the eval set." } + } + }, + "get": { + "summary": "List eval set IDs for the app.", + "operationId": "listEvalSets", + "responses": { + "200": { + "description": "Eval set IDs.", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/ListEvalSetsResponse" } + } + } + } + } + } + }, + "/apps/{appName}/eval-sets/{evalSetId}/add-session": { + "parameters": [ + { "$ref": "#/components/parameters/appName" }, + { "$ref": "#/components/parameters/evalSetId" } + ], + "post": { + "summary": "Convert a recorded session into an eval case and append it to the eval set.", + "operationId": "addSessionToEvalSet", + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/AddSessionToEvalSetRequest" } + } + } + }, + "responses": { + "200": { "description": "Eval case added." }, + "400": { "description": "Request invalid or session not found." 
} + } + } + }, + "/apps/{appName}/eval-sets/{evalSetId}/evals": { + "parameters": [ + { "$ref": "#/components/parameters/appName" }, + { "$ref": "#/components/parameters/evalSetId" } + ], + "get": { + "summary": "List eval case IDs in the eval set.", + "operationId": "listEvalCases", + "responses": { + "200": { + "description": "Eval case IDs.", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { "type": "string" } + } + } + } + }, + "400": { "description": "Eval set not found." } + } + } + }, + "/apps/{appName}/eval-sets/{evalSetId}/eval-cases/{evalCaseId}": { + "parameters": [ + { "$ref": "#/components/parameters/appName" }, + { "$ref": "#/components/parameters/evalSetId" }, + { "$ref": "#/components/parameters/evalCaseId" } + ], + "get": { + "summary": "Get a single eval case.", + "operationId": "getEvalCase", + "responses": { + "200": { + "description": "Eval case details.", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/EvalCase" } + } + } + }, + "404": { "description": "Eval case not found." } + } + }, + "put": { + "summary": "Update an eval case.", + "operationId": "updateEvalCase", + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/EvalCase" } + } + } + }, + "responses": { + "200": { "description": "Eval case updated." }, + "404": { "description": "Eval case not found." } + } + }, + "delete": { + "summary": "Delete an eval case.", + "operationId": "deleteEvalCase", + "responses": { + "200": { "description": "Eval case deleted." }, + "404": { "description": "Eval case not found." 
} + } + } + }, + "/apps/{appName}/eval-sets/{evalSetId}/run": { + "parameters": [ + { "$ref": "#/components/parameters/appName" }, + { "$ref": "#/components/parameters/evalSetId" } + ], + "post": { + "summary": "Run evaluation for an eval set.", + "operationId": "runEval", + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/RunEvalRequest" } + } + } + }, + "responses": { + "200": { + "description": "Per-case evaluation results.", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/RunEvalResponse" } + } + } + }, + "400": { "description": "Evaluation failed." } + } + } + }, + "/apps/{appName}/eval-results": { + "parameters": [ + { "$ref": "#/components/parameters/appName" } + ], + "get": { + "summary": "List saved evaluation result IDs for the app.", + "operationId": "listEvalResults", + "responses": { + "200": { + "description": "Eval result IDs.", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/ListEvalResultsResponse" } + } + } + } + } + } + }, + "/apps/{appName}/eval-results/{evalResultId}": { + "parameters": [ + { "$ref": "#/components/parameters/appName" }, + { "$ref": "#/components/parameters/evalResultId" } + ], + "get": { + "summary": "Fetch a full evaluation result by ID.", + "operationId": "getEvalResult", + "responses": { + "200": { + "description": "Evaluation result.", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/EvalSetResult" } + } + } + }, + "404": { "description": "Evaluation result not found." 
} + } + } + }, + "/apps/{appName}/metrics-info": { + "parameters": [ + { "$ref": "#/components/parameters/appName" } + ], + "get": { + "summary": "List registered evaluation metrics and metadata.", + "operationId": "listMetricsInfo", + "responses": { + "200": { + "description": "Metric metadata.", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/ListMetricsInfoResponse" } + } + } + } + } + } } }, "components": { @@ -224,6 +446,27 @@ "required": true, "description": "Session identifier.", "schema": { "type": "string" } + }, + "evalSetId": { + "name": "evalSetId", + "in": "path", + "required": true, + "description": "Eval set identifier.", + "schema": { "type": "string" } + }, + "evalCaseId": { + "name": "evalCaseId", + "in": "path", + "required": true, + "description": "Eval case identifier.", + "schema": { "type": "string" } + }, + "evalResultId": { + "name": "evalResultId", + "in": "path", + "required": true, + "description": "Eval result identifier.", + "schema": { "type": "string" } } }, "schemas": { @@ -345,7 +588,256 @@ "description": "Generic event envelope returned by the agent. The exact shape is loosely typed to avoid coupling the API spec to internal implementation details.", "type": "object", "additionalProperties": true + }, + "EvalSet": { + "type": "object", + "properties": { + "evalSetId": { "type": "string" }, + "name": { "type": "string" }, + "description": { "type": "string" }, + "evalCases": { + "type": "array", + "items": { "$ref": "#/components/schemas/EvalCase" } + }, + "creationTimestamp": { + "type": "number", + "format": "double", + "description": "Unix seconds." 
+ } + } + }, + "EvalCase": { + "type": "object", + "properties": { + "evalId": { "type": "string" }, + "conversation": { + "type": "array", + "items": { "$ref": "#/components/schemas/Invocation" } + }, + "sessionInput": { "$ref": "#/components/schemas/SessionInput" }, + "creationTimestamp": { + "type": "number", + "format": "double", + "description": "Unix seconds." + } + } + }, + "Invocation": { + "type": "object", + "properties": { + "invocationId": { "type": "string" }, + "userContent": { "$ref": "#/components/schemas/Content" }, + "finalResponse": { "$ref": "#/components/schemas/Content" }, + "intermediateData": { "$ref": "#/components/schemas/IntermediateData" }, + "creationTimestamp": { + "type": "number", + "format": "double", + "description": "Unix seconds." + } + } + }, + "IntermediateData": { + "type": "object", + "properties": { + "toolUses": { + "type": "array", + "items": { "$ref": "#/components/schemas/FunctionCall" } + }, + "toolResponses": { + "type": "array", + "items": { "$ref": "#/components/schemas/FunctionResponse" } + }, + "intermediateResponses": { + "type": "array", + "items": { + "type": "array", + "items": {} + } + } + } + }, + "SessionInput": { + "type": "object", + "properties": { + "appName": { "type": "string" }, + "userId": { "type": "string" }, + "state": { + "type": "object", + "additionalProperties": true + } + } + }, + "EvalMetric": { + "type": "object", + "properties": { + "metricName": { "type": "string" }, + "threshold": { "type": "number", "format": "double" } + } + }, + "RunEvalRequest": { + "type": "object", + "properties": { + "evalCaseIds": { + "type": "array", + "items": { "type": "string" } + }, + "evalMetrics": { + "type": "array", + "items": { "$ref": "#/components/schemas/EvalMetric" } + } + } + }, + "EvalMetricResult": { + "type": "object", + "properties": { + "metricName": { "type": "string" }, + "score": { "type": "number", "format": "double" }, + "evalStatus": { "type": "integer", "format": "int32" }, + 
"threshold": { "type": "number", "format": "double" }, + "details": { + "type": "object", + "additionalProperties": true + } + } + }, + "EvalMetricResultPerInvocation": { + "type": "object", + "properties": { + "actualInvocation": { "$ref": "#/components/schemas/Invocation" }, + "expectedInvocation": { "$ref": "#/components/schemas/Invocation" }, + "evalMetricResults": { + "type": "array", + "items": { "$ref": "#/components/schemas/EvalMetricResult" } + } + } + }, + "RunEvalResult": { + "type": "object", + "properties": { + "evalSetFile": { "type": "string" }, + "evalSetId": { "type": "string" }, + "evalId": { "type": "string" }, + "finalEvalStatus": { "type": "integer", "format": "int32" }, + "overallEvalMetricResults": { + "type": "array", + "items": { "$ref": "#/components/schemas/EvalMetricResult" } + }, + "evalMetricResultPerInvocation": { + "type": "array", + "items": { "$ref": "#/components/schemas/EvalMetricResultPerInvocation" } + }, + "userId": { "type": "string" }, + "sessionId": { "type": "string" } + } + }, + "RunEvalResponse": { + "type": "object", + "properties": { + "runEvalResults": { + "type": "array", + "items": { "$ref": "#/components/schemas/RunEvalResult" } + } + } + }, + "EvalCaseResult": { + "type": "object", + "properties": { + "evalSetId": { "type": "string" }, + "evalId": { "type": "string" }, + "finalEvalStatus": { "type": "integer", "format": "int32" }, + "overallEvalMetricResults": { + "type": "array", + "items": { "$ref": "#/components/schemas/EvalMetricResult" } + }, + "evalMetricResultPerInvocation": { + "type": "array", + "items": { "$ref": "#/components/schemas/EvalMetricResultPerInvocation" } + }, + "sessionId": { "type": "string" }, + "userId": { "type": "string" } + } + }, + "EvalSetResult": { + "type": "object", + "properties": { + "evalSetResultId": { "type": "string" }, + "evalSetResultName": { "type": "string" }, + "evalSetId": { "type": "string" }, + "evalCaseResults": { + "type": "array", + "items": { "$ref": 
"#/components/schemas/EvalCaseResult" } + }, + "creationTimestamp": { + "type": "number", + "format": "double", + "description": "Unix seconds." + } + } + }, + "ListEvalSetsResponse": { + "type": "object", + "properties": { + "evalSetIds": { + "type": "array", + "items": { "type": "string" } + } + } + }, + "CreateEvalSetRequest": { + "type": "object", + "properties": { + "evalSet": { "$ref": "#/components/schemas/EvalSet" } + } + }, + "AddSessionToEvalSetRequest": { + "type": "object", + "properties": { + "evalId": { "type": "string" }, + "sessionId": { "type": "string" }, + "userId": { "type": "string" } + } + }, + "ListEvalResultsResponse": { + "type": "object", + "properties": { + "evalResultIds": { + "type": "array", + "items": { "type": "string" } + } + } + }, + "MetricInterval": { + "type": "object", + "properties": { + "minValue": { "type": "number", "format": "double" }, + "openAtMin": { "type": "boolean" }, + "maxValue": { "type": "number", "format": "double" }, + "openAtMax": { "type": "boolean" } + } + }, + "MetricValueInfo": { + "type": "object", + "properties": { + "interval": { "$ref": "#/components/schemas/MetricInterval" } + } + }, + "MetricInfo": { + "type": "object", + "properties": { + "metricName": { "type": "string" }, + "description": { "type": "string" }, + "metricValueInfo": { "$ref": "#/components/schemas/MetricValueInfo" } + } + }, + "ListMetricsInfoResponse": { + "type": "object", + "properties": { + "metricsInfo": { + "type": "array", + "items": { "$ref": "#/components/schemas/MetricInfo" } + } + } } } } -} \ No newline at end of file +} From 6ab614b94d9aae16939cc0b4b4da716ad41e7ac1 Mon Sep 17 00:00:00 2001 From: hackerli Date: Sat, 22 Nov 2025 00:43:07 +0800 Subject: [PATCH 03/14] feat --- evaluation/metric/criterion/criterion.go | 27 + evaluation/metric/criterion/criterion_test.go | 28 + .../metric/criterion/maptext/maptext.go | 51 ++ .../metric/criterion/maptext/maptext_test.go | 65 +++ evaluation/metric/criterion/options.go | 38 ++ 
evaluation/metric/criterion/options_test.go | 28 + evaluation/metric/criterion/text/text.go | 78 +++ evaluation/metric/criterion/text/text_test.go | 84 +++ .../criterion/tooltrajectory/options.go | 99 ++++ .../criterion/tooltrajectory/options_test.go | 71 +++ .../tooltrajectory/tooltrajectory.go | 249 +++++++++ .../tooltrajectory/tooltrajectory_test.go | 522 ++++++++++++++++++ evaluation/metric/metric.go | 13 +- .../service/internal/inference/inference.go | 35 +- .../internal/inference/inference_test.go | 51 ++ 15 files changed, 1432 insertions(+), 7 deletions(-) create mode 100644 evaluation/metric/criterion/criterion.go create mode 100644 evaluation/metric/criterion/criterion_test.go create mode 100644 evaluation/metric/criterion/maptext/maptext.go create mode 100644 evaluation/metric/criterion/maptext/maptext_test.go create mode 100644 evaluation/metric/criterion/options.go create mode 100644 evaluation/metric/criterion/options_test.go create mode 100644 evaluation/metric/criterion/text/text.go create mode 100644 evaluation/metric/criterion/text/text_test.go create mode 100644 evaluation/metric/criterion/tooltrajectory/options.go create mode 100644 evaluation/metric/criterion/tooltrajectory/options_test.go create mode 100644 evaluation/metric/criterion/tooltrajectory/tooltrajectory.go create mode 100644 evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go diff --git a/evaluation/metric/criterion/criterion.go b/evaluation/metric/criterion/criterion.go new file mode 100644 index 000000000..53fe46f39 --- /dev/null +++ b/evaluation/metric/criterion/criterion.go @@ -0,0 +1,27 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package criterion provides configurable evaluation criteria. 
+package criterion + +import "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" + +// Criterion encapsulates multiple evaluation criteria for comprehensive model behavior assessment. +type Criterion struct { + // ToolTrajectory configures checks for tool call and response sequences. + ToolTrajectory *tooltrajectory.ToolTrajectoryCriterion `json:"toolTrajectory,omitempty"` +} + +// New creates a Criterion with the provided options. +func New(opt ...Option) *Criterion { + opts := newOptions(opt...) + return &Criterion{ + ToolTrajectory: opts.ToolTrajectory, + } +} diff --git a/evaluation/metric/criterion/criterion_test.go b/evaluation/metric/criterion/criterion_test.go new file mode 100644 index 000000000..ba29f2ad9 --- /dev/null +++ b/evaluation/metric/criterion/criterion_test.go @@ -0,0 +1,28 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package criterion + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +func TestCriterionNewDefaults(t *testing.T) { + c := New() + assert.NotNil(t, c.ToolTrajectory) +} + +func TestCriterionWithToolTrajectory(t *testing.T) { + custom := tooltrajectory.New() + c := New(WithToolTrajectory(custom)) + assert.Equal(t, custom, c.ToolTrajectory) +} diff --git a/evaluation/metric/criterion/maptext/maptext.go b/evaluation/metric/criterion/maptext/maptext.go new file mode 100644 index 000000000..7452876b1 --- /dev/null +++ b/evaluation/metric/criterion/maptext/maptext.go @@ -0,0 +1,51 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +// Package maptext defines map-based comparison criteria. +package maptext + +import ( + "encoding/json" + "fmt" + "reflect" + + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +// MapTextCriterion compares two string-keyed maps. +type MapTextCriterion struct { + // TextCriterion applies string-based matching on JSON-serialized maps. + TextCriterion *text.TextCriterion `json:"textCriterion,omitempty"` + // Compare overrides default comparison when provided. + Compare func(actual, expected map[string]any) error `json:"-"` +} + +// Match compares two maps using custom logic, text-based matching, or deep equality. +func (m *MapTextCriterion) Match(actual, expected map[string]any) error { + if m.Compare != nil { + return m.Compare(actual, expected) + } + if m.TextCriterion != nil { + // Although the keys in a map are unordered, json.Marshal guarantees the order of the keys, + // so we can directly use json.Marshal for comparison. + actualData, err := json.Marshal(actual) + if err != nil { + return fmt.Errorf("marshal actual: %w", err) + } + expectedData, err := json.Marshal(expected) + if err != nil { + return fmt.Errorf("marshal expected: %w", err) + } + return m.TextCriterion.Match(string(actualData), string(expectedData)) + } + if reflect.DeepEqual(actual, expected) { + return nil + } + return fmt.Errorf("actual %v and expected %v do not match", actual, expected) +} diff --git a/evaluation/metric/criterion/maptext/maptext_test.go b/evaluation/metric/criterion/maptext/maptext_test.go new file mode 100644 index 000000000..bd51156e1 --- /dev/null +++ b/evaluation/metric/criterion/maptext/maptext_test.go @@ -0,0 +1,65 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package maptext + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +func TestMapTextCriterionCompareOverride(t *testing.T) { + called := false + criterion := &MapTextCriterion{ + Compare: func(actual, expected map[string]any) error { + called = true + return nil + }, + } + err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "v"}) + assert.NoError(t, err) + assert.True(t, called) +} + +func TestMapTextCriterionTextMatch(t *testing.T) { + criterion := &MapTextCriterion{ + TextCriterion: &text.TextCriterion{ + CaseInsensitive: true, + MatchStrategy: text.TextMatchStrategyExact, + }, + } + err := criterion.Match(map[string]any{"msg": "Hello"}, map[string]any{"msg": "hello"}) + assert.NoError(t, err) +} + +func TestMapTextCriterionDeepEqualMismatch(t *testing.T) { + criterion := &MapTextCriterion{} + err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "diff"}) + assert.Error(t, err) +} + +func TestMapTextCriterionMarshalErrors(t *testing.T) { + criterion := &MapTextCriterion{ + TextCriterion: &text.TextCriterion{}, + } + // Actual marshal error. + actualErr := criterion.Match(map[string]any{"bad": make(chan int)}, map[string]any{"k": "v"}) + assert.Error(t, actualErr) + // Expected marshal error. + expectedErr := criterion.Match(map[string]any{"k": "v"}, map[string]any{"bad": make(chan int)}) + assert.Error(t, expectedErr) +} + +func TestMapTextCriterionDeepEqualSuccess(t *testing.T) { + criterion := &MapTextCriterion{} + err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "v"}) + assert.NoError(t, err) +} diff --git a/evaluation/metric/criterion/options.go b/evaluation/metric/criterion/options.go new file mode 100644 index 000000000..9a45e567d --- /dev/null +++ b/evaluation/metric/criterion/options.go @@ -0,0 +1,38 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. 
+// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package criterion + +import "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" + +// options aggregates configurable parts of Criterion. +type options struct { + ToolTrajectory *tooltrajectory.ToolTrajectoryCriterion // ToolTrajectory sets the default tool trajectory criterion. +} + +// newOptions creates a Options with the provided options. +func newOptions(opt ...Option) *options { + opts := &options{ + ToolTrajectory: tooltrajectory.New(), + } + for _, o := range opt { + o(opts) + } + return opts +} + +// Option is a function that configures Criterion. +type Option func(*options) + +// WithToolTrajectory sets the tool trajectory criterion. +func WithToolTrajectory(toolTrajectory *tooltrajectory.ToolTrajectoryCriterion) Option { + return func(o *options) { + o.ToolTrajectory = toolTrajectory + } +} diff --git a/evaluation/metric/criterion/options_test.go b/evaluation/metric/criterion/options_test.go new file mode 100644 index 000000000..c475638cb --- /dev/null +++ b/evaluation/metric/criterion/options_test.go @@ -0,0 +1,28 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
// TextCriterion governs how two strings should be compared.
type TextCriterion struct {
	// Ignore skips comparison when true.
	Ignore bool `json:"ignore,omitempty"`
	// CaseInsensitive makes the comparison ignore letter case.
	CaseInsensitive bool `json:"caseInsensitive,omitempty"`
	// MatchStrategy selects the comparison rule.
	MatchStrategy TextMatchStrategy `json:"matchStrategy,omitempty"`
	// Compare overrides built-in strategies.
	Compare func(actual, expected string) error `json:"-"`
}

// TextMatchStrategy enumerates supported text comparison strategies.
type TextMatchStrategy string

const (
	// TextMatchStrategyExact matches strings exactly.
	TextMatchStrategyExact TextMatchStrategy = "exact"
	// TextMatchStrategyContains matches strings that contain the target.
	TextMatchStrategyContains TextMatchStrategy = "contains"
	// TextMatchStrategyRegex matches strings that match the regex.
	TextMatchStrategyRegex TextMatchStrategy = "regex"
)

// Match compares source and target using the configured strategy and returns
// nil when they match, or an error describing the mismatch.
//
// Evaluation order:
//  1. Compare, when set, decides the outcome on its own (Ignore is not consulted).
//  2. Ignore, when true, always yields nil.
//  3. Otherwise MatchStrategy is applied: "exact" requires equality, "contains"
//     requires source to contain target, and "regex" treats target as a regular
//     expression that must match somewhere in source.
//
// An invalid regular expression or an unrecognized strategy (including the
// empty zero value) is reported as an error.
func (t *TextCriterion) Match(source, target string) error {
	if t.Compare != nil {
		return t.Compare(source, target)
	}
	if t.Ignore {
		return nil
	}
	// Case folding by lowercasing is only safe for literal comparisons. A regex
	// pattern must not be lowercased: that would turn classes such as \D, \S,
	// \W or \B into their opposites. The regex branch uses the (?i) flag instead.
	if t.CaseInsensitive && t.MatchStrategy != TextMatchStrategyRegex {
		source = strings.ToLower(source)
		target = strings.ToLower(target)
	}
	switch t.MatchStrategy {
	case TextMatchStrategyExact:
		if source == target {
			return nil
		}
		return fmt.Errorf("source %s and target %s do not match", source, target)
	case TextMatchStrategyContains:
		if strings.Contains(source, target) {
			return nil
		}
		return fmt.Errorf("source %s does not contain target %s", source, target)
	case TextMatchStrategyRegex:
		pattern := target
		if t.CaseInsensitive {
			pattern = "(?i)" + pattern
		}
		re, err := regexp.Compile(pattern)
		if err != nil {
			return fmt.Errorf("invalid regex %s: %w", target, err)
		}
		if re.MatchString(source) {
			return nil
		}
		return fmt.Errorf("source %s does not match regex %s", source, target)
	default:
		return fmt.Errorf("invalid match strategy %s", t.MatchStrategy)
	}
}
+// +// + +package text + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestTextCriterionMatchStrategies(t *testing.T) { + criterion := &TextCriterion{ + CaseInsensitive: true, + MatchStrategy: TextMatchStrategyContains, + } + err := criterion.Match("Hello World", "hello") + assert.NoError(t, err) +} + +func TestTextCriterionIgnore(t *testing.T) { + criterion := &TextCriterion{ + Ignore: true, + } + err := criterion.Match("anything", "value") + assert.NoError(t, err) +} + +func TestTextCriterionRegexInvalid(t *testing.T) { + criterion := &TextCriterion{ + MatchStrategy: TextMatchStrategyRegex, + } + err := criterion.Match("source", "[invalid(") + assert.Error(t, err) +} + +func TestTextCriterionUnknownStrategy(t *testing.T) { + criterion := &TextCriterion{ + MatchStrategy: TextMatchStrategy("unknown"), + } + err := criterion.Match("a", "b") + assert.Error(t, err) +} + +func TestTextCriterionAllBranches(t *testing.T) { + customCalled := false + custom := &TextCriterion{ + Compare: func(actual, expected string) error { + customCalled = true + return nil + }, + } + err := custom.Match("x", "y") + assert.NoError(t, err) + assert.True(t, customCalled) + + exact := &TextCriterion{ + MatchStrategy: TextMatchStrategyExact, + } + err = exact.Match("same", "same") + assert.NoError(t, err) + err = exact.Match("same", "diff") + assert.Error(t, err) + + contains := &TextCriterion{ + MatchStrategy: TextMatchStrategyContains, + } + err = contains.Match("hello", "missing") + assert.Error(t, err) + + regex := &TextCriterion{ + MatchStrategy: TextMatchStrategyRegex, + } + err = regex.Match("abc123", "abc[0-9]+") + assert.NoError(t, err) + err = regex.Match("xyz", "abc[0-9]+") + assert.Error(t, err) +} diff --git a/evaluation/metric/criterion/tooltrajectory/options.go b/evaluation/metric/criterion/tooltrajectory/options.go new file mode 100644 index 000000000..dcaa87cc6 --- /dev/null +++ b/evaluation/metric/criterion/tooltrajectory/options.go @@ -0,0 +1,99 
@@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package tooltrajectory + +import ( + "fmt" + "reflect" + + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +// defaultToolTrajectoryStrategy is used when no user strategy is supplied. +var defaultToolTrajectoryStrategy = &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &maptext.MapTextCriterion{ + Compare: func(actual, expected map[string]any) error { + if !reflect.DeepEqual(actual, expected) { + return fmt.Errorf("actual %v and expected %v do not match", actual, expected) + } + return nil + }, + }, + Response: &maptext.MapTextCriterion{ + Compare: func(actual, expected map[string]any) error { + if !reflect.DeepEqual(actual, expected) { + return fmt.Errorf("actual %v and expected %v do not match", actual, expected) + } + return nil + }, + }, +} + +// options configures ToolTrajectoryCriterion. +type options struct { + // defaultStrategy sets the fallback strategy when no tool-specific strategy is defined. + defaultStrategy *ToolTrajectoryStrategy + // toolStrategy configures per-tool strategies keyed by tool name. + toolStrategy map[string]*ToolTrajectoryStrategy + // orderInsensitive toggles order-agnostic comparison for args and responses. + orderInsensitive bool + // compare allows overriding comparison logic entirely. + compare func(actual, expected *evalset.Invocation) error +} + +// newOptions applies provided options for ToolTrajectoryCriterion. 
+func newOptions(opt ...Option) *options { + opts := &options{ + defaultStrategy: defaultToolTrajectoryStrategy, + toolStrategy: nil, + orderInsensitive: false, + compare: nil, + } + for _, o := range opt { + o(opts) + } + return opts +} + +// Option is a function that configures ToolTrajectoryCriterion. +type Option func(*options) + +// WithDefault sets the default tool trajectory strategy. +func WithDefault(defaultStrategy *ToolTrajectoryStrategy) Option { + return func(o *options) { + o.defaultStrategy = defaultStrategy + } +} + +// WithTool sets the per-tool strategies keyed by tool name. +func WithTool(tool map[string]*ToolTrajectoryStrategy) Option { + return func(o *options) { + o.toolStrategy = tool + } +} + +// WithOrderInsensitive sets the order-agnostic comparison for tool calls and responses. +func WithOrderInsensitive(orderInsensitive bool) Option { + return func(o *options) { + o.orderInsensitive = orderInsensitive + } +} + +// WithCompare sets the tool trajectory comparison logic. +func WithCompare(compare func(actual, expected *evalset.Invocation) error) Option { + return func(o *options) { + o.compare = compare + } +} diff --git a/evaluation/metric/criterion/tooltrajectory/options_test.go b/evaluation/metric/criterion/tooltrajectory/options_test.go new file mode 100644 index 000000000..005213674 --- /dev/null +++ b/evaluation/metric/criterion/tooltrajectory/options_test.go @@ -0,0 +1,71 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package tooltrajectory + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" +) + +func TestNewOptionsDefaults(t *testing.T) { + opts := newOptions() + assert.Equal(t, defaultToolTrajectoryStrategy, opts.defaultStrategy) + assert.Nil(t, opts.toolStrategy) + assert.False(t, opts.orderInsensitive) + assert.Nil(t, opts.compare) +} + +func TestWithDefault(t *testing.T) { + custom := &ToolTrajectoryStrategy{} + opts := newOptions(WithDefault(custom)) + assert.Equal(t, custom, opts.defaultStrategy) +} + +func TestWithTool(t *testing.T) { + tool := map[string]*ToolTrajectoryStrategy{ + "custom": {}, + } + opts := newOptions(WithTool(tool)) + assert.Equal(t, tool, opts.toolStrategy) +} + +func TestWithOrderInsensitive(t *testing.T) { + opts := newOptions(WithOrderInsensitive(true)) + assert.True(t, opts.orderInsensitive) +} + +func TestWithCompare(t *testing.T) { + var called bool + compare := func(actual, expected *evalset.Invocation) error { + called = true + return nil + } + opts := newOptions(WithCompare(compare)) + assert.NotNil(t, opts.compare) + err := opts.compare(nil, nil) + assert.NoError(t, err) + assert.True(t, called) +} + +func TestDefaultToolTrajectoryStrategyDeepEqualMismatch(t *testing.T) { + errArgs := defaultToolTrajectoryStrategy.Arguments.Match( + map[string]any{"a": 1}, + map[string]any{"a": 2}, + ) + assert.Error(t, errArgs) + + errResp := defaultToolTrajectoryStrategy.Response.Match( + map[string]any{"r": 1}, + map[string]any{"r": 3}, + ) + assert.Error(t, errResp) +} diff --git a/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go b/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go new file mode 100644 index 000000000..ac077d13e --- /dev/null +++ b/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go @@ -0,0 +1,249 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. 
+// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package tooltrajectory defines tool trajectory comparison criteria. +package tooltrajectory + +import ( + "encoding/json" + "errors" + "fmt" + "sort" + + "google.golang.org/genai" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +// New creates a ToolTrajectoryCriterion with the provided options. +func New(opt ...Option) *ToolTrajectoryCriterion { + opts := newOptions(opt...) + return &ToolTrajectoryCriterion{ + DefaultStrategy: opts.defaultStrategy, + ToolStrategy: opts.toolStrategy, + OrderInsensitive: opts.orderInsensitive, + Compare: opts.compare, + } +} + +// ToolTrajectoryCriterion provides comparison rules for tool call and response sequences. +type ToolTrajectoryCriterion struct { + // DefaultStrategy applies when no tool-specific strategy is provided. + DefaultStrategy *ToolTrajectoryStrategy `json:"defaultStrategy,omitempty"` + // ToolStrategy holds per-tool strategies keyed by tool name. + ToolStrategy map[string]*ToolTrajectoryStrategy `json:"toolStrategy,omitempty"` + // OrderInsensitive toggles comparison order for args and responses. + OrderInsensitive bool `json:"orderInsensitive,omitempty"` + // Compare allows custom comparison override. + Compare func(actual, expected *evalset.Invocation) error `json:"-"` +} + +// Match compares actual and expected invocations according to tool trajectory rules. 
+func (t *ToolTrajectoryCriterion) Match(actual, expected *evalset.Invocation) error { + if t.Compare != nil { + return t.Compare(actual, expected) + } + if actual == nil || expected == nil { + return fmt.Errorf("actual or expected invocation is nil") + } + if actual.IntermediateData == nil || expected.IntermediateData == nil { + return fmt.Errorf("actual or expected intermediate data is nil") + } + // Ensure one-to-one mapping between tool calls and responses on actual invocation. + if len(actual.IntermediateData.ToolUses) != len(actual.IntermediateData.ToolResponses) { + return fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", + len(actual.IntermediateData.ToolUses), len(actual.IntermediateData.ToolResponses)) + } + // Ensure one-to-one mapping between tool calls and responses on expected invocation. + if len(expected.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolResponses) { + return fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", + len(expected.IntermediateData.ToolUses), len(expected.IntermediateData.ToolResponses)) + } + // Ensure the same number of tool uses before detailed comparison. 
+ if len(actual.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolUses) { + return fmt.Errorf("tool uses count mismatch: %d != %d", + len(actual.IntermediateData.ToolUses), len(expected.IntermediateData.ToolUses)) + } + if len(actual.IntermediateData.ToolUses) == 0 { + return nil + } + actualTools, err := t.getToolComparers( + actual.IntermediateData.ToolUses, + actual.IntermediateData.ToolResponses, + t.OrderInsensitive, + ) + if err != nil { + return fmt.Errorf("get actual tools: %w", err) + } + expectedTools, err := t.getToolComparers( + expected.IntermediateData.ToolUses, + expected.IntermediateData.ToolResponses, + t.OrderInsensitive, + ) + if err != nil { + return fmt.Errorf("get expected tools: %w", err) + } + if t.OrderInsensitive { + sort.Slice(actualTools, func(i, j int) bool { + return actualTools[i].lessThan(actualTools[j]) + }) + sort.Slice(expectedTools, func(i, j int) bool { + return expectedTools[i].lessThan(expectedTools[j]) + }) + } + for i := range len(actualTools) { + strategy := t.getStrategy(actualTools[i], expectedTools[i]) + if err := strategy.Match(actualTools[i], expectedTools[i]); err != nil { + return fmt.Errorf("tool %s mismatch: %w", actualTools[i].name, err) + } + } + return nil +} + +// getToolComparers aligns tool uses with their responses and builds toolComparer. +func (t *ToolTrajectoryCriterion) getToolComparers(toolUses []*genai.FunctionCall, + toolResponses []*genai.FunctionResponse, orderInsensitive bool) ([]*toolComparer, error) { + // toolCallIDs ensures every tool use can be matched by ID. + // Map from tool call id to index. + toolCallIDs := make(map[string]int) + for i := range len(toolUses) { + if toolUses[i].ID == "" { + return nil, fmt.Errorf("tool use id is empty") + } + if _, ok := toolCallIDs[toolUses[i].ID]; ok { + return nil, fmt.Errorf("tool use id %s is duplicated", toolUses[i].ID) + } + toolCallIDs[toolUses[i].ID] = i + } + // toolResponseIDs ensures every tool response can be matched by ID. 
+ // Map from tool response id to index. + toolResponseIDs := make(map[string]int) + for i := range len(toolResponses) { + if toolResponses[i].ID == "" { + return nil, fmt.Errorf("tool response id is empty") + } + if _, ok := toolResponseIDs[toolResponses[i].ID]; ok { + return nil, fmt.Errorf("tool response id %s is duplicated", toolResponses[i].ID) + } + toolResponseIDs[toolResponses[i].ID] = i + } + for toolID := range toolCallIDs { + if _, ok := toolResponseIDs[toolID]; !ok { + return nil, fmt.Errorf("tool id %s is missing response", toolID) + } + } + toolComparers := make([]*toolComparer, 0, len(toolUses)) + for i := range len(toolUses) { + toolComparer, err := getToolComparer( + toolUses[i], + toolResponses[toolResponseIDs[toolUses[i].ID]], + orderInsensitive, + ) + if err != nil { + return nil, fmt.Errorf("get tool comparer: %w", err) + } + toolComparers = append(toolComparers, toolComparer) + } + return toolComparers, nil +} + +// getStrategy picks the comparison strategy for a specific tool pair. +func (t *ToolTrajectoryCriterion) getStrategy(actualTool, expectedTool *toolComparer) *ToolTrajectoryStrategy { + if t.ToolStrategy != nil { + strategy, ok := t.ToolStrategy[actualTool.name] + if ok { + return strategy + } + strategy, ok = t.ToolStrategy[expectedTool.name] + if ok { + return strategy + } + } + if t.DefaultStrategy != nil { + return t.DefaultStrategy + } + return defaultToolTrajectoryStrategy +} + +// ToolTrajectoryStrategy defines comparison strategies for a single tool. +type ToolTrajectoryStrategy struct { + Name *text.TextCriterion `json:"name,omitempty"` // Name compares tool names. + Arguments *maptext.MapTextCriterion `json:"arguments,omitempty"` // Arguments compares tool call arguments. + Response *maptext.MapTextCriterion `json:"response,omitempty"` // Response compares tool call responses. +} + +// Match validates a single tool call pair using configured criteria. 
+func (t *ToolTrajectoryStrategy) Match(actual, expected *toolComparer) error { + if t.Name != nil { + if err := t.Name.Match(actual.name, expected.name); err != nil { + return fmt.Errorf("name mismatch: %w", err) + } + } + if t.Arguments != nil { + if err := t.Arguments.Match(actual.args, expected.args); err != nil { + return fmt.Errorf("arguments mismatch: %w", err) + } + } + if t.Response != nil { + if err := t.Response.Match(actual.response, expected.response); err != nil { + return fmt.Errorf("response mismatch: %w", err) + } + } + return nil +} + +// toolComparer normalizes tool call and response data for comparison. +type toolComparer struct { + name string // name holds the tool name. + args map[string]any // args holds parsed tool arguments. + response map[string]any // response holds parsed tool response payload. + argsOrder string // argsOrder caches JSON for order-insensitive compare. + responseOrder string // responseOrder caches JSON for order-insensitive compare. +} + +// lessThan provides deterministic ordering when order-insensitive compares require sorting. +func (t *toolComparer) lessThan(other *toolComparer) bool { + if t.name != other.name { + return t.name < other.name + } + if t.argsOrder != other.argsOrder { + return t.argsOrder < other.argsOrder + } + if t.responseOrder != other.responseOrder { + return t.responseOrder < other.responseOrder + } + return false +} + +// getToolComparer pairs a tool use with its response and precomputes ordering hints. 
+func getToolComparer(toolUse *genai.FunctionCall, toolResponse *genai.FunctionResponse, + orderInsensitive bool) (*toolComparer, error) { + if toolUse == nil || toolResponse == nil { + return nil, errors.New("tool use or tool response is nil") + } + tool := &toolComparer{ + name: toolUse.Name, + args: toolUse.Args, + response: toolResponse.Response, + } + if orderInsensitive { + args, err := json.Marshal(toolUse.Args) + if err != nil { + return nil, fmt.Errorf("marshal arguments: %w", err) + } + response, err := json.Marshal(toolResponse.Response) + if err != nil { + return nil, fmt.Errorf("marshal response: %w", err) + } + tool.argsOrder = string(args) + tool.responseOrder = string(response) + } + return tool, nil +} diff --git a/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go b/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go new file mode 100644 index 000000000..76b4856e7 --- /dev/null +++ b/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go @@ -0,0 +1,522 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package tooltrajectory + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "google.golang.org/genai" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +func TestToolTrajectoryCriterionMatchOrderInsensitive(t *testing.T) { + actual := makeInvocation( + []toolData{ + {id: "call-1", name: "shared", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, + {id: "call-2", name: "shared", args: map[string]any{"a": 2}, response: map[string]any{"r": 3}}, + }, + ) + expected := makeInvocation( + []toolData{ + {id: "call-2", name: "shared", args: map[string]any{"a": 2}, response: map[string]any{"r": 3}}, + {id: "call-1", name: "shared", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, + }, + ) + + criterion := New(WithOrderInsensitive(true)) + err := criterion.Match(actual, expected) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionMissingResponse(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{}, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool"}, + }, + }, + } + criterion := New() + err := criterion.Match(actual, expected) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionCustomStrategy(t *testing.T) { + actual := makeInvocation( + []toolData{ + {id: "call-1", name: "custom", args: map[string]any{"k": "v"}, response: map[string]any{"r": "x"}}, + }, + ) + expected := makeInvocation( + []toolData{ + {id: "call-1", name: "custom", args: map[string]any{"k": "v"}, response: map[string]any{"r": "x"}}, + }, + ) 
+ customStrategy := &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, + } + criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{ + "custom": customStrategy, + })) + err := criterion.Match(actual, expected) + assert.NoError(t, err) +} + +type toolData struct { + id string + name string + args map[string]any + response map[string]any +} + +func makeInvocation(tools []toolData) *evalset.Invocation { + toolUses := make([]*genai.FunctionCall, 0, len(tools)) + toolResponses := make([]*genai.FunctionResponse, 0, len(tools)) + for _, t := range tools { + toolUses = append(toolUses, &genai.FunctionCall{ + ID: t.id, + Name: t.name, + Args: t.args, + }) + toolResponses = append(toolResponses, &genai.FunctionResponse{ + ID: t.id, + Name: t.name, + Response: t.response, + }) + } + return &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: toolUses, + ToolResponses: toolResponses, + }, + } +} + +func TestToolTrajectoryCriterionIDMismatch(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "use-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "resp-1", Name: "tool"}, + }, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "use-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "use-1", Name: "tool"}, + }, + }, + } + criterion := New() + err := criterion.Match(actual, expected) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionNilInvocation(t *testing.T) { + criterion := New() + err := criterion.Match(nil, makeInvocation(nil)) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionNilIntermediate(t *testing.T) { + criterion := New() + err := criterion.Match(&evalset.Invocation{}, &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}}) + assert.Error(t, 
err) +} + +func TestToolTrajectoryCriterionEmptyToolUseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "resp-1", Name: "tool"}, + }, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "resp-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "resp-1", Name: "tool"}, + }, + }, + } + err := New().Match(actual, expected) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionDuplicateResponseID(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + actual.IntermediateData.ToolResponses = append(actual.IntermediateData.ToolResponses, &genai.FunctionResponse{ + ID: "call-1", + Name: "tool", + Response: map[string]any{"r": 2}, + }) + err := New().Match(actual, makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + })) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionCustomCompare(t *testing.T) { + var called bool + criterion := &ToolTrajectoryCriterion{ + Compare: func(actual, expected *evalset.Invocation) error { + called = true + return nil + }, + } + err := criterion.Match(&evalset.Invocation{}, &evalset.Invocation{}) + assert.NoError(t, err) + assert.True(t, called) +} + +func TestToolTrajectoryCriterionExpectedResponseCountMismatch(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool", Args: map[string]any{"a": 1}}, + }, + ToolResponses: []*genai.FunctionResponse{}, + }, + } 
+ err := New().Match(actual, expected) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionToolUsesCountMismatch(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + {id: "call-2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, + }) + err := New().Match(actual, expected) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionZeroTools(t *testing.T) { + actual := &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}} + expected := &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}} + err := New().Match(actual, expected) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionExpectedInvalidID(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "", Name: "tool", Args: map[string]any{"a": 1}}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool", Response: map[string]any{"r": 1}}, + }, + }, + } + err := New().Match(actual, expected) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionStrategyMismatch(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool-A", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool-B", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + strategy := &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, + } + criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{"tool-A": strategy})) 
+ err := criterion.Match(actual, expected) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionDuplicateToolUseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "dup", Name: "tool"}, + {ID: "dup", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "dup", Name: "tool"}, + {ID: "dup2", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "dup", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + {id: "dup2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, + }) + err := New().Match(actual, expected) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionDuplicateToolResponseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + {ID: "call-2", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool"}, + {ID: "call-1", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + {id: "call-2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, + }) + err := New().Match(actual, expected) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionMissingResponseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "other", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + err := New().Match(actual, expected) + assert.Error(t, err) +} + +func TestToolComparerOrderInsensitiveMarshalError(t *testing.T) 
{ + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool", Args: map[string]any{"bad": make(chan int)}}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool", Response: map[string]any{"r": 1}}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{}, response: map[string]any{"r": 1}}, + }) + err := New(WithOrderInsensitive(true)).Match(actual, expected) + assert.Error(t, err) +} + +func TestToolComparerOrderInsensitiveMarshalResponseError(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool", Args: map[string]any{"a": 1}}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool", Response: map[string]any{"bad": make(chan int)}}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + err := New(WithOrderInsensitive(true)).Match(actual, expected) + assert.Error(t, err) +} + +func TestToolComparerLessThanBranches(t *testing.T) { + left := &toolComparer{name: "a", argsOrder: "1", responseOrder: "1"} + right := &toolComparer{name: "b", argsOrder: "0", responseOrder: "0"} + assert.True(t, left.lessThan(right)) + + left2 := &toolComparer{name: "a", argsOrder: "2", responseOrder: "1"} + right2 := &toolComparer{name: "a", argsOrder: "3", responseOrder: "0"} + assert.True(t, left2.lessThan(right2)) + + left3 := &toolComparer{name: "a", argsOrder: "1", responseOrder: "2"} + right3 := &toolComparer{name: "a", argsOrder: "1", responseOrder: "3"} + assert.True(t, left3.lessThan(right3)) +} + +func TestToolTrajectoryStrategyArgumentAndResponseMismatch(t *testing.T) { + strategy := &ToolTrajectoryStrategy{ + Arguments: &maptext.MapTextCriterion{}, + Response: &maptext.MapTextCriterion{}, 
+ } + actual := &toolComparer{ + name: "tool", + args: map[string]any{"a": 1}, + response: map[string]any{"r": 1}, + } + expected := &toolComparer{ + name: "tool", + args: map[string]any{"a": 2}, + response: map[string]any{"r": 3}, + } + err := strategy.Match(actual, expected) + assert.Error(t, err) +} + +func TestGetToolComparerNilInputs(t *testing.T) { + _, err := getToolComparer(nil, &genai.FunctionResponse{}, false) + assert.Error(t, err) + _, err = getToolComparer(&genai.FunctionCall{}, nil, false) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionMissingResponseSet(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool"}, + }, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "other", Name: "tool"}, + }, + }, + } + err := New().Match(actual, expected) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionFallbackDefault(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + criterion := &ToolTrajectoryCriterion{ + DefaultStrategy: nil, + ToolStrategy: nil, + } + err := criterion.Match(actual, expected) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionFallbackDefaultStrategy(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: 
map[string]any{"r": 1}}, + }) + criterion := &ToolTrajectoryCriterion{ + DefaultStrategy: nil, + ToolStrategy: nil, + } + err := criterion.Match(actual, expected) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionEmptyToolResponseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{}, response: map[string]any{}}, + }) + err := New().Match(actual, expected) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionStrategyLookupByExpectedName(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "unknown", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "custom", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + customStrategy := &ToolTrajectoryStrategy{} + criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{ + "custom": customStrategy, + })) + err := criterion.Match(actual, expected) + assert.NoError(t, err) +} + +func TestToolTrajectoryStrategyResponseMismatchOnly(t *testing.T) { + strategy := &ToolTrajectoryStrategy{ + Arguments: &maptext.MapTextCriterion{}, + Response: &maptext.MapTextCriterion{}, + } + actual := &toolComparer{ + name: "tool", + args: map[string]any{"a": 1}, + response: map[string]any{"r": 1}, + } + expected := &toolComparer{ + name: "tool", + args: map[string]any{"a": 1}, + response: map[string]any{"r": 2}, + } + err := strategy.Match(actual, expected) + assert.Error(t, err) +} + +func TestToolComparerLessThanEqual(t *testing.T) { + left := &toolComparer{name: "same", argsOrder: "1", responseOrder: "1"} + right := &toolComparer{name: "same", argsOrder: "1", responseOrder: "1"} + assert.False(t, 
left.lessThan(right)) +} diff --git a/evaluation/metric/metric.go b/evaluation/metric/metric.go index e8fb4beb2..ca82bbc3a 100644 --- a/evaluation/metric/metric.go +++ b/evaluation/metric/metric.go @@ -10,15 +10,18 @@ // Package metric provides evaluation metrics. package metric -import "context" +import ( + "context" + + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" +) // EvalMetric represents a metric used to evaluate a particular aspect of an eval case. // It mirrors the schema used by ADK Web, with field names in camel to align with the JSON format. type EvalMetric struct { - // MetricName identifies the metric. - MetricName string `json:"metricName,omitempty"` - // Threshold value for this metric. - Threshold float64 `json:"threshold,omitempty"` + MetricName string `json:"metricName,omitempty"` // MetricName identifies the metric. + Threshold float64 `json:"threshold,omitempty"` // Threshold value for this metric. + Criterion *criterion.Criterion `json:"criterion,omitempty"` // Evaluation criterion used by the metric. } // Manager defines the interface for managing evaluation metrics. diff --git a/evaluation/service/internal/inference/inference.go b/evaluation/service/internal/inference/inference.go index 9b12fd833..af17773d2 100644 --- a/evaluation/service/internal/inference/inference.go +++ b/evaluation/service/internal/inference/inference.go @@ -72,11 +72,12 @@ func inferenceInvocation( if err != nil { return nil, fmt.Errorf("runner run: %w", err) } - // Capture the invocation ID, final response, and tool uses. + // Capture the invocation ID, final response, tool uses, and tool responses. var ( invocationID string finalResponse *genai.Content toolUses []*genai.FunctionCall + toolResponses []*genai.FunctionResponse ) for event := range events { if event == nil { @@ -105,6 +106,14 @@ func inferenceInvocation( } toolUses = append(toolUses, uses...) } + // Capture tool call responses. 
+ if event.IsToolResultResponse() { + responses, err := convertToolResultResponse(event) + if err != nil { + return nil, fmt.Errorf("convert tool result response: %w", err) + } + toolResponses = append(toolResponses, responses...) + } } // Convert the final response to evalset content. return &evalset.Invocation{ @@ -112,7 +121,8 @@ func inferenceInvocation( UserContent: invocation.UserContent, FinalResponse: finalResponse, IntermediateData: &evalset.IntermediateData{ - ToolUses: toolUses, + ToolUses: toolUses, + ToolResponses: toolResponses, }, }, nil } @@ -132,6 +142,27 @@ func convertToolCallResponse(event *event.Event) ([]*genai.FunctionCall, error) return toolUses, nil } +// convertToolResultResponse converts the tool result response to function responses. +func convertToolResultResponse(event *event.Event) ([]*genai.FunctionResponse, error) { + toolResponses := []*genai.FunctionResponse{} + for _, choice := range event.Response.Choices { + if choice.Message.ToolID == "" { + continue + } + var response map[string]any + if err := json.Unmarshal([]byte(choice.Message.Content), &response); err != nil { + return nil, fmt.Errorf("unmarshal tool result response: %w", err) + } + toolResponse := &genai.FunctionResponse{ + ID: choice.Message.ToolID, + Name: choice.Message.ToolName, + Response: response, + } + toolResponses = append(toolResponses, toolResponse) + } + return toolResponses, nil +} + // convertContentToMessage transforms evalset input content into a model message. 
func convertContentToMessage(content *genai.Content) (*model.Message, error) { if content == nil { diff --git a/evaluation/service/internal/inference/inference_test.go b/evaluation/service/internal/inference/inference_test.go index 6ef0f9827..38b12387e 100644 --- a/evaluation/service/internal/inference/inference_test.go +++ b/evaluation/service/internal/inference/inference_test.go @@ -265,3 +265,54 @@ func TestConvertToolCallResponse(t *testing.T) { assert.Equal(t, "tool", result[0].Name) assert.Equal(t, float64(1), result[0].Args["count"]) } + +func TestConvertToolResultResponse(t *testing.T) { + ev := &event.Event{ + Response: &model.Response{ + Choices: []model.Choice{ + { + Message: model.Message{ + ToolID: "call-1", + ToolName: "tool", + Content: `{"result":42}`, + }, + }, + }, + }, + } + result, err := convertToolResultResponse(ev) + assert.NoError(t, err) + assert.Len(t, result, 1) + assert.Equal(t, "call-1", result[0].ID) + assert.Equal(t, "tool", result[0].Name) + assert.Equal(t, float64(42), result[0].Response["result"]) +} + +func TestConvertToolResultResponseSkipEmptyID(t *testing.T) { + ev := &event.Event{ + Response: &model.Response{ + Choices: []model.Choice{ + {Message: model.Message{Content: "{}", ToolID: ""}}, + {Message: model.Message{Content: `{"ok":true}`, ToolID: "id-1", ToolName: "t"}}, + }, + }, + } + result, err := convertToolResultResponse(ev) + assert.NoError(t, err) + assert.Len(t, result, 1) + assert.Equal(t, "id-1", result[0].ID) + assert.Equal(t, "t", result[0].Name) + assert.Equal(t, true, result[0].Response["ok"]) +} + +func TestConvertToolResultResponseInvalidJSON(t *testing.T) { + ev := &event.Event{ + Response: &model.Response{ + Choices: []model.Choice{ + {Message: model.Message{Content: "{", ToolID: "bad"}}, + }, + }, + } + _, err := convertToolResultResponse(ev) + assert.Error(t, err) +} From 61770a40760a1bc2802e60403dbaee297c8ed93b Mon Sep 17 00:00:00 2001 From: hackerli Date: Sat, 22 Nov 2025 21:23:16 +0800 Subject: [PATCH 
04/14] refactor --- .../criterion/internal/maptext/maptext.go | 34 +++ .../{ => internal}/maptext/maptext_test.go | 23 +- .../metric/criterion/internal/text/text.go | 46 ++++ .../criterion/internal/text/text_test.go | 85 +++++++ .../internal/tooltrajectory/tooltrajectory.go | 233 ++++++++++++++++++ .../tooltrajectory/tooltrajectory_test.go | 143 ++++++----- .../metric/criterion/maptext/maptext.go | 28 --- evaluation/metric/criterion/options.go | 3 +- evaluation/metric/criterion/text/text.go | 43 ---- evaluation/metric/criterion/text/text_test.go | 84 ------- .../criterion/tooltrajectory/options_test.go | 5 +- .../tooltrajectory/tooltrajectory.go | 202 --------------- 12 files changed, 494 insertions(+), 435 deletions(-) create mode 100644 evaluation/metric/criterion/internal/maptext/maptext.go rename evaluation/metric/criterion/{ => internal}/maptext/maptext_test.go (59%) create mode 100644 evaluation/metric/criterion/internal/text/text.go create mode 100644 evaluation/metric/criterion/internal/text/text_test.go create mode 100644 evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go rename evaluation/metric/criterion/{ => internal}/tooltrajectory/tooltrajectory_test.go (77%) delete mode 100644 evaluation/metric/criterion/text/text_test.go diff --git a/evaluation/metric/criterion/internal/maptext/maptext.go b/evaluation/metric/criterion/internal/maptext/maptext.go new file mode 100644 index 000000000..9ce0a6a13 --- /dev/null +++ b/evaluation/metric/criterion/internal/maptext/maptext.go @@ -0,0 +1,34 @@ +package maptext + +import ( + "encoding/json" + "fmt" + "reflect" + + itext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/text" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" +) + +// Match compares two maps using custom logic, text-based matching, or deep equality. 
+func Match(m *maptext.MapTextCriterion, actual, expected map[string]any) error { + if m.Compare != nil { + return m.Compare(actual, expected) + } + if m.TextCriterion != nil { + // Go map iteration order is random, but json.Marshal emits map keys in sorted order, + // so the marshaled strings are deterministic and can be compared directly. + actualData, err := json.Marshal(actual) + if err != nil { + return fmt.Errorf("marshal actual: %w", err) + } + expectedData, err := json.Marshal(expected) + if err != nil { + return fmt.Errorf("marshal expected: %w", err) + } + return itext.Match(m.TextCriterion, string(actualData), string(expectedData)) + } + if reflect.DeepEqual(actual, expected) { + return nil + } + return fmt.Errorf("actual %v and expected %v do not match", actual, expected) +} diff --git a/evaluation/metric/criterion/maptext/maptext_test.go b/evaluation/metric/criterion/internal/maptext/maptext_test.go similarity index 59% rename from evaluation/metric/criterion/maptext/maptext_test.go rename to evaluation/metric/criterion/internal/maptext/maptext_test.go index bd51156e1..017576ded 100644 --- a/evaluation/metric/criterion/maptext/maptext_test.go +++ b/evaluation/metric/criterion/internal/maptext/maptext_test.go @@ -13,53 +13,54 @@ import ( "testing" "github.com/stretchr/testify/assert" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" ) func TestMapTextCriterionCompareOverride(t *testing.T) { called := false - criterion := &MapTextCriterion{ + criterion := &maptext.MapTextCriterion{ Compare: func(actual, expected map[string]any) error { called = true return nil }, } - err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "v"}) + err := Match(criterion, map[string]any{"k": "v"}, map[string]any{"k": "v"}) assert.NoError(t, err) assert.True(t, called) } func TestMapTextCriterionTextMatch(t *testing.T) { - criterion := &MapTextCriterion{ + criterion :=
&maptext.MapTextCriterion{ TextCriterion: &text.TextCriterion{ CaseInsensitive: true, MatchStrategy: text.TextMatchStrategyExact, }, } - err := criterion.Match(map[string]any{"msg": "Hello"}, map[string]any{"msg": "hello"}) + err := Match(criterion, map[string]any{"msg": "Hello"}, map[string]any{"msg": "hello"}) assert.NoError(t, err) } func TestMapTextCriterionDeepEqualMismatch(t *testing.T) { - criterion := &MapTextCriterion{} - err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "diff"}) + criterion := &maptext.MapTextCriterion{} + err := Match(criterion, map[string]any{"k": "v"}, map[string]any{"k": "diff"}) assert.Error(t, err) } func TestMapTextCriterionMarshalErrors(t *testing.T) { - criterion := &MapTextCriterion{ + criterion := &maptext.MapTextCriterion{ TextCriterion: &text.TextCriterion{}, } // Actual marshal error. - actualErr := criterion.Match(map[string]any{"bad": make(chan int)}, map[string]any{"k": "v"}) + actualErr := Match(criterion, map[string]any{"bad": make(chan int)}, map[string]any{"k": "v"}) assert.Error(t, actualErr) // Expected marshal error. 
- expectedErr := criterion.Match(map[string]any{"k": "v"}, map[string]any{"bad": make(chan int)}) + expectedErr := Match(criterion, map[string]any{"k": "v"}, map[string]any{"bad": make(chan int)}) assert.Error(t, expectedErr) } func TestMapTextCriterionDeepEqualSuccess(t *testing.T) { - criterion := &MapTextCriterion{} - err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "v"}) + criterion := &maptext.MapTextCriterion{} + err := Match(criterion, map[string]any{"k": "v"}, map[string]any{"k": "v"}) assert.NoError(t, err) } diff --git a/evaluation/metric/criterion/internal/text/text.go b/evaluation/metric/criterion/internal/text/text.go new file mode 100644 index 000000000..a53ccbfaa --- /dev/null +++ b/evaluation/metric/criterion/internal/text/text.go @@ -0,0 +1,46 @@ +package text + +import ( + "fmt" + "regexp" + "strings" + + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +// Match compares source and target using the configured strategy. +func Match(t *text.TextCriterion, source, target string) error { + if t.Compare != nil { + return t.Compare(source, target) + } + if t.Ignore { + return nil + } + if t.CaseInsensitive { + source = strings.ToLower(source) + target = strings.ToLower(target) + } + switch t.MatchStrategy { + case text.TextMatchStrategyExact: + if source == target { + return nil + } + return fmt.Errorf("source %s and target %s do not match", source, target) + case text.TextMatchStrategyContains: + if strings.Contains(source, target) { + return nil + } + return fmt.Errorf("source %s does not contain target %s", source, target) + case text.TextMatchStrategyRegex: + re, err := regexp.Compile(target) + if err != nil { + return fmt.Errorf("invalid regex %s: %w", target, err) + } + if re.MatchString(source) { + return nil + } + return fmt.Errorf("source %s does not match regex %s", source, target) + default: + return fmt.Errorf("invalid match strategy %s", t.MatchStrategy) + } +} diff --git 
a/evaluation/metric/criterion/internal/text/text_test.go b/evaluation/metric/criterion/internal/text/text_test.go new file mode 100644 index 000000000..f78b1edf7 --- /dev/null +++ b/evaluation/metric/criterion/internal/text/text_test.go @@ -0,0 +1,85 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package text + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +func TestTextCriterionMatchStrategies(t *testing.T) { + criterion := &text.TextCriterion{ + CaseInsensitive: true, + MatchStrategy: text.TextMatchStrategyContains, + } + err := Match(criterion, "Hello World", "hello") + assert.NoError(t, err) +} + +func TestTextCriterionIgnore(t *testing.T) { + criterion := &text.TextCriterion{ + Ignore: true, + } + err := Match(criterion, "anything", "value") + assert.NoError(t, err) +} + +func TestTextCriterionRegexInvalid(t *testing.T) { + criterion := &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyRegex, + } + err := Match(criterion, "source", "[invalid(") + assert.Error(t, err) +} + +func TestTextCriterionUnknownStrategy(t *testing.T) { + criterion := &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategy("unknown"), + } + err := Match(criterion, "a", "b") + assert.Error(t, err) +} + +func TestTextCriterionAllBranches(t *testing.T) { + customCalled := false + custom := &text.TextCriterion{ + Compare: func(actual, expected string) error { + customCalled = true + return nil + }, + } + err := Match(custom, "x", "y") + assert.NoError(t, err) + assert.True(t, customCalled) + + exact := &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + } + err = Match(exact, "same", "same") + assert.NoError(t, err) + err = Match(exact, "same", "diff") + assert.Error(t, err) + + 
contains := &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyContains, + } + err = Match(contains, "hello", "missing") + assert.Error(t, err) + + regex := &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyRegex, + } + err = Match(regex, "abc123", "abc[0-9]+") + assert.NoError(t, err) + err = Match(regex, "xyz", "abc[0-9]+") + assert.Error(t, err) +} diff --git a/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go b/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go new file mode 100644 index 000000000..d039c276d --- /dev/null +++ b/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go @@ -0,0 +1,233 @@ +package tooltrajectory + +import ( + "encoding/json" + "errors" + "fmt" + "reflect" + "sort" + + "google.golang.org/genai" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + imaptext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/maptext" + itext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/text" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +// Match compares actual and expected invocations according to tool trajectory rules. +func Match(t *tooltrajectory.ToolTrajectoryCriterion, actual, expected *evalset.Invocation) error { + if t.Compare != nil { + return t.Compare(actual, expected) + } + if actual == nil || expected == nil { + return fmt.Errorf("actual or expected invocation is nil") + } + if actual.IntermediateData == nil || expected.IntermediateData == nil { + return fmt.Errorf("actual or expected intermediate data is nil") + } + // Ensure one-to-one mapping between tool calls and responses on actual invocation. 
+ if len(actual.IntermediateData.ToolUses) != len(actual.IntermediateData.ToolResponses) { + return fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", + len(actual.IntermediateData.ToolUses), len(actual.IntermediateData.ToolResponses)) + } + // Ensure one-to-one mapping between tool calls and responses on expected invocation. + if len(expected.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolResponses) { + return fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", + len(expected.IntermediateData.ToolUses), len(expected.IntermediateData.ToolResponses)) + } + // Ensure the same number of tool uses before detailed comparison. + if len(actual.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolUses) { + return fmt.Errorf("tool uses count mismatch: %d != %d", + len(actual.IntermediateData.ToolUses), len(expected.IntermediateData.ToolUses)) + } + if len(actual.IntermediateData.ToolUses) == 0 { + return nil + } + actualTools, err := getToolComparers(t, + actual.IntermediateData.ToolUses, + actual.IntermediateData.ToolResponses, + t.OrderInsensitive, + ) + if err != nil { + return fmt.Errorf("get actual tools: %w", err) + } + expectedTools, err := getToolComparers(t, + expected.IntermediateData.ToolUses, + expected.IntermediateData.ToolResponses, + t.OrderInsensitive, + ) + if err != nil { + return fmt.Errorf("get expected tools: %w", err) + } + if t.OrderInsensitive { + sort.Slice(actualTools, func(i, j int) bool { + return actualTools[i].lessThan(actualTools[j]) + }) + sort.Slice(expectedTools, func(i, j int) bool { + return expectedTools[i].lessThan(expectedTools[j]) + }) + } + for i := range len(actualTools) { + strategy := getStrategy(t, actualTools[i], expectedTools[i]) + if err := MatchStrategy(strategy, actualTools[i], expectedTools[i]); err != nil { + return fmt.Errorf("tool %s mismatch: %w", actualTools[i].name, err) + } + } + return nil +} + +// getToolComparers aligns tool uses with their responses and 
builds toolComparer. +func getToolComparers(t *tooltrajectory.ToolTrajectoryCriterion, toolUses []*genai.FunctionCall, + toolResponses []*genai.FunctionResponse, orderInsensitive bool) ([]*toolComparer, error) { + // toolCallIDs ensures every tool use can be matched by ID. + // Map from tool call id to index. + toolCallIDs := make(map[string]int) + for i := range len(toolUses) { + if toolUses[i].ID == "" { + return nil, fmt.Errorf("tool use id is empty") + } + if _, ok := toolCallIDs[toolUses[i].ID]; ok { + return nil, fmt.Errorf("tool use id %s is duplicated", toolUses[i].ID) + } + toolCallIDs[toolUses[i].ID] = i + } + // toolResponseIDs ensures every tool response can be matched by ID. + // Map from tool response id to index. + toolResponseIDs := make(map[string]int) + for i := range len(toolResponses) { + if toolResponses[i].ID == "" { + return nil, fmt.Errorf("tool response id is empty") + } + if _, ok := toolResponseIDs[toolResponses[i].ID]; ok { + return nil, fmt.Errorf("tool response id %s is duplicated", toolResponses[i].ID) + } + toolResponseIDs[toolResponses[i].ID] = i + } + for toolID := range toolCallIDs { + if _, ok := toolResponseIDs[toolID]; !ok { + return nil, fmt.Errorf("tool id %s is missing response", toolID) + } + } + toolComparers := make([]*toolComparer, 0, len(toolUses)) + for i := range len(toolUses) { + toolComparer, err := getToolComparer( + toolUses[i], + toolResponses[toolResponseIDs[toolUses[i].ID]], + orderInsensitive, + ) + if err != nil { + return nil, fmt.Errorf("get tool comparer: %w", err) + } + toolComparers = append(toolComparers, toolComparer) + } + return toolComparers, nil +} + +// getStrategy picks the comparison strategy for a specific tool pair. 
+func getStrategy(t *tooltrajectory.ToolTrajectoryCriterion, actualTool, expectedTool *toolComparer) *tooltrajectory.ToolTrajectoryStrategy { + if t.ToolStrategy != nil { + strategy, ok := t.ToolStrategy[actualTool.name] + if ok { + return strategy + } + strategy, ok = t.ToolStrategy[expectedTool.name] + if ok { + return strategy + } + } + if t.DefaultStrategy != nil { + return t.DefaultStrategy + } + return &tooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &maptext.MapTextCriterion{ + Compare: func(actual, expected map[string]any) error { + if !reflect.DeepEqual(actual, expected) { + return fmt.Errorf("actual %v and expected %v do not match", actual, expected) + } + return nil + }, + }, + Response: &maptext.MapTextCriterion{ + Compare: func(actual, expected map[string]any) error { + if !reflect.DeepEqual(actual, expected) { + return fmt.Errorf("actual %v and expected %v do not match", actual, expected) + } + return nil + }, + }, + } +} + +// MatchStrategy validates a single tool call pair using the configured name, arguments, and response criteria. +func MatchStrategy(t *tooltrajectory.ToolTrajectoryStrategy, actual, expected *toolComparer) error { + if t.Name != nil { + if err := itext.Match(t.Name, actual.name, expected.name); err != nil { + return fmt.Errorf("name mismatch: %w", err) + } + } + if t.Arguments != nil { + if err := imaptext.Match(t.Arguments, actual.args, expected.args); err != nil { + return fmt.Errorf("arguments mismatch: %w", err) + } + } + if t.Response != nil { + if err := imaptext.Match(t.Response, actual.response, expected.response); err != nil { + return fmt.Errorf("response mismatch: %w", err) + } + } + return nil +} + +// toolComparer normalizes tool call and response data for comparison. +type toolComparer struct { + name string // name holds the tool name. + args map[string]any // args holds parsed tool arguments. + response map[string]any // response holds parsed tool response payload.
+ argsOrder string // argsOrder caches JSON for order-insensitive compare. + responseOrder string // responseOrder caches JSON for order-insensitive compare. +} + +// lessThan provides deterministic ordering when order-insensitive compares require sorting. +func (t *toolComparer) lessThan(other *toolComparer) bool { + if t.name != other.name { + return t.name < other.name + } + if t.argsOrder != other.argsOrder { + return t.argsOrder < other.argsOrder + } + if t.responseOrder != other.responseOrder { + return t.responseOrder < other.responseOrder + } + return false +} + +// getToolComparer pairs a tool use with its response and precomputes ordering hints. +func getToolComparer(toolUse *genai.FunctionCall, toolResponse *genai.FunctionResponse, + orderInsensitive bool) (*toolComparer, error) { + if toolUse == nil || toolResponse == nil { + return nil, errors.New("tool use or tool response is nil") + } + tool := &toolComparer{ + name: toolUse.Name, + args: toolUse.Args, + response: toolResponse.Response, + } + if orderInsensitive { + args, err := json.Marshal(toolUse.Args) + if err != nil { + return nil, fmt.Errorf("marshal arguments: %w", err) + } + response, err := json.Marshal(toolResponse.Response) + if err != nil { + return nil, fmt.Errorf("marshal response: %w", err) + } + tool.argsOrder = string(args) + tool.responseOrder = string(response) + } + return tool, nil +} diff --git a/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go b/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory_test.go similarity index 77% rename from evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go rename to evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory_test.go index 76b4856e7..7525a20a1 100644 --- a/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go +++ b/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory_test.go @@ -15,8 +15,11 @@ import ( "github.com/stretchr/testify/assert" 
"google.golang.org/genai" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + imaptext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/maptext" + itext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/text" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" ) func TestToolTrajectoryCriterionMatchOrderInsensitive(t *testing.T) { @@ -33,8 +36,10 @@ func TestToolTrajectoryCriterionMatchOrderInsensitive(t *testing.T) { }, ) - criterion := New(WithOrderInsensitive(true)) - err := criterion.Match(actual, expected) + criterion := &tooltrajectory.ToolTrajectoryCriterion{ + OrderInsensitive: true, + } + err := Match(criterion, actual, expected) assert.NoError(t, err) } @@ -57,8 +62,8 @@ func TestToolTrajectoryCriterionMissingResponse(t *testing.T) { }, }, } - criterion := New() - err := criterion.Match(actual, expected) + criterion := &tooltrajectory.ToolTrajectoryCriterion{} + err := Match(criterion, actual, expected) assert.Error(t, err) } @@ -73,13 +78,15 @@ func TestToolTrajectoryCriterionCustomStrategy(t *testing.T) { {id: "call-1", name: "custom", args: map[string]any{"k": "v"}, response: map[string]any{"r": "x"}}, }, ) - customStrategy := &ToolTrajectoryStrategy{ + customStrategy := &tooltrajectory.ToolTrajectoryStrategy{ Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, } - criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{ - "custom": customStrategy, - })) - err := criterion.Match(actual, expected) + criterion := &tooltrajectory.ToolTrajectoryCriterion{ + ToolStrategy: map[string]*tooltrajectory.ToolTrajectoryStrategy{ + "custom": customStrategy, + }, + } + err := Match(criterion, actual, expected) assert.NoError(t, err) } @@ -134,20 +141,20 @@ func TestToolTrajectoryCriterionIDMismatch(t *testing.T) { }, }, } - 
criterion := New() - err := criterion.Match(actual, expected) + criterion := tooltrajectory.New() + err := Match(criterion, actual, expected) assert.Error(t, err) } func TestToolTrajectoryCriterionNilInvocation(t *testing.T) { - criterion := New() - err := criterion.Match(nil, makeInvocation(nil)) + criterion := tooltrajectory.New() + err := Match(criterion, nil, makeInvocation(nil)) assert.Error(t, err) } func TestToolTrajectoryCriterionNilIntermediate(t *testing.T) { - criterion := New() - err := criterion.Match(&evalset.Invocation{}, &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}}) + criterion := tooltrajectory.New() + err := Match(criterion, &evalset.Invocation{}, &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}}) assert.Error(t, err) } @@ -172,7 +179,7 @@ func TestToolTrajectoryCriterionEmptyToolUseID(t *testing.T) { }, }, } - err := New().Match(actual, expected) + err := Match(tooltrajectory.New(), actual, expected) assert.Error(t, err) } @@ -185,7 +192,7 @@ func TestToolTrajectoryCriterionDuplicateResponseID(t *testing.T) { Name: "tool", Response: map[string]any{"r": 2}, }) - err := New().Match(actual, makeInvocation([]toolData{ + err := Match(tooltrajectory.New(), actual, makeInvocation([]toolData{ {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, })) assert.Error(t, err) @@ -193,13 +200,13 @@ func TestToolTrajectoryCriterionDuplicateResponseID(t *testing.T) { func TestToolTrajectoryCriterionCustomCompare(t *testing.T) { var called bool - criterion := &ToolTrajectoryCriterion{ + criterion := &tooltrajectory.ToolTrajectoryCriterion{ Compare: func(actual, expected *evalset.Invocation) error { called = true return nil }, } - err := criterion.Match(&evalset.Invocation{}, &evalset.Invocation{}) + err := Match(criterion, &evalset.Invocation{}, &evalset.Invocation{}) assert.NoError(t, err) assert.True(t, called) } @@ -216,7 +223,7 @@ func 
TestToolTrajectoryCriterionExpectedResponseCountMismatch(t *testing.T) { ToolResponses: []*genai.FunctionResponse{}, }, } - err := New().Match(actual, expected) + err := Match(tooltrajectory.New(), actual, expected) assert.Error(t, err) } @@ -228,14 +235,14 @@ func TestToolTrajectoryCriterionToolUsesCountMismatch(t *testing.T) { {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, {id: "call-2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, }) - err := New().Match(actual, expected) + err := Match(tooltrajectory.New(), actual, expected) assert.Error(t, err) } func TestToolTrajectoryCriterionZeroTools(t *testing.T) { actual := &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}} expected := &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}} - err := New().Match(actual, expected) + err := Match(tooltrajectory.New(), actual, expected) assert.NoError(t, err) } @@ -253,7 +260,7 @@ func TestToolTrajectoryCriterionExpectedInvalidID(t *testing.T) { }, }, } - err := New().Match(actual, expected) + err := Match(tooltrajectory.New(), actual, expected) assert.Error(t, err) } @@ -264,11 +271,11 @@ func TestToolTrajectoryCriterionStrategyMismatch(t *testing.T) { expected := makeInvocation([]toolData{ {id: "call-1", name: "tool-B", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, }) - strategy := &ToolTrajectoryStrategy{ + strategy := &tooltrajectory.ToolTrajectoryStrategy{ Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, } - criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{"tool-A": strategy})) - err := criterion.Match(actual, expected) + criterion := tooltrajectory.New(tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{"tool-A": strategy})) + err := Match(criterion, actual, expected) assert.Error(t, err) } @@ -289,7 +296,7 @@ func TestToolTrajectoryCriterionDuplicateToolUseID(t *testing.T) { {id: "dup", 
name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, {id: "dup2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, }) - err := New().Match(actual, expected) + err := Match(tooltrajectory.New(), actual, expected) assert.Error(t, err) } @@ -310,7 +317,7 @@ func TestToolTrajectoryCriterionDuplicateToolResponseID(t *testing.T) { {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, {id: "call-2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, }) - err := New().Match(actual, expected) + err := Match(tooltrajectory.New(), actual, expected) assert.Error(t, err) } @@ -328,7 +335,7 @@ func TestToolTrajectoryCriterionMissingResponseID(t *testing.T) { expected := makeInvocation([]toolData{ {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, }) - err := New().Match(actual, expected) + err := Match(tooltrajectory.New(), actual, expected) assert.Error(t, err) } @@ -346,7 +353,7 @@ func TestToolComparerOrderInsensitiveMarshalError(t *testing.T) { expected := makeInvocation([]toolData{ {id: "call-1", name: "tool", args: map[string]any{}, response: map[string]any{"r": 1}}, }) - err := New(WithOrderInsensitive(true)).Match(actual, expected) + err := Match(tooltrajectory.New(tooltrajectory.WithOrderInsensitive(true)), actual, expected) assert.Error(t, err) } @@ -364,7 +371,7 @@ func TestToolComparerOrderInsensitiveMarshalResponseError(t *testing.T) { expected := makeInvocation([]toolData{ {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, }) - err := New(WithOrderInsensitive(true)).Match(actual, expected) + err := Match(tooltrajectory.New(tooltrajectory.WithOrderInsensitive(true)), actual, expected) assert.Error(t, err) } @@ -383,21 +390,20 @@ func TestToolComparerLessThanBranches(t *testing.T) { } func TestToolTrajectoryStrategyArgumentAndResponseMismatch(t *testing.T) { - 
strategy := &ToolTrajectoryStrategy{ + strategy := &tooltrajectory.ToolTrajectoryStrategy{ Arguments: &maptext.MapTextCriterion{}, Response: &maptext.MapTextCriterion{}, } - actual := &toolComparer{ - name: "tool", - args: map[string]any{"a": 1}, - response: map[string]any{"r": 1}, - } - expected := &toolComparer{ - name: "tool", - args: map[string]any{"a": 2}, - response: map[string]any{"r": 3}, - } - err := strategy.Match(actual, expected) + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 3}}, + }) + criterion := tooltrajectory.New(tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{ + "tool": strategy, + })) + err := Match(criterion, actual, expected) assert.Error(t, err) } @@ -429,7 +435,7 @@ func TestToolTrajectoryCriterionMissingResponseSet(t *testing.T) { }, }, } - err := New().Match(actual, expected) + err := Match(tooltrajectory.New(), actual, expected) assert.Error(t, err) } @@ -440,11 +446,11 @@ func TestToolTrajectoryCriterionFallbackDefault(t *testing.T) { expected := makeInvocation([]toolData{ {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, }) - criterion := &ToolTrajectoryCriterion{ + criterion := &tooltrajectory.ToolTrajectoryCriterion{ DefaultStrategy: nil, ToolStrategy: nil, } - err := criterion.Match(actual, expected) + err := Match(criterion, actual, expected) assert.NoError(t, err) } @@ -455,11 +461,11 @@ func TestToolTrajectoryCriterionFallbackDefaultStrategy(t *testing.T) { expected := makeInvocation([]toolData{ {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, }) - criterion := &ToolTrajectoryCriterion{ + criterion := &tooltrajectory.ToolTrajectoryCriterion{ DefaultStrategy: nil, ToolStrategy: nil, } - err := 
criterion.Match(actual, expected) + err := Match(criterion, actual, expected) assert.NoError(t, err) } @@ -477,7 +483,7 @@ func TestToolTrajectoryCriterionEmptyToolResponseID(t *testing.T) { expected := makeInvocation([]toolData{ {id: "call-1", name: "tool", args: map[string]any{}, response: map[string]any{}}, }) - err := New().Match(actual, expected) + err := Match(tooltrajectory.New(), actual, expected) assert.Error(t, err) } @@ -488,30 +494,29 @@ func TestToolTrajectoryCriterionStrategyLookupByExpectedName(t *testing.T) { expected := makeInvocation([]toolData{ {id: "call-1", name: "custom", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, }) - customStrategy := &ToolTrajectoryStrategy{} - criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{ + customStrategy := &tooltrajectory.ToolTrajectoryStrategy{} + criterion := tooltrajectory.New(tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{ "custom": customStrategy, })) - err := criterion.Match(actual, expected) + err := Match(criterion, actual, expected) assert.NoError(t, err) } func TestToolTrajectoryStrategyResponseMismatchOnly(t *testing.T) { - strategy := &ToolTrajectoryStrategy{ + strategy := &tooltrajectory.ToolTrajectoryStrategy{ Arguments: &maptext.MapTextCriterion{}, Response: &maptext.MapTextCriterion{}, } - actual := &toolComparer{ - name: "tool", - args: map[string]any{"a": 1}, - response: map[string]any{"r": 1}, - } - expected := &toolComparer{ - name: "tool", - args: map[string]any{"a": 1}, - response: map[string]any{"r": 2}, - } - err := strategy.Match(actual, expected) + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, + }) + criterion := tooltrajectory.New(tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{ + 
"tool": strategy, + })) + err := Match(criterion, actual, expected) assert.Error(t, err) } @@ -520,3 +525,13 @@ func TestToolComparerLessThanEqual(t *testing.T) { right := &toolComparer{name: "same", argsOrder: "1", responseOrder: "1"} assert.False(t, left.lessThan(right)) } + +func TestInternalTextAndMapWrappers(t *testing.T) { + txt := &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact} + err := itext.Match(txt, "same", "same") + assert.NoError(t, err) + + crit := &maptext.MapTextCriterion{} + err = imaptext.Match(crit, map[string]any{"a": 1}, map[string]any{"a": 1}) + assert.NoError(t, err) +} diff --git a/evaluation/metric/criterion/maptext/maptext.go b/evaluation/metric/criterion/maptext/maptext.go index 7452876b1..76eab58b4 100644 --- a/evaluation/metric/criterion/maptext/maptext.go +++ b/evaluation/metric/criterion/maptext/maptext.go @@ -11,10 +11,6 @@ package maptext import ( - "encoding/json" - "fmt" - "reflect" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" ) @@ -25,27 +21,3 @@ type MapTextCriterion struct { // Compare overrides default comparison when provided. Compare func(actual, expected map[string]any) error `json:"-"` } - -// Match compares two maps using custom logic, text-based matching, or deep equality. -func (m *MapTextCriterion) Match(actual, expected map[string]any) error { - if m.Compare != nil { - return m.Compare(actual, expected) - } - if m.TextCriterion != nil { - // Although the keys in a map are unordered, json.Marshal guarantees the order of the keys, - // so we can directly use json.Marshal for comparison. 
- actualData, err := json.Marshal(actual) - if err != nil { - return fmt.Errorf("marshal actual: %w", err) - } - expectedData, err := json.Marshal(expected) - if err != nil { - return fmt.Errorf("marshal expected: %w", err) - } - return m.TextCriterion.Match(string(actualData), string(expectedData)) - } - if reflect.DeepEqual(actual, expected) { - return nil - } - return fmt.Errorf("actual %v and expected %v do not match", actual, expected) -} diff --git a/evaluation/metric/criterion/options.go b/evaluation/metric/criterion/options.go index 9a45e567d..bf59e1b69 100644 --- a/evaluation/metric/criterion/options.go +++ b/evaluation/metric/criterion/options.go @@ -13,7 +13,8 @@ import "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltraject // options aggregates configurable parts of Criterion. type options struct { - ToolTrajectory *tooltrajectory.ToolTrajectoryCriterion // ToolTrajectory sets the default tool trajectory criterion. + // ToolTrajectory sets the default tool trajectory criterion. + ToolTrajectory *tooltrajectory.ToolTrajectoryCriterion } // newOptions creates a Options with the provided options. diff --git a/evaluation/metric/criterion/text/text.go b/evaluation/metric/criterion/text/text.go index 21d49644c..8d306cbfc 100644 --- a/evaluation/metric/criterion/text/text.go +++ b/evaluation/metric/criterion/text/text.go @@ -10,12 +10,6 @@ // Package text defines text comparison criteria. package text -import ( - "fmt" - "regexp" - "strings" -) - // TextCriterion governs how two strings should be compared. type TextCriterion struct { // Ignore skips comparison when true. @@ -39,40 +33,3 @@ const ( // TextMatchStrategyRegex matches strings that match the regex. TextMatchStrategyRegex TextMatchStrategy = "regex" ) - -// Match compares source and target using the configured strategy. 
-func (t *TextCriterion) Match(source, target string) error { - if t.Compare != nil { - return t.Compare(source, target) - } - if t.Ignore { - return nil - } - if t.CaseInsensitive { - source = strings.ToLower(source) - target = strings.ToLower(target) - } - switch t.MatchStrategy { - case TextMatchStrategyExact: - if source == target { - return nil - } - return fmt.Errorf("source %s and target %s do not match", source, target) - case TextMatchStrategyContains: - if strings.Contains(source, target) { - return nil - } - return fmt.Errorf("source %s does not contain target %s", source, target) - case TextMatchStrategyRegex: - re, err := regexp.Compile(target) - if err != nil { - return fmt.Errorf("invalid regex %s: %w", target, err) - } - if re.MatchString(source) { - return nil - } - return fmt.Errorf("source %s does not match regex %s", source, target) - default: - return fmt.Errorf("invalid match strategy %s", t.MatchStrategy) - } -} diff --git a/evaluation/metric/criterion/text/text_test.go b/evaluation/metric/criterion/text/text_test.go deleted file mode 100644 index 66143c531..000000000 --- a/evaluation/metric/criterion/text/text_test.go +++ /dev/null @@ -1,84 +0,0 @@ -// -// Tencent is pleased to support the open source community by making trpc-agent-go available. -// -// Copyright (C) 2025 Tencent. All rights reserved. -// -// trpc-agent-go is licensed under the Apache License Version 2.0. 
-// -// - -package text - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestTextCriterionMatchStrategies(t *testing.T) { - criterion := &TextCriterion{ - CaseInsensitive: true, - MatchStrategy: TextMatchStrategyContains, - } - err := criterion.Match("Hello World", "hello") - assert.NoError(t, err) -} - -func TestTextCriterionIgnore(t *testing.T) { - criterion := &TextCriterion{ - Ignore: true, - } - err := criterion.Match("anything", "value") - assert.NoError(t, err) -} - -func TestTextCriterionRegexInvalid(t *testing.T) { - criterion := &TextCriterion{ - MatchStrategy: TextMatchStrategyRegex, - } - err := criterion.Match("source", "[invalid(") - assert.Error(t, err) -} - -func TestTextCriterionUnknownStrategy(t *testing.T) { - criterion := &TextCriterion{ - MatchStrategy: TextMatchStrategy("unknown"), - } - err := criterion.Match("a", "b") - assert.Error(t, err) -} - -func TestTextCriterionAllBranches(t *testing.T) { - customCalled := false - custom := &TextCriterion{ - Compare: func(actual, expected string) error { - customCalled = true - return nil - }, - } - err := custom.Match("x", "y") - assert.NoError(t, err) - assert.True(t, customCalled) - - exact := &TextCriterion{ - MatchStrategy: TextMatchStrategyExact, - } - err = exact.Match("same", "same") - assert.NoError(t, err) - err = exact.Match("same", "diff") - assert.Error(t, err) - - contains := &TextCriterion{ - MatchStrategy: TextMatchStrategyContains, - } - err = contains.Match("hello", "missing") - assert.Error(t, err) - - regex := &TextCriterion{ - MatchStrategy: TextMatchStrategyRegex, - } - err = regex.Match("abc123", "abc[0-9]+") - assert.NoError(t, err) - err = regex.Match("xyz", "abc[0-9]+") - assert.Error(t, err) -} diff --git a/evaluation/metric/criterion/tooltrajectory/options_test.go b/evaluation/metric/criterion/tooltrajectory/options_test.go index 005213674..ebee155d7 100644 --- a/evaluation/metric/criterion/tooltrajectory/options_test.go +++ 
b/evaluation/metric/criterion/tooltrajectory/options_test.go @@ -14,6 +14,7 @@ import ( "github.com/stretchr/testify/assert" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + imaptext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/maptext" ) func TestNewOptionsDefaults(t *testing.T) { @@ -57,13 +58,13 @@ func TestWithCompare(t *testing.T) { } func TestDefaultToolTrajectoryStrategyDeepEqualMismatch(t *testing.T) { - errArgs := defaultToolTrajectoryStrategy.Arguments.Match( + errArgs := imaptext.Match(defaultToolTrajectoryStrategy.Arguments, map[string]any{"a": 1}, map[string]any{"a": 2}, ) assert.Error(t, errArgs) - errResp := defaultToolTrajectoryStrategy.Response.Match( + errResp := imaptext.Match(defaultToolTrajectoryStrategy.Response, map[string]any{"r": 1}, map[string]any{"r": 3}, ) diff --git a/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go b/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go index ac077d13e..404d16362 100644 --- a/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go +++ b/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go @@ -11,12 +11,6 @@ package tooltrajectory import ( - "encoding/json" - "errors" - "fmt" - "sort" - - "google.golang.org/genai" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" @@ -45,205 +39,9 @@ type ToolTrajectoryCriterion struct { Compare func(actual, expected *evalset.Invocation) error `json:"-"` } -// Match compares actual and expected invocations according to tool trajectory rules. 
-func (t *ToolTrajectoryCriterion) Match(actual, expected *evalset.Invocation) error { - if t.Compare != nil { - return t.Compare(actual, expected) - } - if actual == nil || expected == nil { - return fmt.Errorf("actual or expected invocation is nil") - } - if actual.IntermediateData == nil || expected.IntermediateData == nil { - return fmt.Errorf("actual or expected intermediate data is nil") - } - // Ensure one-to-one mapping between tool calls and responses on actual invocation. - if len(actual.IntermediateData.ToolUses) != len(actual.IntermediateData.ToolResponses) { - return fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", - len(actual.IntermediateData.ToolUses), len(actual.IntermediateData.ToolResponses)) - } - // Ensure one-to-one mapping between tool calls and responses on expected invocation. - if len(expected.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolResponses) { - return fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", - len(expected.IntermediateData.ToolUses), len(expected.IntermediateData.ToolResponses)) - } - // Ensure the same number of tool uses before detailed comparison. 
- if len(actual.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolUses) { - return fmt.Errorf("tool uses count mismatch: %d != %d", - len(actual.IntermediateData.ToolUses), len(expected.IntermediateData.ToolUses)) - } - if len(actual.IntermediateData.ToolUses) == 0 { - return nil - } - actualTools, err := t.getToolComparers( - actual.IntermediateData.ToolUses, - actual.IntermediateData.ToolResponses, - t.OrderInsensitive, - ) - if err != nil { - return fmt.Errorf("get actual tools: %w", err) - } - expectedTools, err := t.getToolComparers( - expected.IntermediateData.ToolUses, - expected.IntermediateData.ToolResponses, - t.OrderInsensitive, - ) - if err != nil { - return fmt.Errorf("get expected tools: %w", err) - } - if t.OrderInsensitive { - sort.Slice(actualTools, func(i, j int) bool { - return actualTools[i].lessThan(actualTools[j]) - }) - sort.Slice(expectedTools, func(i, j int) bool { - return expectedTools[i].lessThan(expectedTools[j]) - }) - } - for i := range len(actualTools) { - strategy := t.getStrategy(actualTools[i], expectedTools[i]) - if err := strategy.Match(actualTools[i], expectedTools[i]); err != nil { - return fmt.Errorf("tool %s mismatch: %w", actualTools[i].name, err) - } - } - return nil -} - -// getToolComparers aligns tool uses with their responses and builds toolComparer. -func (t *ToolTrajectoryCriterion) getToolComparers(toolUses []*genai.FunctionCall, - toolResponses []*genai.FunctionResponse, orderInsensitive bool) ([]*toolComparer, error) { - // toolCallIDs ensures every tool use can be matched by ID. - // Map from tool call id to index. - toolCallIDs := make(map[string]int) - for i := range len(toolUses) { - if toolUses[i].ID == "" { - return nil, fmt.Errorf("tool use id is empty") - } - if _, ok := toolCallIDs[toolUses[i].ID]; ok { - return nil, fmt.Errorf("tool use id %s is duplicated", toolUses[i].ID) - } - toolCallIDs[toolUses[i].ID] = i - } - // toolResponseIDs ensures every tool response can be matched by ID. 
- // Map from tool response id to index. - toolResponseIDs := make(map[string]int) - for i := range len(toolResponses) { - if toolResponses[i].ID == "" { - return nil, fmt.Errorf("tool response id is empty") - } - if _, ok := toolResponseIDs[toolResponses[i].ID]; ok { - return nil, fmt.Errorf("tool response id %s is duplicated", toolResponses[i].ID) - } - toolResponseIDs[toolResponses[i].ID] = i - } - for toolID := range toolCallIDs { - if _, ok := toolResponseIDs[toolID]; !ok { - return nil, fmt.Errorf("tool id %s is missing response", toolID) - } - } - toolComparers := make([]*toolComparer, 0, len(toolUses)) - for i := range len(toolUses) { - toolComparer, err := getToolComparer( - toolUses[i], - toolResponses[toolResponseIDs[toolUses[i].ID]], - orderInsensitive, - ) - if err != nil { - return nil, fmt.Errorf("get tool comparer: %w", err) - } - toolComparers = append(toolComparers, toolComparer) - } - return toolComparers, nil -} - -// getStrategy picks the comparison strategy for a specific tool pair. -func (t *ToolTrajectoryCriterion) getStrategy(actualTool, expectedTool *toolComparer) *ToolTrajectoryStrategy { - if t.ToolStrategy != nil { - strategy, ok := t.ToolStrategy[actualTool.name] - if ok { - return strategy - } - strategy, ok = t.ToolStrategy[expectedTool.name] - if ok { - return strategy - } - } - if t.DefaultStrategy != nil { - return t.DefaultStrategy - } - return defaultToolTrajectoryStrategy -} - // ToolTrajectoryStrategy defines comparison strategies for a single tool. type ToolTrajectoryStrategy struct { Name *text.TextCriterion `json:"name,omitempty"` // Name compares tool names. Arguments *maptext.MapTextCriterion `json:"arguments,omitempty"` // Arguments compares tool call arguments. Response *maptext.MapTextCriterion `json:"response,omitempty"` // Response compares tool call responses. } - -// Match validates a single tool call pair using configured criteria. 
-func (t *ToolTrajectoryStrategy) Match(actual, expected *toolComparer) error { - if t.Name != nil { - if err := t.Name.Match(actual.name, expected.name); err != nil { - return fmt.Errorf("name mismatch: %w", err) - } - } - if t.Arguments != nil { - if err := t.Arguments.Match(actual.args, expected.args); err != nil { - return fmt.Errorf("arguments mismatch: %w", err) - } - } - if t.Response != nil { - if err := t.Response.Match(actual.response, expected.response); err != nil { - return fmt.Errorf("response mismatch: %w", err) - } - } - return nil -} - -// toolComparer normalizes tool call and response data for comparison. -type toolComparer struct { - name string // name holds the tool name. - args map[string]any // args holds parsed tool arguments. - response map[string]any // response holds parsed tool response payload. - argsOrder string // argsOrder caches JSON for order-insensitive compare. - responseOrder string // responseOrder caches JSON for order-insensitive compare. -} - -// lessThan provides deterministic ordering when order-insensitive compares require sorting. -func (t *toolComparer) lessThan(other *toolComparer) bool { - if t.name != other.name { - return t.name < other.name - } - if t.argsOrder != other.argsOrder { - return t.argsOrder < other.argsOrder - } - if t.responseOrder != other.responseOrder { - return t.responseOrder < other.responseOrder - } - return false -} - -// getToolComparer pairs a tool use with its response and precomputes ordering hints. 
-func getToolComparer(toolUse *genai.FunctionCall, toolResponse *genai.FunctionResponse, - orderInsensitive bool) (*toolComparer, error) { - if toolUse == nil || toolResponse == nil { - return nil, errors.New("tool use or tool response is nil") - } - tool := &toolComparer{ - name: toolUse.Name, - args: toolUse.Args, - response: toolResponse.Response, - } - if orderInsensitive { - args, err := json.Marshal(toolUse.Args) - if err != nil { - return nil, fmt.Errorf("marshal arguments: %w", err) - } - response, err := json.Marshal(toolResponse.Response) - if err != nil { - return nil, fmt.Errorf("marshal response: %w", err) - } - tool.argsOrder = string(args) - tool.responseOrder = string(response) - } - return tool, nil -} From b8fd383636e61da67ed0dd7bb830c25eb3bad990 Mon Sep 17 00:00:00 2001 From: hackerli Date: Mon, 24 Nov 2025 10:09:37 +0800 Subject: [PATCH 05/14] test --- evaluation/metric/criterion/criterion_test.go | 14 +++ .../metric/criterion/maptext/maptext_test.go | 37 ++++++ evaluation/metric/criterion/text/text_test.go | 35 ++++++ .../tooltrajectory/tooltrajectory_test.go | 109 ++++++++++++++++++ 4 files changed, 195 insertions(+) create mode 100644 evaluation/metric/criterion/maptext/maptext_test.go create mode 100644 evaluation/metric/criterion/text/text_test.go create mode 100644 evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go diff --git a/evaluation/metric/criterion/criterion_test.go b/evaluation/metric/criterion/criterion_test.go index ba29f2ad9..65480a4b4 100644 --- a/evaluation/metric/criterion/criterion_test.go +++ b/evaluation/metric/criterion/criterion_test.go @@ -10,6 +10,7 @@ package criterion import ( + "encoding/json" "testing" "github.com/stretchr/testify/assert" @@ -26,3 +27,16 @@ func TestCriterionWithToolTrajectory(t *testing.T) { c := New(WithToolTrajectory(custom)) assert.Equal(t, custom, c.ToolTrajectory) } + +func TestCriterionJSONRoundTrip(t *testing.T) { + c := &Criterion{ + ToolTrajectory: tooltrajectory.New(), + } 
+ data, err := json.Marshal(c) + assert.NoError(t, err) + + var decoded Criterion + err = json.Unmarshal(data, &decoded) + assert.NoError(t, err) + assert.NotNil(t, decoded.ToolTrajectory) +} diff --git a/evaluation/metric/criterion/maptext/maptext_test.go b/evaluation/metric/criterion/maptext/maptext_test.go new file mode 100644 index 000000000..f57c8f000 --- /dev/null +++ b/evaluation/metric/criterion/maptext/maptext_test.go @@ -0,0 +1,37 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +package maptext + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +func TestMapTextCriterionJSONRoundTrip(t *testing.T) { + criterion := &MapTextCriterion{ + TextCriterion: &text.TextCriterion{ + Ignore: true, + MatchStrategy: text.TextMatchStrategyExact, + }, + } + data, err := json.Marshal(criterion) + assert.NoError(t, err) + assert.JSONEq(t, `{"textCriterion":{"ignore":true,"matchStrategy":"exact"}}`, string(data)) + + var decoded MapTextCriterion + err = json.Unmarshal(data, &decoded) + assert.NoError(t, err) + assert.NotNil(t, decoded.TextCriterion) + assert.Equal(t, criterion.TextCriterion.Ignore, decoded.TextCriterion.Ignore) + assert.Equal(t, criterion.TextCriterion.MatchStrategy, decoded.TextCriterion.MatchStrategy) +} diff --git a/evaluation/metric/criterion/text/text_test.go b/evaluation/metric/criterion/text/text_test.go new file mode 100644 index 000000000..b99ddbe36 --- /dev/null +++ b/evaluation/metric/criterion/text/text_test.go @@ -0,0 +1,35 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package text + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestTextCriterionJSONRoundTrip(t *testing.T) { + criterion := &TextCriterion{ + Ignore: true, + CaseInsensitive: true, + MatchStrategy: TextMatchStrategyRegex, + } + data, err := json.Marshal(criterion) + assert.NoError(t, err) + assert.JSONEq(t, `{"ignore":true,"caseInsensitive":true,"matchStrategy":"regex"}`, string(data)) + + var decoded TextCriterion + err = json.Unmarshal(data, &decoded) + assert.NoError(t, err) + assert.Equal(t, criterion.Ignore, decoded.Ignore) + assert.Equal(t, criterion.CaseInsensitive, decoded.CaseInsensitive) + assert.Equal(t, criterion.MatchStrategy, decoded.MatchStrategy) +} diff --git a/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go b/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go new file mode 100644 index 000000000..87742a492 --- /dev/null +++ b/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go @@ -0,0 +1,109 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package tooltrajectory + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" +) + +func TestToolTrajectoryCriterionJSONRoundTrip(t *testing.T) { + criterion := &ToolTrajectoryCriterion{ + DefaultStrategy: &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + Ignore: true, + CaseInsensitive: true, + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &maptext.MapTextCriterion{}, + Response: &maptext.MapTextCriterion{}, + }, + ToolStrategy: map[string]*ToolTrajectoryStrategy{ + "foo": { + Name: &text.TextCriterion{ + Ignore: true, + CaseInsensitive: true, + MatchStrategy: text.TextMatchStrategyContains, + }, + }, + }, + OrderInsensitive: true, + } + data, err := json.Marshal(criterion) + assert.NoError(t, err) + assert.JSONEq(t, `{ + "defaultStrategy":{ + "name":{"ignore":true,"caseInsensitive":true,"matchStrategy":"exact"}, + "arguments":{}, + "response":{} + }, + "toolStrategy":{ + "foo":{"name":{"ignore":true,"caseInsensitive":true,"matchStrategy":"contains"}} + }, + "orderInsensitive":true + }`, string(data)) + var decoded ToolTrajectoryCriterion + err = json.Unmarshal(data, &decoded) + assert.NoError(t, err) + assert.True(t, decoded.OrderInsensitive) + assert.NotNil(t, decoded.DefaultStrategy) + assert.Equal(t, text.TextMatchStrategyExact, decoded.DefaultStrategy.Name.MatchStrategy) + assert.True(t, decoded.DefaultStrategy.Name.Ignore) + assert.True(t, decoded.DefaultStrategy.Name.CaseInsensitive) + assert.NotNil(t, decoded.ToolStrategy["foo"]) + assert.Equal(t, text.TextMatchStrategyContains, decoded.ToolStrategy["foo"].Name.MatchStrategy) + assert.True(t, decoded.ToolStrategy["foo"].Name.Ignore) + assert.True(t, decoded.ToolStrategy["foo"].Name.CaseInsensitive) +} + +func TestToolTrajectoryCriterionJSONOmitEmpty(t *testing.T) { + criterion := &ToolTrajectoryCriterion{} 
+ data, err := json.Marshal(criterion) + assert.NoError(t, err) + assert.JSONEq(t, `{}`, string(data)) +} + +func TestToolTrajectoryStrategyJSONRoundTrip(t *testing.T) { + strategy := &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + Ignore: true, + CaseInsensitive: true, + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &maptext.MapTextCriterion{ + TextCriterion: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyRegex}, + }, + Response: &maptext.MapTextCriterion{ + TextCriterion: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyContains}, + }, + } + data, err := json.Marshal(strategy) + assert.NoError(t, err) + assert.JSONEq(t, `{ + "name":{"ignore":true,"caseInsensitive":true,"matchStrategy":"exact"}, + "arguments":{"textCriterion":{"matchStrategy":"regex"}}, + "response":{"textCriterion":{"matchStrategy":"contains"}} + }`, string(data)) + + var decoded ToolTrajectoryStrategy + err = json.Unmarshal(data, &decoded) + assert.NoError(t, err) + assert.Equal(t, text.TextMatchStrategyExact, decoded.Name.MatchStrategy) + assert.True(t, decoded.Name.Ignore) + assert.True(t, decoded.Name.CaseInsensitive) + assert.NotNil(t, decoded.Arguments) + assert.NotNil(t, decoded.Response) + assert.Equal(t, text.TextMatchStrategyRegex, decoded.Arguments.TextCriterion.MatchStrategy) + assert.Equal(t, text.TextMatchStrategyContains, decoded.Response.TextCriterion.MatchStrategy) +} From b8440a54075bcb18c79061c89c399ab9eb8afd9b Mon Sep 17 00:00:00 2001 From: hackerli Date: Mon, 24 Nov 2025 14:10:42 +0800 Subject: [PATCH 06/14] examples --- docs/mkdocs/zh/evaluation.md | 251 +++++++++++++-- evaluation/evalresult/evalresult.go | 3 + evaluation/evalresult/local/local.go | 16 +- evaluation/evaluation.go | 4 + .../criterion/internal/maptext/maptext.go | 10 + .../metric/criterion/internal/text/text.go | 10 + .../internal/tooltrajectory/tooltrajectory.go | 21 +- evaluation/service/local/local.go | 2 + examples/evaluation/inmemory/main.go | 51 ++- 
.../math-eval-app/math-basic.evalset.json | 70 ++-- .../math-eval-app/math-basic.metrics.json | 23 +- ...41e9-b20e-06f23aa3cdbc.evalset_result.json | 1 - ...43c4-ac7b-ee12870fa973.evalset_result.json | 304 ++++++++++++++++++ 13 files changed, 695 insertions(+), 71 deletions(-) delete mode 100644 examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc.evalset_result.json create mode 100644 examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json diff --git a/docs/mkdocs/zh/evaluation.md b/docs/mkdocs/zh/evaluation.md index ffd9125db..f4a435396 100644 --- a/docs/mkdocs/zh/evaluation.md +++ b/docs/mkdocs/zh/evaluation.md @@ -58,15 +58,15 @@ if err != nil { ```json { - "eval_set_id": "math-basic", + "evalSetId": "math-basic", "name": "math-basic", - "eval_cases": [ + "evalCases": [ { - "eval_id": "calc_add", + "evalId": "calc_add", "conversation": [ { - "invocation_id": "calc_add-1", - "user_content": { + "invocationId": "calc_add-1", + "userContent": { "parts": [ { "text": "calc add 2 3" @@ -74,7 +74,7 @@ if err != nil { ], "role": "user" }, - "final_response": { + "finalResponse": { "parts": [ { "text": "calc result: 5" @@ -82,8 +82,8 @@ if err != nil { ], "role": "assistant" }, - "intermediate_data": { - "tool_uses": [ + "intermediateData": { + "toolUses": [ { "args": { "a": 2, @@ -92,19 +92,25 @@ if err != nil { }, "name": "calculator" } + ], + "toolResponses": [ + { + "name": "calculator", + "response": { + "result": 5 + } + } ] - }, - "creation_timestamp": 1761134484.981062 + } } ], - "session_input": { - "app_name": "math-eval-app", - "user_id": "user" - }, - "creation_timestamp": 1761134484.981062 - }, + "sessionInput": { + "appName": "math-eval-app", + "userId": "user" + } + } ], - "creation_timestamp": 1761134484.9804401 + "creationTimestamp": 1761134484.9804401 } ``` @@ -113,8 +119,27 @@ if err != nil { ```json [ { - "metric_name": 
"tool_trajectory_avg_score", - "threshold": 1 + "metricName": "tool_trajectory_avg_score", + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } } ] ``` @@ -122,7 +147,162 @@ if err != nil { #### 评估结果 EvalResult 文件示例 ```json -"{\"eval_set_result_id\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_result_name\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_id\":\"math-basic\",\"eval_case_results\":[{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_add\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"8b205b3f-682e-409a-b751-89ef805d0221\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"The result of adding 2 and 3 is **5**.\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_j75SIh8A9xSlG61OrC1ARIab\",\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_add-1\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 
5\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.981062},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"74252944-b1a7-4c17-8f39-4a5809395d1d\",\"user_id\":\"user\"},{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_multiply\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"65226930-d45c-43ae-ab88-9c35f3abce70\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"6 × 7 = 42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_b3Gj4Y3fJu9Blkbl6H0MLquO\",\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_multiply-1\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.9812014},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"6393fabd-ab50-49b7-8656-59fcb0a29758\",\"user_id\":\"user\"}],\"creation_timestamp\":1761134849.3572516}" +{ + "evalSetResultId": "math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973", + "evalSetResultName": "math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973", + "evalSetId": "math-basic", + "evalCaseResults": [ + { + "evalSetId": "math-basic", + "evalId": "calc_add", + "finalEvalStatus": 
1, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "49ff84cf-ad89-42ab-be07-1fffc4dc78f2", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "The result of 2 + 3 is **5**." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_pCwL67NPbNQAJEvZjvxuthX6", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_pCwL67NPbNQAJEvZjvxuthX6", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ] + } + ], 
+ "sessionId": "007a49f9-5a2c-49ba-a6ae-b0657d50aafb", + "userId": "user" + } + ], + "creationTimestamp": 1763960812.6226852 +} ``` ### 内存 inmemory @@ -225,6 +405,14 @@ cases := []*evalset.EvalCase{ }, }, }, + ToolResponses: []*genai.FunctionResponse{ + { + Name: "calculator", + Response: map[string]interface{}{ + "result": 5.0, + }, + }, + }, }, }, }, @@ -249,6 +437,29 @@ import "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" evalMetric := &metric.EvalMetric{ MetricName: "tool_trajectory_avg_score", Threshold: 1.0, + Criterion: criterion.New( + criterion.WithToolTrajectory( + ctooltrajectory.New( + ctooltrajectory.WithDefault( + &ctooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &maptext.MapTextCriterion{ + TextCriterion: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + }, + Response: &maptext.MapTextCriterion{ + TextCriterion: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyContains, + }, + }, + }, + ), + ), + ), + ), } metricManager.Add(ctx, appName, evalSetID, evalMetric) ``` diff --git a/evaluation/evalresult/evalresult.go b/evaluation/evalresult/evalresult.go index da40fde7b..7669d3468 100644 --- a/evaluation/evalresult/evalresult.go +++ b/evaluation/evalresult/evalresult.go @@ -15,6 +15,7 @@ import ( "trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" "trpc.group/trpc-go/trpc-agent-go/evaluation/status" ) @@ -63,6 +64,8 @@ type EvalMetricResult struct { EvalStatus status.EvalStatus `json:"evalStatus,omitempty"` // Threshold that was used. Threshold float64 `json:"threshold,omitempty"` + // Criterion contains the criterion used for this metric evaluation. + Criterion *criterion.Criterion `json:"criterion,omitempty"` // Details contains additional metric-specific information. 
Details map[string]any `json:"details,omitempty"` } diff --git a/evaluation/evalresult/local/local.go b/evaluation/evalresult/local/local.go index 9f87dd7a1..22a77d5e9 100644 --- a/evaluation/evalresult/local/local.go +++ b/evaluation/evalresult/local/local.go @@ -128,13 +128,9 @@ func (m *manager) load(appName, evalSetResultID string) (*evalresult.EvalSetResu return nil, fmt.Errorf("open file %s: %w", path, err) } defer f.Close() - var payload string - if err := json.NewDecoder(f).Decode(&payload); err != nil { - return nil, fmt.Errorf("decode file %s: %w", path, err) - } var res evalresult.EvalSetResult - if err := json.Unmarshal([]byte(payload), &res); err != nil { - return nil, fmt.Errorf("unmarshal eval set result %s: %w", path, err) + if err := json.NewDecoder(f).Decode(&res); err != nil { + return nil, fmt.Errorf("decode file %s: %w", path, err) } return &res, nil } @@ -154,13 +150,9 @@ func (m *manager) store(appName string, evalSetResult *evalresult.EvalSetResult) if err != nil { return fmt.Errorf("open file %s: %w", tmp, err) } - data, err := json.Marshal(evalSetResult) - if err != nil { - file.Close() - return fmt.Errorf("json marshal: %w", err) - } encoder := json.NewEncoder(file) - if err := encoder.Encode(string(data)); err != nil { + encoder.SetIndent("", " ") + if err := encoder.Encode(evalSetResult); err != nil { file.Close() os.Remove(tmp) return fmt.Errorf("encode file %s: %w", tmp, err) diff --git a/evaluation/evaluation.go b/evaluation/evaluation.go index cd69416fc..b80d8accc 100644 --- a/evaluation/evaluation.go +++ b/evaluation/evaluation.go @@ -20,6 +20,7 @@ import ( "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/registry" istatus "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/status" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" "trpc.group/trpc-go/trpc-agent-go/evaluation/service" "trpc.group/trpc-go/trpc-agent-go/evaluation/service/local" 
"trpc.group/trpc-go/trpc-agent-go/evaluation/status" @@ -195,6 +196,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu count int score float64 threshold float64 + criterion *criterion.Criterion } // Group metrics results by metric name. aggregatedMetrics := make(map[string]*aggregatedMetric) @@ -208,6 +210,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu } aggregatedMetrics[metric.MetricName].count++ aggregatedMetrics[metric.MetricName].score += metric.Score + aggregatedMetrics[metric.MetricName].criterion = metric.Criterion } } // Aggregate metrics results by metric name. @@ -223,6 +226,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu Score: average, EvalStatus: evalStatus, Threshold: aggregatedMetric.threshold, + Criterion: aggregatedMetric.criterion, }) } status, err := istatus.SummarizeMetricsStatus(metricsResults) diff --git a/evaluation/metric/criterion/internal/maptext/maptext.go b/evaluation/metric/criterion/internal/maptext/maptext.go index 9ce0a6a13..d52e0e6e5 100644 --- a/evaluation/metric/criterion/internal/maptext/maptext.go +++ b/evaluation/metric/criterion/internal/maptext/maptext.go @@ -1,3 +1,13 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package maptext defines map-based comparison criteria. package maptext import ( diff --git a/evaluation/metric/criterion/internal/text/text.go b/evaluation/metric/criterion/internal/text/text.go index a53ccbfaa..bc752defd 100644 --- a/evaluation/metric/criterion/internal/text/text.go +++ b/evaluation/metric/criterion/internal/text/text.go @@ -1,3 +1,13 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. 
+// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package text defines text comparison criteria. package text import ( diff --git a/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go b/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go index d039c276d..9df9abe17 100644 --- a/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go +++ b/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go @@ -1,3 +1,13 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package tooltrajectory defines tool trajectory comparison criteria. package tooltrajectory import ( @@ -45,7 +55,7 @@ func Match(t *tooltrajectory.ToolTrajectoryCriterion, actual, expected *evalset. if len(actual.IntermediateData.ToolUses) == 0 { return nil } - actualTools, err := getToolComparers(t, + actualTools, err := getToolComparers( actual.IntermediateData.ToolUses, actual.IntermediateData.ToolResponses, t.OrderInsensitive, @@ -53,7 +63,7 @@ func Match(t *tooltrajectory.ToolTrajectoryCriterion, actual, expected *evalset. if err != nil { return fmt.Errorf("get actual tools: %w", err) } - expectedTools, err := getToolComparers(t, + expectedTools, err := getToolComparers( expected.IntermediateData.ToolUses, expected.IntermediateData.ToolResponses, t.OrderInsensitive, @@ -79,8 +89,8 @@ func Match(t *tooltrajectory.ToolTrajectoryCriterion, actual, expected *evalset. } // getToolComparers aligns tool uses with their responses and builds toolComparer. 
-func getToolComparers(t *tooltrajectory.ToolTrajectoryCriterion, toolUses []*genai.FunctionCall, - toolResponses []*genai.FunctionResponse, orderInsensitive bool) ([]*toolComparer, error) { +func getToolComparers(toolUses []*genai.FunctionCall, toolResponses []*genai.FunctionResponse, + orderInsensitive bool) ([]*toolComparer, error) { // toolCallIDs ensures every tool use can be matched by ID. // Map from tool call id to index. toolCallIDs := make(map[string]int) @@ -126,7 +136,8 @@ func getToolComparers(t *tooltrajectory.ToolTrajectoryCriterion, toolUses []*gen } // getStrategy picks the comparison strategy for a specific tool pair. -func getStrategy(t *tooltrajectory.ToolTrajectoryCriterion, actualTool, expectedTool *toolComparer) *tooltrajectory.ToolTrajectoryStrategy { +func getStrategy(t *tooltrajectory.ToolTrajectoryCriterion, actualTool, + expectedTool *toolComparer) *tooltrajectory.ToolTrajectoryStrategy { if t.ToolStrategy != nil { strategy, ok := t.ToolStrategy[actualTool.name] if ok { diff --git a/evaluation/service/local/local.go b/evaluation/service/local/local.go index 70cb015ff..93f10b29d 100644 --- a/evaluation/service/local/local.go +++ b/evaluation/service/local/local.go @@ -202,6 +202,7 @@ func (s *local) evaluatePerCase(ctx context.Context, inferenceResult *service.In overallMetricResults = append(overallMetricResults, &evalresult.EvalMetricResult{ MetricName: evalMetric.MetricName, Threshold: evalMetric.Threshold, + Criterion: evalMetric.Criterion, Score: result.OverallScore, EvalStatus: result.OverallStatus, }) @@ -214,6 +215,7 @@ func (s *local) evaluatePerCase(ctx context.Context, inferenceResult *service.In evalMetricResult := &evalresult.EvalMetricResult{ MetricName: evalMetric.MetricName, Threshold: evalMetric.Threshold, + Criterion: evalMetric.Criterion, Score: invocationResult.Score, EvalStatus: invocationResult.Status, } diff --git a/examples/evaluation/inmemory/main.go b/examples/evaluation/inmemory/main.go index 
449159d58..678a8ad29 100644 --- a/examples/evaluation/inmemory/main.go +++ b/examples/evaluation/inmemory/main.go @@ -15,6 +15,10 @@ import ( evalsetinmemory "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset/inmemory" "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/registry" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" + ctooltrajectory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" metricinmemory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/inmemory" "trpc.group/trpc-go/trpc-agent-go/runner" ) @@ -35,10 +39,10 @@ func main() { ctx := context.Background() // New runner. run := runner.NewRunner(appName, newCalculatorAgent(*modelName, *streaming)) - + // Ensure runner resources are cleaned up (trpc-agent-go >= v0.5.0) defer run.Close() - + // New manager and registry for evaluation. 
evalSetManager := evalsetinmemory.New() metricManager := metricinmemory.New() @@ -154,6 +158,14 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { }, }, }, + ToolResponses: []*genai.FunctionResponse{ + { + Name: "calculator", + Response: map[string]interface{}{ + "result": 5.0, + }, + }, + }, }, }, }, @@ -194,6 +206,14 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { }, }, }, + ToolResponses: []*genai.FunctionResponse{ + { + Name: "calculator", + Response: map[string]interface{}{ + "result": 5.0, + }, + }, + }, }, }, }, @@ -204,6 +224,8 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { }, } for _, evalCase := range cases { + data, _ := json.MarshalIndent(evalCase, "", " ") + fmt.Println(string(data)) if err := evalSetManager.AddCase(ctx, appName, evalSetID, evalCase); err != nil { return err } @@ -215,6 +237,31 @@ func prepareMetric(ctx context.Context, metricManager metric.Manager) error { evalMetric := &metric.EvalMetric{ MetricName: "tool_trajectory_avg_score", Threshold: 1.0, + Criterion: criterion.New( + criterion.WithToolTrajectory( + ctooltrajectory.New( + ctooltrajectory.WithDefault( + &ctooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &maptext.MapTextCriterion{ + TextCriterion: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + }, + Response: &maptext.MapTextCriterion{ + TextCriterion: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyContains, + }, + }, + }, + ), + ), + ), + ), } + data, _ := json.MarshalIndent(evalMetric, "", " ") + fmt.Println(string(data)) return metricManager.Add(ctx, appName, evalSetID, evalMetric) } diff --git a/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json b/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json index 9069826a4..2370ec0e4 100644 --- 
a/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json +++ b/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json @@ -1,13 +1,13 @@ { - "eval_set_id": "math-basic", + "evalSetId": "math-basic", "name": "math-basic", - "eval_cases": [ + "evalCases": [ { - "eval_id": "calc_add", + "evalId": "calc_add", "conversation": [ { - "invocation_id": "calc_add-1", - "user_content": { + "invocationId": "calc_add-1", + "userContent": { "parts": [ { "text": "calc add 2 3" @@ -15,7 +15,7 @@ ], "role": "user" }, - "final_response": { + "finalResponse": { "parts": [ { "text": "calc result: 5" @@ -23,8 +23,8 @@ ], "role": "assistant" }, - "intermediate_data": { - "tool_uses": [ + "intermediateData": { + "toolUses": [ { "args": { "a": 2, @@ -33,23 +33,29 @@ }, "name": "calculator" } + ], + "toolResponses": [ + { + "name": "calculator", + "response": { + "result": 5 + } + } ] - }, - "creation_timestamp": 1761134484.981062 + } } ], - "session_input": { - "app_name": "math-eval-app", - "user_id": "user" - }, - "creation_timestamp": 1761134484.981062 + "sessionInput": { + "appName": "math-eval-app", + "userId": "user" + } }, { - "eval_id": "calc_multiply", + "evalId": "calc_multiply", "conversation": [ { - "invocation_id": "calc_multiply-1", - "user_content": { + "invocationId": "calc_multiply-1", + "userContent": { "parts": [ { "text": "calc multiply 6 7" @@ -57,7 +63,7 @@ ], "role": "user" }, - "final_response": { + "finalResponse": { "parts": [ { "text": "calc result: 42" @@ -65,8 +71,8 @@ ], "role": "assistant" }, - "intermediate_data": { - "tool_uses": [ + "intermediateData": { + "toolUses": [ { "args": { "a": 6, @@ -75,17 +81,23 @@ }, "name": "calculator" } + ], + "toolResponses": [ + { + "name": "calculator", + "response": { + "result": 5 + } + } ] - }, - "creation_timestamp": 1761134484.9812014 + } } ], - "session_input": { - "app_name": "math-eval-app", - "user_id": "user" - }, - "creation_timestamp": 1761134484.9812014 + "sessionInput": { + 
"appName": "math-eval-app", + "userId": "user" + } } ], - "creation_timestamp": 1761134484.9804401 + "creationTimestamp": 1761134484.9804401 } \ No newline at end of file diff --git a/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json b/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json index c57bd213f..5f50c1e16 100644 --- a/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json +++ b/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json @@ -1,6 +1,25 @@ [ { - "metric_name": "tool_trajectory_avg_score", - "threshold": 1 + "metricName": "tool_trajectory_avg_score", + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } } ] \ No newline at end of file diff --git a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc.evalset_result.json deleted file mode 100644 index 21454b672..000000000 --- a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc.evalset_result.json +++ /dev/null @@ -1 +0,0 @@ 
-"{\"eval_set_result_id\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_result_name\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_id\":\"math-basic\",\"eval_case_results\":[{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_add\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"8b205b3f-682e-409a-b751-89ef805d0221\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"The result of adding 2 and 3 is **5**.\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_j75SIh8A9xSlG61OrC1ARIab\",\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_add-1\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 5\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.981062},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"74252944-b1a7-4c17-8f39-4a5809395d1d\",\"user_id\":\"user\"},{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_multiply\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"65226930-d45c-43ae-ab88-9c35f3abce70\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"6 × 7 = 
42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_b3Gj4Y3fJu9Blkbl6H0MLquO\",\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_multiply-1\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.9812014},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"6393fabd-ab50-49b7-8656-59fcb0a29758\",\"user_id\":\"user\"}],\"creation_timestamp\":1761134849.3572516}" \ No newline at end of file diff --git a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json new file mode 100644 index 000000000..84f2b50e7 --- /dev/null +++ b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json @@ -0,0 +1,304 @@ +{ + "evalSetResultId": "math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973", + "evalSetResultName": "math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973", + "evalSetId": "math-basic", + "evalCaseResults": [ + { + "evalSetId": "math-basic", + "evalId": "calc_add", + "finalEvalStatus": 1, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + 
"textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "49ff84cf-ad89-42ab-be07-1fffc4dc78f2", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "The result of 2 + 3 is **5**." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_pCwL67NPbNQAJEvZjvxuthX6", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_pCwL67NPbNQAJEvZjvxuthX6", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ] + } + ], + "sessionId": "007a49f9-5a2c-49ba-a6ae-b0657d50aafb", + "userId": "user" + }, + { + "evalSetId": "math-basic", + "evalId": "calc_multiply", + "finalEvalStatus": 1, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + 
"toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "353274d3-694d-4de4-8dd8-e2cdde2ad5f5", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "6 multiplied by 7 equals 42." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_sgCNfRj0X4wDh6PqfuUUu5NC", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_sgCNfRj0X4wDh6PqfuUUu5NC", + "name": "calculator", + "response": { + "a": 6, + "b": 7, + "operation": "multiply", + "result": 42 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_multiply-1", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 42" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ] + } + ], + "sessionId": "a47948ed-4bb0-4c2b-a1a0-05d101dfe3e1", + "userId": "user" + } + ], + "creationTimestamp": 
1763960812.6226852 +} \ No newline at end of file From a8ec55b3d5d23555492348771ea1cea6f5622c17 Mon Sep 17 00:00:00 2001 From: hackerli Date: Mon, 24 Nov 2025 23:33:29 +0800 Subject: [PATCH 07/14] criterion refactor --- evaluation/evalresult/local/local.go | 13 +- .../tooltrajectory/tooltrajectory.go | 37 +- .../tooltrajectory/tooltrajectory_test.go | 113 +--- .../criterion/internal/maptext/maptext.go | 44 -- .../internal/maptext/maptext_test.go | 66 -- .../metric/criterion/internal/text/text.go | 56 -- .../criterion/internal/text/text_test.go | 85 --- .../internal/tooltrajectory/tooltrajectory.go | 244 -------- .../tooltrajectory/tooltrajectory_test.go | 537 ----------------- evaluation/metric/criterion/json/json.go | 52 ++ evaluation/metric/criterion/json/json_test.go | 44 ++ .../metric/criterion/maptext/maptext.go | 23 - .../metric/criterion/maptext/maptext_test.go | 37 -- evaluation/metric/criterion/text/text.go | 46 +- evaluation/metric/criterion/text/text_test.go | 78 +++ .../criterion/tooltrajectory/options.go | 31 +- .../criterion/tooltrajectory/options_test.go | 14 +- .../tooltrajectory/tooltrajectory.go | 229 ++++++- .../tooltrajectory/tooltrajectory_test.go | 567 +++++++++++++++++- examples/evaluation/inmemory/main.go | 4 + examples/evaluation/inmemory/server.log.txt | 441 ++++++++++++++ .../math-eval-app/math-basic.evalset.json | 182 +++--- ...4439-89d9-84fb9ffc21f8.evalset_result.json | 304 ++++++++++ ...40cd-b3f8-ed8c6c15564d.evalset_result.json | 304 ++++++++++ ...43c4-ac7b-ee12870fa973.evalset_result.json | 6 +- ...44d3-babe-035491901899.evalset_result.json | 304 ++++++++++ 26 files changed, 2515 insertions(+), 1346 deletions(-) delete mode 100644 evaluation/metric/criterion/internal/maptext/maptext.go delete mode 100644 evaluation/metric/criterion/internal/maptext/maptext_test.go delete mode 100644 evaluation/metric/criterion/internal/text/text.go delete mode 100644 evaluation/metric/criterion/internal/text/text_test.go delete mode 100644 
evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go delete mode 100644 evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory_test.go create mode 100644 evaluation/metric/criterion/json/json.go create mode 100644 evaluation/metric/criterion/json/json_test.go delete mode 100644 evaluation/metric/criterion/maptext/maptext.go delete mode 100644 evaluation/metric/criterion/maptext/maptext_test.go create mode 100644 examples/evaluation/inmemory/server.log.txt create mode 100644 examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8.evalset_result.json create mode 100644 examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d.evalset_result.json create mode 100644 examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899.evalset_result.json diff --git a/evaluation/evalresult/local/local.go b/evaluation/evalresult/local/local.go index 22a77d5e9..c914836bf 100644 --- a/evaluation/evalresult/local/local.go +++ b/evaluation/evalresult/local/local.go @@ -123,15 +123,22 @@ func (m *manager) evalSetResultPath(appName, evalSetResultID string) string { // load loads the EvalSetResult from the file system. func (m *manager) load(appName, evalSetResultID string) (*evalresult.EvalSetResult, error) { path := m.evalSetResultPath(appName, evalSetResultID) - f, err := os.Open(path) + data, err := os.ReadFile(path) if err != nil { return nil, fmt.Errorf("open file %s: %w", path, err) } - defer f.Close() var res evalresult.EvalSetResult - if err := json.NewDecoder(f).Decode(&res); err != nil { + if err := json.Unmarshal(data, &res); err == nil { + return &res, nil + } + // Keep backward compatibility with legacy string-wrapped results. 
+ var legacy string + if err := json.Unmarshal(data, &legacy); err != nil { return nil, fmt.Errorf("decode file %s: %w", path, err) } + if err := json.Unmarshal([]byte(legacy), &res); err != nil { + return nil, fmt.Errorf("decode legacy content in file %s: %w", path, err) + } return &res, nil } diff --git a/evaluation/evaluator/tooltrajectory/tooltrajectory.go b/evaluation/evaluator/tooltrajectory/tooltrajectory.go index 0b5d56a49..3640b97f0 100644 --- a/evaluation/evaluator/tooltrajectory/tooltrajectory.go +++ b/evaluation/evaluator/tooltrajectory/tooltrajectory.go @@ -13,13 +13,13 @@ package tooltrajectory import ( "context" "fmt" - "reflect" - "google.golang.org/genai" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + ctooltrajectory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" "trpc.group/trpc-go/trpc-agent-go/evaluation/status" + "trpc.group/trpc-go/trpc-agent-go/log" ) // toolTrajectoryEvaluator is a tool trajectory evaluator implementation for evaluator. 
@@ -53,10 +53,11 @@ func (e *toolTrajectoryEvaluator) Evaluate(ctx context.Context, actuals, expecte for i := range len(actuals) { actual := actuals[i] expected := expecteds[i] - actualCalls := getToolCalls(actual) - expectedCalls := getToolCalls(expected) score := 0.0 - if toolCallsEqual(actualCalls, expectedCalls) { + ok, err := toolCallsMatch(actual, expected, evalMetric.Criterion.ToolTrajectory) + if err != nil { + log.Errorf("tool trajectory mismatch: %v", err) + } else if ok { score = 1.0 } status := e.statusForScore(score, evalMetric) @@ -88,24 +89,14 @@ func (e *toolTrajectoryEvaluator) statusForScore(score float64, evalMetric *metr return status.EvalStatusFailed } -func getToolCalls(invocation *evalset.Invocation) []*genai.FunctionCall { - if invocation == nil || invocation.IntermediateData == nil { - return nil +func toolCallsMatch(actual, expected *evalset.Invocation, + criterion *ctooltrajectory.ToolTrajectoryCriterion) (bool, error) { + if criterion == nil { + return false, fmt.Errorf("criterion is nil") } - return invocation.IntermediateData.ToolUses -} - -func toolCallsEqual(actual, expected []*genai.FunctionCall) bool { - if len(actual) != len(expected) { - return false - } - for i := range actual { - if actual[i].Name != expected[i].Name { - return false - } - if !reflect.DeepEqual(actual[i].Args, expected[i].Args) { - return false - } + ok, err := criterion.Match(actual, expected) + if err != nil { + return false, fmt.Errorf("tool trajectory mismatch: %w", err) } - return true + return ok, nil } diff --git a/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go b/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go index 5e826a283..aa91997ae 100644 --- a/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go +++ b/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go @@ -1,98 +1,39 @@ -// -// Tencent is pleased to support the open source community by making trpc-agent-go available. -// -// Copyright (C) 2025 Tencent. 
All rights reserved. -// -// trpc-agent-go is licensed under the Apache License Version 2.0. -// -// - package tooltrajectory import ( - "context" + "encoding/json" "testing" "github.com/stretchr/testify/assert" - "google.golang.org/genai" - "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" - "trpc.group/trpc-go/trpc-agent-go/evaluation/status" + criterionjson "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" ) -func TestToolTrajectoryEvaluateSuccess(t *testing.T) { - inst := New() - assert.NotEmpty(t, inst.Description()) - assert.Equal(t, "tool_trajectory_avg_score", inst.Name()) - - e := inst.(*toolTrajectoryEvaluator) - actual := makeInvocation([]*genai.FunctionCall{ - {Name: "lookup", Args: map[string]any{"id": 1}}, - }) - expected := makeInvocation([]*genai.FunctionCall{ - {Name: "lookup", Args: map[string]any{"id": 1}}, - }) - - result, err := e.Evaluate(context.Background(), []*evalset.Invocation{actual}, []*evalset.Invocation{expected}, &metric.EvalMetric{Threshold: 0.5}) - assert.NoError(t, err) - assert.Equal(t, 1.0, result.OverallScore) - assert.Equal(t, status.EvalStatusPassed, result.OverallStatus) - assert.Len(t, result.PerInvocationResults, 1) - assert.Equal(t, actual, result.PerInvocationResults[0].ActualInvocation) - assert.Equal(t, expected, result.PerInvocationResults[0].ExpectedInvocation) - assert.Equal(t, status.EvalStatusPassed, result.PerInvocationResults[0].Status) -} - -func TestToolTrajectoryEvaluateMismatch(t *testing.T) { - e := New().(*toolTrajectoryEvaluator) - _, err := e.Evaluate(context.Background(), []*evalset.Invocation{}, []*evalset.Invocation{makeInvocation(nil)}, &metric.EvalMetric{Threshold: 1}) - assert.Error(t, err) - assert.Contains(t, err.Error(), "count mismatch") -} - -func 
TestToolTrajectoryEvaluateFailureStatus(t *testing.T) { - e := New().(*toolTrajectoryEvaluator) - actual := makeInvocation([]*genai.FunctionCall{ - {Name: "lookup", Args: map[string]any{"id": 1}}, - }) - expected := makeInvocation([]*genai.FunctionCall{ - {Name: "lookup", Args: map[string]any{"id": 2}}, - }) - - result, err := e.Evaluate(context.Background(), []*evalset.Invocation{actual}, []*evalset.Invocation{expected}, &metric.EvalMetric{Threshold: 0.9}) +func TestConfigJSONRoundTrip(t *testing.T) { + cfg := &tooltrajectory.ToolTrajectoryCriterion{ + DefaultStrategy: &tooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, + Arguments: &criterionjson.JSONCriterion{MatchStrategy: criterionjson.JSONMatchStrategyExact}, + Response: &criterionjson.JSONCriterion{MatchStrategy: criterionjson.JSONMatchStrategyExact}, + }, + ToolStrategy: map[string]*tooltrajectory.ToolTrajectoryStrategy{ + "custom": { + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyRegex}, + }, + }, + OrderInsensitive: true, + } + data, err := json.Marshal(cfg) assert.NoError(t, err) - assert.Zero(t, result.OverallScore) - assert.Equal(t, status.EvalStatusFailed, result.OverallStatus) - assert.Equal(t, status.EvalStatusFailed, result.PerInvocationResults[0].Status) -} + assert.Contains(t, string(data), `"orderInsensitive":true`) + assert.Contains(t, string(data), `"custom"`) -func TestToolTrajectoryEvaluateNotEvaluated(t *testing.T) { - e := New().(*toolTrajectoryEvaluator) - result, err := e.Evaluate(context.Background(), []*evalset.Invocation{}, []*evalset.Invocation{}, &metric.EvalMetric{Threshold: 1}) + var decoded tooltrajectory.ToolTrajectoryCriterion + err = json.Unmarshal(data, &decoded) assert.NoError(t, err) - assert.Equal(t, status.EvalStatusNotEvaluated, result.OverallStatus) - assert.Nil(t, result.PerInvocationResults) -} - -func TestGetToolCallsAndEqual(t *testing.T) { - assert.Nil(t, getToolCalls(nil)) - 
assert.Nil(t, getToolCalls(&evalset.Invocation{})) - - callA := []*genai.FunctionCall{{Name: "a", Args: map[string]any{"x": 1}}} - callB := []*genai.FunctionCall{{Name: "a", Args: map[string]any{"x": 1}}} - assert.True(t, toolCallsEqual(callA, callB)) - - callNameDiff := []*genai.FunctionCall{{Name: "b", Args: map[string]any{"x": 1}}} - callArgsDiff := []*genai.FunctionCall{{Name: "a", Args: map[string]any{"x": 2}}} - assert.False(t, toolCallsEqual(callA, callNameDiff)) - assert.False(t, toolCallsEqual(callA, callArgsDiff)) - assert.False(t, toolCallsEqual(callA, []*genai.FunctionCall{})) -} - -func makeInvocation(calls []*genai.FunctionCall) *evalset.Invocation { - return &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: calls, - }, - } + assert.True(t, decoded.OrderInsensitive) + assert.NotNil(t, decoded.DefaultStrategy) + assert.NotNil(t, decoded.ToolStrategy["custom"]) + assert.Equal(t, text.TextMatchStrategyRegex, decoded.ToolStrategy["custom"].Name.MatchStrategy) } diff --git a/evaluation/metric/criterion/internal/maptext/maptext.go b/evaluation/metric/criterion/internal/maptext/maptext.go deleted file mode 100644 index d52e0e6e5..000000000 --- a/evaluation/metric/criterion/internal/maptext/maptext.go +++ /dev/null @@ -1,44 +0,0 @@ -// -// Tencent is pleased to support the open source community by making trpc-agent-go available. -// -// Copyright (C) 2025 Tencent. All rights reserved. -// -// trpc-agent-go is licensed under the Apache License Version 2.0. -// -// - -// Package maptext defines map-based comparison criteria. -package maptext - -import ( - "encoding/json" - "fmt" - "reflect" - - itext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/text" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" -) - -// Match compares two maps using custom logic, text-based matching, or deep equality. 
-func Match(m *maptext.MapTextCriterion, actual, expected map[string]any) error { - if m.Compare != nil { - return m.Compare(actual, expected) - } - if m.TextCriterion != nil { - // Although the keys in a map are unordered, json.Marshal guarantees the order of the keys, - // so we can directly use json.Marshal for comparison. - actualData, err := json.Marshal(actual) - if err != nil { - return fmt.Errorf("marshal actual: %w", err) - } - expectedData, err := json.Marshal(expected) - if err != nil { - return fmt.Errorf("marshal expected: %w", err) - } - return itext.Match(m.TextCriterion, string(actualData), string(expectedData)) - } - if reflect.DeepEqual(actual, expected) { - return nil - } - return fmt.Errorf("actual %v and expected %v do not match", actual, expected) -} diff --git a/evaluation/metric/criterion/internal/maptext/maptext_test.go b/evaluation/metric/criterion/internal/maptext/maptext_test.go deleted file mode 100644 index 017576ded..000000000 --- a/evaluation/metric/criterion/internal/maptext/maptext_test.go +++ /dev/null @@ -1,66 +0,0 @@ -// -// Tencent is pleased to support the open source community by making trpc-agent-go available. -// -// Copyright (C) 2025 Tencent. All rights reserved. -// -// trpc-agent-go is licensed under the Apache License Version 2.0. 
-// -// - -package maptext - -import ( - "testing" - - "github.com/stretchr/testify/assert" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" -) - -func TestMapTextCriterionCompareOverride(t *testing.T) { - called := false - criterion := &maptext.MapTextCriterion{ - Compare: func(actual, expected map[string]any) error { - called = true - return nil - }, - } - err := Match(criterion, map[string]any{"k": "v"}, map[string]any{"k": "v"}) - assert.NoError(t, err) - assert.True(t, called) -} - -func TestMapTextCriterionTextMatch(t *testing.T) { - criterion := &maptext.MapTextCriterion{ - TextCriterion: &text.TextCriterion{ - CaseInsensitive: true, - MatchStrategy: text.TextMatchStrategyExact, - }, - } - err := Match(criterion, map[string]any{"msg": "Hello"}, map[string]any{"msg": "hello"}) - assert.NoError(t, err) -} - -func TestMapTextCriterionDeepEqualMismatch(t *testing.T) { - criterion := &maptext.MapTextCriterion{} - err := Match(criterion, map[string]any{"k": "v"}, map[string]any{"k": "diff"}) - assert.Error(t, err) -} - -func TestMapTextCriterionMarshalErrors(t *testing.T) { - criterion := &maptext.MapTextCriterion{ - TextCriterion: &text.TextCriterion{}, - } - // Actual marshal error. - actualErr := Match(criterion, map[string]any{"bad": make(chan int)}, map[string]any{"k": "v"}) - assert.Error(t, actualErr) - // Expected marshal error. 
- expectedErr := Match(criterion, map[string]any{"k": "v"}, map[string]any{"bad": make(chan int)}) - assert.Error(t, expectedErr) -} - -func TestMapTextCriterionDeepEqualSuccess(t *testing.T) { - criterion := &maptext.MapTextCriterion{} - err := Match(criterion, map[string]any{"k": "v"}, map[string]any{"k": "v"}) - assert.NoError(t, err) -} diff --git a/evaluation/metric/criterion/internal/text/text.go b/evaluation/metric/criterion/internal/text/text.go deleted file mode 100644 index bc752defd..000000000 --- a/evaluation/metric/criterion/internal/text/text.go +++ /dev/null @@ -1,56 +0,0 @@ -// -// Tencent is pleased to support the open source community by making trpc-agent-go available. -// -// Copyright (C) 2025 Tencent. All rights reserved. -// -// trpc-agent-go is licensed under the Apache License Version 2.0. -// -// - -// Package text defines text comparison criteria. -package text - -import ( - "fmt" - "regexp" - "strings" - - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" -) - -// Match compares source and target using the configured strategy. 
-func Match(t *text.TextCriterion, source, target string) error { - if t.Compare != nil { - return t.Compare(source, target) - } - if t.Ignore { - return nil - } - if t.CaseInsensitive { - source = strings.ToLower(source) - target = strings.ToLower(target) - } - switch t.MatchStrategy { - case text.TextMatchStrategyExact: - if source == target { - return nil - } - return fmt.Errorf("source %s and target %s do not match", source, target) - case text.TextMatchStrategyContains: - if strings.Contains(source, target) { - return nil - } - return fmt.Errorf("source %s does not contain target %s", source, target) - case text.TextMatchStrategyRegex: - re, err := regexp.Compile(target) - if err != nil { - return fmt.Errorf("invalid regex %s: %w", target, err) - } - if re.MatchString(source) { - return nil - } - return fmt.Errorf("source %s does not match regex %s", source, target) - default: - return fmt.Errorf("invalid match strategy %s", t.MatchStrategy) - } -} diff --git a/evaluation/metric/criterion/internal/text/text_test.go b/evaluation/metric/criterion/internal/text/text_test.go deleted file mode 100644 index f78b1edf7..000000000 --- a/evaluation/metric/criterion/internal/text/text_test.go +++ /dev/null @@ -1,85 +0,0 @@ -// -// Tencent is pleased to support the open source community by making trpc-agent-go available. -// -// Copyright (C) 2025 Tencent. All rights reserved. -// -// trpc-agent-go is licensed under the Apache License Version 2.0. 
-// -// - -package text - -import ( - "testing" - - "github.com/stretchr/testify/assert" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" -) - -func TestTextCriterionMatchStrategies(t *testing.T) { - criterion := &text.TextCriterion{ - CaseInsensitive: true, - MatchStrategy: text.TextMatchStrategyContains, - } - err := Match(criterion, "Hello World", "hello") - assert.NoError(t, err) -} - -func TestTextCriterionIgnore(t *testing.T) { - criterion := &text.TextCriterion{ - Ignore: true, - } - err := Match(criterion, "anything", "value") - assert.NoError(t, err) -} - -func TestTextCriterionRegexInvalid(t *testing.T) { - criterion := &text.TextCriterion{ - MatchStrategy: text.TextMatchStrategyRegex, - } - err := Match(criterion, "source", "[invalid(") - assert.Error(t, err) -} - -func TestTextCriterionUnknownStrategy(t *testing.T) { - criterion := &text.TextCriterion{ - MatchStrategy: text.TextMatchStrategy("unknown"), - } - err := Match(criterion, "a", "b") - assert.Error(t, err) -} - -func TestTextCriterionAllBranches(t *testing.T) { - customCalled := false - custom := &text.TextCriterion{ - Compare: func(actual, expected string) error { - customCalled = true - return nil - }, - } - err := Match(custom, "x", "y") - assert.NoError(t, err) - assert.True(t, customCalled) - - exact := &text.TextCriterion{ - MatchStrategy: text.TextMatchStrategyExact, - } - err = Match(exact, "same", "same") - assert.NoError(t, err) - err = Match(exact, "same", "diff") - assert.Error(t, err) - - contains := &text.TextCriterion{ - MatchStrategy: text.TextMatchStrategyContains, - } - err = Match(contains, "hello", "missing") - assert.Error(t, err) - - regex := &text.TextCriterion{ - MatchStrategy: text.TextMatchStrategyRegex, - } - err = Match(regex, "abc123", "abc[0-9]+") - assert.NoError(t, err) - err = Match(regex, "xyz", "abc[0-9]+") - assert.Error(t, err) -} diff --git a/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go 
b/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go deleted file mode 100644 index 9df9abe17..000000000 --- a/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory.go +++ /dev/null @@ -1,244 +0,0 @@ -// -// Tencent is pleased to support the open source community by making trpc-agent-go available. -// -// Copyright (C) 2025 Tencent. All rights reserved. -// -// trpc-agent-go is licensed under the Apache License Version 2.0. -// -// - -// Package tooltrajectory defines tool trajectory comparison criteria. -package tooltrajectory - -import ( - "encoding/json" - "errors" - "fmt" - "reflect" - "sort" - - "google.golang.org/genai" - "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" - imaptext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/maptext" - itext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/text" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" -) - -// Match compares actual and expected invocations according to tool trajectory rules. -func Match(t *tooltrajectory.ToolTrajectoryCriterion, actual, expected *evalset.Invocation) error { - if t.Compare != nil { - return t.Compare(actual, expected) - } - if actual == nil || expected == nil { - return fmt.Errorf("actual or expected invocation is nil") - } - if actual.IntermediateData == nil || expected.IntermediateData == nil { - return fmt.Errorf("actual or expected intermediate data is nil") - } - // Ensure one-to-one mapping between tool calls and responses on actual invocation. 
- if len(actual.IntermediateData.ToolUses) != len(actual.IntermediateData.ToolResponses) { - return fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", - len(actual.IntermediateData.ToolUses), len(actual.IntermediateData.ToolResponses)) - } - // Ensure one-to-one mapping between tool calls and responses on expected invocation. - if len(expected.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolResponses) { - return fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", - len(expected.IntermediateData.ToolUses), len(expected.IntermediateData.ToolResponses)) - } - // Ensure the same number of tool uses before detailed comparison. - if len(actual.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolUses) { - return fmt.Errorf("tool uses count mismatch: %d != %d", - len(actual.IntermediateData.ToolUses), len(expected.IntermediateData.ToolUses)) - } - if len(actual.IntermediateData.ToolUses) == 0 { - return nil - } - actualTools, err := getToolComparers( - actual.IntermediateData.ToolUses, - actual.IntermediateData.ToolResponses, - t.OrderInsensitive, - ) - if err != nil { - return fmt.Errorf("get actual tools: %w", err) - } - expectedTools, err := getToolComparers( - expected.IntermediateData.ToolUses, - expected.IntermediateData.ToolResponses, - t.OrderInsensitive, - ) - if err != nil { - return fmt.Errorf("get expected tools: %w", err) - } - if t.OrderInsensitive { - sort.Slice(actualTools, func(i, j int) bool { - return actualTools[i].lessThan(actualTools[j]) - }) - sort.Slice(expectedTools, func(i, j int) bool { - return expectedTools[i].lessThan(expectedTools[j]) - }) - } - for i := range len(actualTools) { - strategy := getStrategy(t, actualTools[i], expectedTools[i]) - if err := MatchStrategy(strategy, actualTools[i], expectedTools[i]); err != nil { - return fmt.Errorf("tool %s mismatch: %w", actualTools[i].name, err) - } - } - return nil -} - -// getToolComparers aligns tool uses with their responses and 
builds toolComparer. -func getToolComparers(toolUses []*genai.FunctionCall, toolResponses []*genai.FunctionResponse, - orderInsensitive bool) ([]*toolComparer, error) { - // toolCallIDs ensures every tool use can be matched by ID. - // Map from tool call id to index. - toolCallIDs := make(map[string]int) - for i := range len(toolUses) { - if toolUses[i].ID == "" { - return nil, fmt.Errorf("tool use id is empty") - } - if _, ok := toolCallIDs[toolUses[i].ID]; ok { - return nil, fmt.Errorf("tool use id %s is duplicated", toolUses[i].ID) - } - toolCallIDs[toolUses[i].ID] = i - } - // toolResponseIDs ensures every tool response can be matched by ID. - // Map from tool response id to index. - toolResponseIDs := make(map[string]int) - for i := range len(toolResponses) { - if toolResponses[i].ID == "" { - return nil, fmt.Errorf("tool response id is empty") - } - if _, ok := toolResponseIDs[toolResponses[i].ID]; ok { - return nil, fmt.Errorf("tool response id %s is duplicated", toolResponses[i].ID) - } - toolResponseIDs[toolResponses[i].ID] = i - } - for toolID := range toolCallIDs { - if _, ok := toolResponseIDs[toolID]; !ok { - return nil, fmt.Errorf("tool id %s is missing response", toolID) - } - } - toolComparers := make([]*toolComparer, 0, len(toolUses)) - for i := range len(toolUses) { - toolComparer, err := getToolComparer( - toolUses[i], - toolResponses[toolResponseIDs[toolUses[i].ID]], - orderInsensitive, - ) - if err != nil { - return nil, fmt.Errorf("get tool comparer: %w", err) - } - toolComparers = append(toolComparers, toolComparer) - } - return toolComparers, nil -} - -// getStrategy picks the comparison strategy for a specific tool pair. 
-func getStrategy(t *tooltrajectory.ToolTrajectoryCriterion, actualTool, - expectedTool *toolComparer) *tooltrajectory.ToolTrajectoryStrategy { - if t.ToolStrategy != nil { - strategy, ok := t.ToolStrategy[actualTool.name] - if ok { - return strategy - } - strategy, ok = t.ToolStrategy[expectedTool.name] - if ok { - return strategy - } - } - if t.DefaultStrategy != nil { - return t.DefaultStrategy - } - return &tooltrajectory.ToolTrajectoryStrategy{ - Name: &text.TextCriterion{ - MatchStrategy: text.TextMatchStrategyExact, - }, - Arguments: &maptext.MapTextCriterion{ - Compare: func(actual, expected map[string]any) error { - if !reflect.DeepEqual(actual, expected) { - return fmt.Errorf("actual %v and expected %v do not match", actual, expected) - } - return nil - }, - }, - Response: &maptext.MapTextCriterion{ - Compare: func(actual, expected map[string]any) error { - if !reflect.DeepEqual(actual, expected) { - return fmt.Errorf("actual %v and expected %v do not match", actual, expected) - } - return nil - }, - }, - } -} - -// Match validates a single tool call pair using configured criteria. -func MatchStrategy(t *tooltrajectory.ToolTrajectoryStrategy, actual, expected *toolComparer) error { - if t.Name != nil { - if err := itext.Match(t.Name, actual.name, expected.name); err != nil { - return fmt.Errorf("name mismatch: %w", err) - } - } - if t.Arguments != nil { - if err := imaptext.Match(t.Arguments, actual.args, expected.args); err != nil { - return fmt.Errorf("arguments mismatch: %w", err) - } - } - if t.Response != nil { - if err := imaptext.Match(t.Response, actual.response, expected.response); err != nil { - return fmt.Errorf("response mismatch: %w", err) - } - } - return nil -} - -// toolComparer normalizes tool call and response data for comparison. -type toolComparer struct { - name string // name holds the tool name. - args map[string]any // args holds parsed tool arguments. - response map[string]any // response holds parsed tool response payload. 
- argsOrder string // argsOrder caches JSON for order-insensitive compare. - responseOrder string // responseOrder caches JSON for order-insensitive compare. -} - -// lessThan provides deterministic ordering when order-insensitive compares require sorting. -func (t *toolComparer) lessThan(other *toolComparer) bool { - if t.name != other.name { - return t.name < other.name - } - if t.argsOrder != other.argsOrder { - return t.argsOrder < other.argsOrder - } - if t.responseOrder != other.responseOrder { - return t.responseOrder < other.responseOrder - } - return false -} - -// getToolComparer pairs a tool use with its response and precomputes ordering hints. -func getToolComparer(toolUse *genai.FunctionCall, toolResponse *genai.FunctionResponse, - orderInsensitive bool) (*toolComparer, error) { - if toolUse == nil || toolResponse == nil { - return nil, errors.New("tool use or tool response is nil") - } - tool := &toolComparer{ - name: toolUse.Name, - args: toolUse.Args, - response: toolResponse.Response, - } - if orderInsensitive { - args, err := json.Marshal(toolUse.Args) - if err != nil { - return nil, fmt.Errorf("marshal arguments: %w", err) - } - response, err := json.Marshal(toolResponse.Response) - if err != nil { - return nil, fmt.Errorf("marshal response: %w", err) - } - tool.argsOrder = string(args) - tool.responseOrder = string(response) - } - return tool, nil -} diff --git a/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory_test.go b/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory_test.go deleted file mode 100644 index 7525a20a1..000000000 --- a/evaluation/metric/criterion/internal/tooltrajectory/tooltrajectory_test.go +++ /dev/null @@ -1,537 +0,0 @@ -// -// Tencent is pleased to support the open source community by making trpc-agent-go available. -// -// Copyright (C) 2025 Tencent. All rights reserved. -// -// trpc-agent-go is licensed under the Apache License Version 2.0. 
-// -// - -package tooltrajectory - -import ( - "testing" - - "github.com/stretchr/testify/assert" - "google.golang.org/genai" - "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" - imaptext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/maptext" - itext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/text" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" -) - -func TestToolTrajectoryCriterionMatchOrderInsensitive(t *testing.T) { - actual := makeInvocation( - []toolData{ - {id: "call-1", name: "shared", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, - {id: "call-2", name: "shared", args: map[string]any{"a": 2}, response: map[string]any{"r": 3}}, - }, - ) - expected := makeInvocation( - []toolData{ - {id: "call-2", name: "shared", args: map[string]any{"a": 2}, response: map[string]any{"r": 3}}, - {id: "call-1", name: "shared", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, - }, - ) - - criterion := &tooltrajectory.ToolTrajectoryCriterion{ - OrderInsensitive: true, - } - err := Match(criterion, actual, expected) - assert.NoError(t, err) -} - -func TestToolTrajectoryCriterionMissingResponse(t *testing.T) { - actual := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "call-1", Name: "tool"}, - }, - ToolResponses: []*genai.FunctionResponse{}, - }, - } - expected := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "call-1", Name: "tool"}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "call-1", Name: "tool"}, - }, - }, - } - criterion := &tooltrajectory.ToolTrajectoryCriterion{} - err := Match(criterion, actual, expected) - assert.Error(t, err) -} - -func 
TestToolTrajectoryCriterionCustomStrategy(t *testing.T) { - actual := makeInvocation( - []toolData{ - {id: "call-1", name: "custom", args: map[string]any{"k": "v"}, response: map[string]any{"r": "x"}}, - }, - ) - expected := makeInvocation( - []toolData{ - {id: "call-1", name: "custom", args: map[string]any{"k": "v"}, response: map[string]any{"r": "x"}}, - }, - ) - customStrategy := &tooltrajectory.ToolTrajectoryStrategy{ - Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, - } - criterion := &tooltrajectory.ToolTrajectoryCriterion{ - ToolStrategy: map[string]*tooltrajectory.ToolTrajectoryStrategy{ - "custom": customStrategy, - }, - } - err := Match(criterion, actual, expected) - assert.NoError(t, err) -} - -type toolData struct { - id string - name string - args map[string]any - response map[string]any -} - -func makeInvocation(tools []toolData) *evalset.Invocation { - toolUses := make([]*genai.FunctionCall, 0, len(tools)) - toolResponses := make([]*genai.FunctionResponse, 0, len(tools)) - for _, t := range tools { - toolUses = append(toolUses, &genai.FunctionCall{ - ID: t.id, - Name: t.name, - Args: t.args, - }) - toolResponses = append(toolResponses, &genai.FunctionResponse{ - ID: t.id, - Name: t.name, - Response: t.response, - }) - } - return &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: toolUses, - ToolResponses: toolResponses, - }, - } -} - -func TestToolTrajectoryCriterionIDMismatch(t *testing.T) { - actual := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "use-1", Name: "tool"}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "resp-1", Name: "tool"}, - }, - }, - } - expected := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "use-1", Name: "tool"}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "use-1", Name: "tool"}, - }, - }, - } - criterion := 
tooltrajectory.New() - err := Match(criterion, actual, expected) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionNilInvocation(t *testing.T) { - criterion := tooltrajectory.New() - err := Match(criterion, nil, makeInvocation(nil)) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionNilIntermediate(t *testing.T) { - criterion := tooltrajectory.New() - err := Match(criterion, &evalset.Invocation{}, &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}}) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionEmptyToolUseID(t *testing.T) { - actual := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {Name: "tool"}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "resp-1", Name: "tool"}, - }, - }, - } - expected := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "resp-1", Name: "tool"}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "resp-1", Name: "tool"}, - }, - }, - } - err := Match(tooltrajectory.New(), actual, expected) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionDuplicateResponseID(t *testing.T) { - actual := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - actual.IntermediateData.ToolResponses = append(actual.IntermediateData.ToolResponses, &genai.FunctionResponse{ - ID: "call-1", - Name: "tool", - Response: map[string]any{"r": 2}, - }) - err := Match(tooltrajectory.New(), actual, makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - })) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionCustomCompare(t *testing.T) { - var called bool - criterion := &tooltrajectory.ToolTrajectoryCriterion{ - Compare: func(actual, expected *evalset.Invocation) error { - called = true - return nil - }, - } - err := Match(criterion, 
&evalset.Invocation{}, &evalset.Invocation{}) - assert.NoError(t, err) - assert.True(t, called) -} - -func TestToolTrajectoryCriterionExpectedResponseCountMismatch(t *testing.T) { - actual := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - expected := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "call-1", Name: "tool", Args: map[string]any{"a": 1}}, - }, - ToolResponses: []*genai.FunctionResponse{}, - }, - } - err := Match(tooltrajectory.New(), actual, expected) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionToolUsesCountMismatch(t *testing.T) { - actual := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - expected := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - {id: "call-2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, - }) - err := Match(tooltrajectory.New(), actual, expected) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionZeroTools(t *testing.T) { - actual := &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}} - expected := &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}} - err := Match(tooltrajectory.New(), actual, expected) - assert.NoError(t, err) -} - -func TestToolTrajectoryCriterionExpectedInvalidID(t *testing.T) { - actual := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - expected := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "", Name: "tool", Args: map[string]any{"a": 1}}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "call-1", Name: "tool", Response: map[string]any{"r": 1}}, - }, - }, - } - err := 
Match(tooltrajectory.New(), actual, expected) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionStrategyMismatch(t *testing.T) { - actual := makeInvocation([]toolData{ - {id: "call-1", name: "tool-A", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - expected := makeInvocation([]toolData{ - {id: "call-1", name: "tool-B", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - strategy := &tooltrajectory.ToolTrajectoryStrategy{ - Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, - } - criterion := tooltrajectory.New(tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{"tool-A": strategy})) - err := Match(criterion, actual, expected) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionDuplicateToolUseID(t *testing.T) { - actual := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "dup", Name: "tool"}, - {ID: "dup", Name: "tool"}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "dup", Name: "tool"}, - {ID: "dup2", Name: "tool"}, - }, - }, - } - expected := makeInvocation([]toolData{ - {id: "dup", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - {id: "dup2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, - }) - err := Match(tooltrajectory.New(), actual, expected) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionDuplicateToolResponseID(t *testing.T) { - actual := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "call-1", Name: "tool"}, - {ID: "call-2", Name: "tool"}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "call-1", Name: "tool"}, - {ID: "call-1", Name: "tool"}, - }, - }, - } - expected := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - {id: "call-2", name: "tool", args: 
map[string]any{"a": 2}, response: map[string]any{"r": 2}}, - }) - err := Match(tooltrajectory.New(), actual, expected) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionMissingResponseID(t *testing.T) { - actual := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "call-1", Name: "tool"}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "other", Name: "tool"}, - }, - }, - } - expected := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - err := Match(tooltrajectory.New(), actual, expected) - assert.Error(t, err) -} - -func TestToolComparerOrderInsensitiveMarshalError(t *testing.T) { - actual := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "call-1", Name: "tool", Args: map[string]any{"bad": make(chan int)}}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "call-1", Name: "tool", Response: map[string]any{"r": 1}}, - }, - }, - } - expected := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{}, response: map[string]any{"r": 1}}, - }) - err := Match(tooltrajectory.New(tooltrajectory.WithOrderInsensitive(true)), actual, expected) - assert.Error(t, err) -} - -func TestToolComparerOrderInsensitiveMarshalResponseError(t *testing.T) { - actual := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "call-1", Name: "tool", Args: map[string]any{"a": 1}}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "call-1", Name: "tool", Response: map[string]any{"bad": make(chan int)}}, - }, - }, - } - expected := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - err := Match(tooltrajectory.New(tooltrajectory.WithOrderInsensitive(true)), actual, expected) - assert.Error(t, err) -} - -func 
TestToolComparerLessThanBranches(t *testing.T) { - left := &toolComparer{name: "a", argsOrder: "1", responseOrder: "1"} - right := &toolComparer{name: "b", argsOrder: "0", responseOrder: "0"} - assert.True(t, left.lessThan(right)) - - left2 := &toolComparer{name: "a", argsOrder: "2", responseOrder: "1"} - right2 := &toolComparer{name: "a", argsOrder: "3", responseOrder: "0"} - assert.True(t, left2.lessThan(right2)) - - left3 := &toolComparer{name: "a", argsOrder: "1", responseOrder: "2"} - right3 := &toolComparer{name: "a", argsOrder: "1", responseOrder: "3"} - assert.True(t, left3.lessThan(right3)) -} - -func TestToolTrajectoryStrategyArgumentAndResponseMismatch(t *testing.T) { - strategy := &tooltrajectory.ToolTrajectoryStrategy{ - Arguments: &maptext.MapTextCriterion{}, - Response: &maptext.MapTextCriterion{}, - } - actual := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - expected := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 3}}, - }) - criterion := tooltrajectory.New(tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{ - "tool": strategy, - })) - err := Match(criterion, actual, expected) - assert.Error(t, err) -} - -func TestGetToolComparerNilInputs(t *testing.T) { - _, err := getToolComparer(nil, &genai.FunctionResponse{}, false) - assert.Error(t, err) - _, err = getToolComparer(&genai.FunctionCall{}, nil, false) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionMissingResponseSet(t *testing.T) { - actual := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "call-1", Name: "tool"}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "call-1", Name: "tool"}, - }, - }, - } - expected := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "call-1", Name: 
"tool"}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "other", Name: "tool"}, - }, - }, - } - err := Match(tooltrajectory.New(), actual, expected) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionFallbackDefault(t *testing.T) { - actual := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - expected := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - criterion := &tooltrajectory.ToolTrajectoryCriterion{ - DefaultStrategy: nil, - ToolStrategy: nil, - } - err := Match(criterion, actual, expected) - assert.NoError(t, err) -} - -func TestToolTrajectoryCriterionFallbackDefaultStrategy(t *testing.T) { - actual := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - expected := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - criterion := &tooltrajectory.ToolTrajectoryCriterion{ - DefaultStrategy: nil, - ToolStrategy: nil, - } - err := Match(criterion, actual, expected) - assert.NoError(t, err) -} - -func TestToolTrajectoryCriterionEmptyToolResponseID(t *testing.T) { - actual := &evalset.Invocation{ - IntermediateData: &evalset.IntermediateData{ - ToolUses: []*genai.FunctionCall{ - {ID: "call-1", Name: "tool"}, - }, - ToolResponses: []*genai.FunctionResponse{ - {ID: "", Name: "tool"}, - }, - }, - } - expected := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{}, response: map[string]any{}}, - }) - err := Match(tooltrajectory.New(), actual, expected) - assert.Error(t, err) -} - -func TestToolTrajectoryCriterionStrategyLookupByExpectedName(t *testing.T) { - actual := makeInvocation([]toolData{ - {id: "call-1", name: "unknown", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - expected := 
makeInvocation([]toolData{ - {id: "call-1", name: "custom", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - customStrategy := &tooltrajectory.ToolTrajectoryStrategy{} - criterion := tooltrajectory.New(tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{ - "custom": customStrategy, - })) - err := Match(criterion, actual, expected) - assert.NoError(t, err) -} - -func TestToolTrajectoryStrategyResponseMismatchOnly(t *testing.T) { - strategy := &tooltrajectory.ToolTrajectoryStrategy{ - Arguments: &maptext.MapTextCriterion{}, - Response: &maptext.MapTextCriterion{}, - } - actual := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, - }) - expected := makeInvocation([]toolData{ - {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, - }) - criterion := tooltrajectory.New(tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{ - "tool": strategy, - })) - err := Match(criterion, actual, expected) - assert.Error(t, err) -} - -func TestToolComparerLessThanEqual(t *testing.T) { - left := &toolComparer{name: "same", argsOrder: "1", responseOrder: "1"} - right := &toolComparer{name: "same", argsOrder: "1", responseOrder: "1"} - assert.False(t, left.lessThan(right)) -} - -func TestInternalTextAndMapWrappers(t *testing.T) { - txt := &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact} - err := itext.Match(txt, "same", "same") - assert.NoError(t, err) - - crit := &maptext.MapTextCriterion{} - err = imaptext.Match(crit, map[string]any{"a": 1}, map[string]any{"a": 1}) - assert.NoError(t, err) -} diff --git a/evaluation/metric/criterion/json/json.go b/evaluation/metric/criterion/json/json.go new file mode 100644 index 000000000..2fbf56fde --- /dev/null +++ b/evaluation/metric/criterion/json/json.go @@ -0,0 +1,52 @@ +// +// Tencent is pleased to support the open source community by making 
trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. +// +// + +// Package json defines json-based comparison criteria. +package json + +import ( + "fmt" + "reflect" +) + +// JSONCriterion compares two JSON objects using exact matching. +type JSONCriterion struct { + // MatchStrategy selects the comparison rule. + MatchStrategy JSONMatchStrategy `json:"matchStrategy,omitempty"` + // Compare overrides default comparison when provided. + Compare func(actual, expected map[string]any) (bool, error) `json:"-"` +} + +// JSONMatchStrategy enumerates supported JSON comparison strategies. +type JSONMatchStrategy string + +const ( + // JSONMatchStrategyExact matches json objects exactly. + JSONMatchStrategyExact JSONMatchStrategy = "exact" +) + +// Match compares two JSON objects using custom logic or deep equality. +func (j *JSONCriterion) Match(actual, expected map[string]any) (bool, error) { + if j == nil { + return false, fmt.Errorf("json criterion is nil") + } + if j.Compare != nil { + return j.Compare(actual, expected) + } + switch j.MatchStrategy { + // Default to exact match. + case JSONMatchStrategyExact, "": + if reflect.DeepEqual(actual, expected) { + return true, nil + } + return false, fmt.Errorf("actual %v and expected %v do not match", actual, expected) + default: + return false, fmt.Errorf("invalid match strategy %s", j.MatchStrategy) + } +} diff --git a/evaluation/metric/criterion/json/json_test.go b/evaluation/metric/criterion/json/json_test.go new file mode 100644 index 000000000..eb6616169 --- /dev/null +++ b/evaluation/metric/criterion/json/json_test.go @@ -0,0 +1,44 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + +package json + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestMapCriterionCompareOverride(t *testing.T) { + called := false + criterion := &JSONCriterion{ + Compare: func(actual, expected map[string]any) (bool, error) { + called = true + return true, nil + }, + } + ok, err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "v"}) + assert.True(t, ok) + assert.NoError(t, err) + assert.True(t, called) +} + +func TestMapCriterionDeepEqualMismatch(t *testing.T) { + criterion := &JSONCriterion{} + ok, err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "diff"}) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestMapCriterionDeepEqualSuccess(t *testing.T) { + criterion := &JSONCriterion{} + ok, err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "v"}) + assert.True(t, ok) + assert.NoError(t, err) +} diff --git a/evaluation/metric/criterion/maptext/maptext.go b/evaluation/metric/criterion/maptext/maptext.go deleted file mode 100644 index 76eab58b4..000000000 --- a/evaluation/metric/criterion/maptext/maptext.go +++ /dev/null @@ -1,23 +0,0 @@ -// -// Tencent is pleased to support the open source community by making trpc-agent-go available. -// -// Copyright (C) 2025 Tencent. All rights reserved. -// -// trpc-agent-go is licensed under the Apache License Version 2.0. -// -// - -// Package maptext defines map-based comparison criteria. -package maptext - -import ( - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" -) - -// MapTextCriterion compares two string-keyed maps. -type MapTextCriterion struct { - // TextCriterion applies string-based matching on JSON-serialized maps. - TextCriterion *text.TextCriterion `json:"textCriterion,omitempty"` - // Compare overrides default comparison when provided. 
- Compare func(actual, expected map[string]any) error `json:"-"` -} diff --git a/evaluation/metric/criterion/maptext/maptext_test.go b/evaluation/metric/criterion/maptext/maptext_test.go deleted file mode 100644 index f57c8f000..000000000 --- a/evaluation/metric/criterion/maptext/maptext_test.go +++ /dev/null @@ -1,37 +0,0 @@ -// -// Tencent is pleased to support the open source community by making trpc-agent-go available. -// -// Copyright (C) 2025 Tencent. All rights reserved. -// -// trpc-agent-go is licensed under the Apache License Version 2.0. -// -// - -package maptext - -import ( - "encoding/json" - "testing" - - "github.com/stretchr/testify/assert" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" -) - -func TestMapTextCriterionJSONRoundTrip(t *testing.T) { - criterion := &MapTextCriterion{ - TextCriterion: &text.TextCriterion{ - Ignore: true, - MatchStrategy: text.TextMatchStrategyExact, - }, - } - data, err := json.Marshal(criterion) - assert.NoError(t, err) - assert.JSONEq(t, `{"textCriterion":{"ignore":true,"matchStrategy":"exact"}}`, string(data)) - - var decoded MapTextCriterion - err = json.Unmarshal(data, &decoded) - assert.NoError(t, err) - assert.NotNil(t, decoded.TextCriterion) - assert.Equal(t, criterion.TextCriterion.Ignore, decoded.TextCriterion.Ignore) - assert.Equal(t, criterion.TextCriterion.MatchStrategy, decoded.TextCriterion.MatchStrategy) -} diff --git a/evaluation/metric/criterion/text/text.go b/evaluation/metric/criterion/text/text.go index 8d306cbfc..031d0bae8 100644 --- a/evaluation/metric/criterion/text/text.go +++ b/evaluation/metric/criterion/text/text.go @@ -10,6 +10,12 @@ // Package text defines text comparison criteria. package text +import ( + "fmt" + "regexp" + "strings" +) + // TextCriterion governs how two strings should be compared. type TextCriterion struct { // Ignore skips comparison when true. @@ -19,7 +25,7 @@ type TextCriterion struct { // MatchStrategy selects the comparison rule. 
MatchStrategy TextMatchStrategy `json:"matchStrategy,omitempty"` // Compare overrides built-in strategies. - Compare func(actual, expected string) error `json:"-"` + Compare func(actual, expected string) (bool, error) `json:"-"` } // TextMatchStrategy enumerates supported text comparison strategies. @@ -33,3 +39,41 @@ const ( // TextMatchStrategyRegex matches strings that match the regex. TextMatchStrategyRegex TextMatchStrategy = "regex" ) + +// Match compares source and target using the configured strategy. +func (t *TextCriterion) Match(source, target string) (bool, error) { + if t.Compare != nil { + return t.Compare(source, target) + } + if t.Ignore { + return true, nil + } + if t.CaseInsensitive { + source = strings.ToLower(source) + target = strings.ToLower(target) + } + switch t.MatchStrategy { + // Default to exact match. + case TextMatchStrategyExact, "": + if source == target { + return true, nil + } + return false, fmt.Errorf("source %s and target %s do not match", source, target) + case TextMatchStrategyContains: + if strings.Contains(source, target) { + return true, nil + } + return false, fmt.Errorf("source %s does not contain target %s", source, target) + case TextMatchStrategyRegex: + re, err := regexp.Compile(target) + if err != nil { + return false, fmt.Errorf("invalid regex %s: %w", target, err) + } + if re.MatchString(source) { + return true, nil + } + return false, fmt.Errorf("source %s does not match regex %s", source, target) + default: + return false, fmt.Errorf("invalid match strategy %s", t.MatchStrategy) + } +} diff --git a/evaluation/metric/criterion/text/text_test.go b/evaluation/metric/criterion/text/text_test.go index b99ddbe36..238d75770 100644 --- a/evaluation/metric/criterion/text/text_test.go +++ b/evaluation/metric/criterion/text/text_test.go @@ -33,3 +33,81 @@ func TestTextCriterionJSONRoundTrip(t *testing.T) { assert.Equal(t, criterion.CaseInsensitive, decoded.CaseInsensitive) assert.Equal(t, criterion.MatchStrategy, 
decoded.MatchStrategy) } + +func TestTextCriterionMatchStrategies(t *testing.T) { + criterion := &TextCriterion{ + CaseInsensitive: true, + MatchStrategy: TextMatchStrategyContains, + } + ok, err := criterion.Match("Hello World", "hello") + assert.NoError(t, err) + assert.True(t, ok) +} + +func TestTextCriterionIgnore(t *testing.T) { + criterion := &TextCriterion{ + Ignore: true, + } + ok, err := criterion.Match("anything", "value") + assert.NoError(t, err) + assert.True(t, ok) +} + +func TestTextCriterionRegexInvalid(t *testing.T) { + criterion := &TextCriterion{ + MatchStrategy: TextMatchStrategyRegex, + } + ok, err := criterion.Match("source", "[invalid(") + assert.False(t, ok) + assert.Error(t, err) +} + +func TestTextCriterionUnknownStrategy(t *testing.T) { + criterion := &TextCriterion{ + MatchStrategy: TextMatchStrategy("unknown"), + } + ok, err := criterion.Match("a", "b") + assert.False(t, ok) + assert.Error(t, err) +} + +func TestTextCriterionAllBranches(t *testing.T) { + customCalled := false + custom := &TextCriterion{ + Compare: func(actual, expected string) (bool, error) { + customCalled = true + return true, nil + }, + } + ok, err := custom.Match("x", "y") + assert.True(t, ok) + assert.NoError(t, err) + assert.True(t, customCalled) + + exact := &TextCriterion{ + MatchStrategy: TextMatchStrategyExact, + } + ok, err = exact.Match("same", "same") + assert.True(t, ok) + assert.NoError(t, err) + ok, err = exact.Match("same", "diff") + assert.False(t, ok) + assert.Error(t, err) + + contains := &TextCriterion{ + MatchStrategy: TextMatchStrategyContains, + } + ok, err = contains.Match("hello", "missing") + assert.False(t, ok) + assert.Error(t, err) + + regex := &TextCriterion{ + MatchStrategy: TextMatchStrategyRegex, + } + ok, err = regex.Match("abc123", "abc[0-9]+") + assert.True(t, ok) + assert.NoError(t, err) + ok, err = regex.Match("xyz", "abc[0-9]+") + assert.False(t, ok) + assert.Error(t, err) +} diff --git 
a/evaluation/metric/criterion/tooltrajectory/options.go b/evaluation/metric/criterion/tooltrajectory/options.go index dcaa87cc6..cc2fb4965 100644 --- a/evaluation/metric/criterion/tooltrajectory/options.go +++ b/evaluation/metric/criterion/tooltrajectory/options.go @@ -10,35 +10,16 @@ package tooltrajectory import ( - "fmt" - "reflect" - "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" ) // defaultToolTrajectoryStrategy is used when no user strategy is supplied. var defaultToolTrajectoryStrategy = &ToolTrajectoryStrategy{ - Name: &text.TextCriterion{ - MatchStrategy: text.TextMatchStrategyExact, - }, - Arguments: &maptext.MapTextCriterion{ - Compare: func(actual, expected map[string]any) error { - if !reflect.DeepEqual(actual, expected) { - return fmt.Errorf("actual %v and expected %v do not match", actual, expected) - } - return nil - }, - }, - Response: &maptext.MapTextCriterion{ - Compare: func(actual, expected map[string]any) error { - if !reflect.DeepEqual(actual, expected) { - return fmt.Errorf("actual %v and expected %v do not match", actual, expected) - } - return nil - }, - }, + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, + Arguments: &json.JSONCriterion{MatchStrategy: json.JSONMatchStrategyExact}, + Response: &json.JSONCriterion{MatchStrategy: json.JSONMatchStrategyExact}, } // options configures ToolTrajectoryCriterion. @@ -50,7 +31,7 @@ type options struct { // orderInsensitive toggles order-agnostic comparison for args and responses. orderInsensitive bool // compare allows overriding comparison logic entirely. - compare func(actual, expected *evalset.Invocation) error + compare func(actual, expected *evalset.Invocation) (bool, error) } // newOptions applies provided options for ToolTrajectoryCriterion. 
@@ -92,7 +73,7 @@ func WithOrderInsensitive(orderInsensitive bool) Option { } // WithCompare sets the tool trajectory comparison logic. -func WithCompare(compare func(actual, expected *evalset.Invocation) error) Option { +func WithCompare(compare func(actual, expected *evalset.Invocation) (bool, error)) Option { return func(o *options) { o.compare = compare } diff --git a/evaluation/metric/criterion/tooltrajectory/options_test.go b/evaluation/metric/criterion/tooltrajectory/options_test.go index ebee155d7..64f79b376 100644 --- a/evaluation/metric/criterion/tooltrajectory/options_test.go +++ b/evaluation/metric/criterion/tooltrajectory/options_test.go @@ -14,7 +14,6 @@ import ( "github.com/stretchr/testify/assert" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" - imaptext "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/internal/maptext" ) func TestNewOptionsDefaults(t *testing.T) { @@ -46,27 +45,30 @@ func TestWithOrderInsensitive(t *testing.T) { func TestWithCompare(t *testing.T) { var called bool - compare := func(actual, expected *evalset.Invocation) error { + compare := func(actual, expected *evalset.Invocation) (bool, error) { called = true - return nil + return true, nil } opts := newOptions(WithCompare(compare)) assert.NotNil(t, opts.compare) - err := opts.compare(nil, nil) + ok, err := opts.compare(nil, nil) + assert.True(t, ok) assert.NoError(t, err) assert.True(t, called) } func TestDefaultToolTrajectoryStrategyDeepEqualMismatch(t *testing.T) { - errArgs := imaptext.Match(defaultToolTrajectoryStrategy.Arguments, + ok, errArgs := defaultToolTrajectoryStrategy.Arguments.Match( map[string]any{"a": 1}, map[string]any{"a": 2}, ) + assert.False(t, ok) assert.Error(t, errArgs) - errResp := imaptext.Match(defaultToolTrajectoryStrategy.Response, + ok, errResp := defaultToolTrajectoryStrategy.Response.Match( map[string]any{"r": 1}, map[string]any{"r": 3}, ) + assert.False(t, ok) assert.Error(t, errResp) } diff --git 
a/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go b/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go index 404d16362..ad1c83f69 100644 --- a/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go +++ b/evaluation/metric/criterion/tooltrajectory/tooltrajectory.go @@ -11,8 +11,14 @@ package tooltrajectory import ( + "encoding/json" + "errors" + "fmt" + "sort" + + "google.golang.org/genai" "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" + criterionjson "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" ) @@ -36,12 +42,225 @@ type ToolTrajectoryCriterion struct { // OrderInsensitive toggles comparison order for args and responses. OrderInsensitive bool `json:"orderInsensitive,omitempty"` // Compare allows custom comparison override. - Compare func(actual, expected *evalset.Invocation) error `json:"-"` + Compare func(actual, expected *evalset.Invocation) (bool, error) `json:"-"` } // ToolTrajectoryStrategy defines comparison strategies for a single tool. type ToolTrajectoryStrategy struct { - Name *text.TextCriterion `json:"name,omitempty"` // Name compares tool names. - Arguments *maptext.MapTextCriterion `json:"arguments,omitempty"` // Arguments compares tool call arguments. - Response *maptext.MapTextCriterion `json:"response,omitempty"` // Response compares tool call responses. + Name *text.TextCriterion `json:"name,omitempty"` // Name compares tool names. + Arguments *criterionjson.JSONCriterion `json:"arguments,omitempty"` // Arguments compares tool call arguments. + Response *criterionjson.JSONCriterion `json:"response,omitempty"` // Response compares tool call responses. +} + +// Match compares actual and expected invocations according to tool trajectory rules. 
+func (t *ToolTrajectoryCriterion) Match(actual, expected *evalset.Invocation) (bool, error) { + if t.Compare != nil { + return t.Compare(actual, expected) + } + if actual == nil || expected == nil { + return false, fmt.Errorf("actual or expected invocation is nil") + } + if actual.IntermediateData == nil || expected.IntermediateData == nil { + return false, fmt.Errorf("actual or expected intermediate data is nil") + } + // Ensure one-to-one mapping between tool calls and responses on actual invocation. + if len(actual.IntermediateData.ToolUses) != len(actual.IntermediateData.ToolResponses) { + return false, fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", + len(actual.IntermediateData.ToolUses), len(actual.IntermediateData.ToolResponses)) + } + // Ensure one-to-one mapping between tool calls and responses on expected invocation. + if len(expected.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolResponses) { + return false, fmt.Errorf("tool uses and tool responses count mismatch: %d != %d", + len(expected.IntermediateData.ToolUses), len(expected.IntermediateData.ToolResponses)) + } + // Ensure the same number of tool uses before detailed comparison. 
+ if len(actual.IntermediateData.ToolUses) != len(expected.IntermediateData.ToolUses) { + return false, fmt.Errorf("tool uses count mismatch: %d != %d", + len(actual.IntermediateData.ToolUses), len(expected.IntermediateData.ToolUses)) + } + if len(actual.IntermediateData.ToolUses) == 0 { + return true, nil + } + actualTools, err := getToolComparers( + actual.IntermediateData.ToolUses, + actual.IntermediateData.ToolResponses, + t.OrderInsensitive, + ) + if err != nil { + return false, fmt.Errorf("get actual tools: %w", err) + } + expectedTools, err := getToolComparers( + expected.IntermediateData.ToolUses, + expected.IntermediateData.ToolResponses, + t.OrderInsensitive, + ) + if err != nil { + return false, fmt.Errorf("get expected tools: %w", err) + } + if t.OrderInsensitive { + sort.Slice(actualTools, func(i, j int) bool { + return actualTools[i].lessThan(actualTools[j]) + }) + sort.Slice(expectedTools, func(i, j int) bool { + return expectedTools[i].lessThan(expectedTools[j]) + }) + } + for i := range len(actualTools) { + strategy := getStrategy(t, actualTools[i], expectedTools[i]) + ok, err := strategy.match(actualTools[i], expectedTools[i]) + if err != nil { + return false, fmt.Errorf("tool %s mismatch: %w", actualTools[i].name, err) + } + if !ok { + return false, fmt.Errorf("tool %s mismatch", actualTools[i].name) + } + } + return true, nil +} + +// Match validates a single tool call pair using configured criteria. 
+func (t *ToolTrajectoryStrategy) match(actual, expected *toolComparer) (bool, error) { + if t.Name != nil { + ok, err := t.Name.Match(actual.name, expected.name) + if err != nil { + return false, fmt.Errorf("name mismatch: %w", err) + } + if !ok { + return false, fmt.Errorf("name mismatch") + } + } + if t.Arguments != nil { + ok, err := t.Arguments.Match(actual.args, expected.args) + if err != nil { + return false, fmt.Errorf("arguments mismatch: %w", err) + } + if !ok { + return false, fmt.Errorf("arguments mismatch") + } + } + if t.Response != nil { + ok, err := t.Response.Match(actual.response, expected.response) + if err != nil { + return false, fmt.Errorf("response mismatch: %w", err) + } + if !ok { + return false, fmt.Errorf("response mismatch") + } + } + return true, nil +} + +// toolComparer normalizes tool call and response data for comparison. +type toolComparer struct { + name string // name holds the tool name. + args map[string]any // args holds parsed tool arguments. + response map[string]any // response holds parsed tool response payload. + argsOrder string // argsOrder caches JSON for order-insensitive compare. + responseOrder string // responseOrder caches JSON for order-insensitive compare. +} + +// lessThan provides deterministic ordering when order-insensitive compares require sorting. +func (t *toolComparer) lessThan(other *toolComparer) bool { + if t.name != other.name { + return t.name < other.name + } + if t.argsOrder != other.argsOrder { + return t.argsOrder < other.argsOrder + } + if t.responseOrder != other.responseOrder { + return t.responseOrder < other.responseOrder + } + return false +} + +// getToolComparers aligns tool uses with their responses and builds toolComparer. +func getToolComparers(toolUses []*genai.FunctionCall, toolResponses []*genai.FunctionResponse, + orderInsensitive bool) ([]*toolComparer, error) { + // toolCallIDs ensures every tool use can be matched by ID. + // Map from tool call id to index. 
+ toolCallIDs := make(map[string]int) + for i := range len(toolUses) { + if toolUses[i].ID == "" { + return nil, fmt.Errorf("tool use id is empty") + } + if _, ok := toolCallIDs[toolUses[i].ID]; ok { + return nil, fmt.Errorf("tool use id %s is duplicated", toolUses[i].ID) + } + toolCallIDs[toolUses[i].ID] = i + } + // toolResponseIDs ensures every tool response can be matched by ID. + // Map from tool response id to index. + toolResponseIDs := make(map[string]int) + for i := range len(toolResponses) { + if toolResponses[i].ID == "" { + return nil, fmt.Errorf("tool response id is empty") + } + if _, ok := toolResponseIDs[toolResponses[i].ID]; ok { + return nil, fmt.Errorf("tool response id %s is duplicated", toolResponses[i].ID) + } + toolResponseIDs[toolResponses[i].ID] = i + } + for toolID := range toolCallIDs { + if _, ok := toolResponseIDs[toolID]; !ok { + return nil, fmt.Errorf("tool id %s is missing response", toolID) + } + } + toolComparers := make([]*toolComparer, 0, len(toolUses)) + for i := range len(toolUses) { + toolComparer, err := getToolComparer( + toolUses[i], + toolResponses[toolResponseIDs[toolUses[i].ID]], + orderInsensitive, + ) + if err != nil { + return nil, fmt.Errorf("get tool comparer: %w", err) + } + toolComparers = append(toolComparers, toolComparer) + } + return toolComparers, nil +} + +// getToolComparer pairs a tool use with its response and precomputes ordering hints. 
+func getToolComparer(toolUse *genai.FunctionCall, toolResponse *genai.FunctionResponse, + orderInsensitive bool) (*toolComparer, error) { + if toolUse == nil || toolResponse == nil { + return nil, errors.New("tool use or tool response is nil") + } + tool := &toolComparer{ + name: toolUse.Name, + args: toolUse.Args, + response: toolResponse.Response, + } + if orderInsensitive { + args, err := json.Marshal(toolUse.Args) + if err != nil { + return nil, fmt.Errorf("marshal arguments: %w", err) + } + response, err := json.Marshal(toolResponse.Response) + if err != nil { + return nil, fmt.Errorf("marshal response: %w", err) + } + tool.argsOrder = string(args) + tool.responseOrder = string(response) + } + return tool, nil +} + +// getStrategy picks the comparison strategy for a specific tool pair. +func getStrategy(t *ToolTrajectoryCriterion, actualTool, + expectedTool *toolComparer) *ToolTrajectoryStrategy { + if t.ToolStrategy != nil { + strategy, ok := t.ToolStrategy[actualTool.name] + if ok { + return strategy + } + strategy, ok = t.ToolStrategy[expectedTool.name] + if ok { + return strategy + } + } + if t.DefaultStrategy != nil { + return t.DefaultStrategy + } + return defaultToolTrajectoryStrategy } diff --git a/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go b/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go index 87742a492..02c4d8ecf 100644 --- a/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go +++ b/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go @@ -14,7 +14,9 @@ import ( "testing" "github.com/stretchr/testify/assert" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" + "google.golang.org/genai" + "trpc.group/trpc-go/trpc-agent-go/evaluation/evalset" + criterionjson "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" ) @@ -26,8 +28,8 @@ func TestToolTrajectoryCriterionJSONRoundTrip(t 
*testing.T) { CaseInsensitive: true, MatchStrategy: text.TextMatchStrategyExact, }, - Arguments: &maptext.MapTextCriterion{}, - Response: &maptext.MapTextCriterion{}, + Arguments: &criterionjson.JSONCriterion{}, + Response: &criterionjson.JSONCriterion{}, }, ToolStrategy: map[string]*ToolTrajectoryStrategy{ "foo": { @@ -81,19 +83,15 @@ func TestToolTrajectoryStrategyJSONRoundTrip(t *testing.T) { CaseInsensitive: true, MatchStrategy: text.TextMatchStrategyExact, }, - Arguments: &maptext.MapTextCriterion{ - TextCriterion: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyRegex}, - }, - Response: &maptext.MapTextCriterion{ - TextCriterion: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyContains}, - }, + Arguments: &criterionjson.JSONCriterion{}, + Response: &criterionjson.JSONCriterion{}, } data, err := json.Marshal(strategy) assert.NoError(t, err) assert.JSONEq(t, `{ "name":{"ignore":true,"caseInsensitive":true,"matchStrategy":"exact"}, - "arguments":{"textCriterion":{"matchStrategy":"regex"}}, - "response":{"textCriterion":{"matchStrategy":"contains"}} + "arguments":{}, + "response":{} }`, string(data)) var decoded ToolTrajectoryStrategy @@ -104,6 +102,549 @@ func TestToolTrajectoryStrategyJSONRoundTrip(t *testing.T) { assert.True(t, decoded.Name.CaseInsensitive) assert.NotNil(t, decoded.Arguments) assert.NotNil(t, decoded.Response) - assert.Equal(t, text.TextMatchStrategyRegex, decoded.Arguments.TextCriterion.MatchStrategy) - assert.Equal(t, text.TextMatchStrategyContains, decoded.Response.TextCriterion.MatchStrategy) +} + +func TestToolTrajectoryCriterionMatchOrderInsensitive(t *testing.T) { + actual := makeInvocation( + []toolData{ + {id: "call-1", name: "shared", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, + {id: "call-2", name: "shared", args: map[string]any{"a": 2}, response: map[string]any{"r": 3}}, + }, + ) + expected := makeInvocation( + []toolData{ + {id: "call-2", name: "shared", args: map[string]any{"a": 2}, 
response: map[string]any{"r": 3}}, + {id: "call-1", name: "shared", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, + }, + ) + + criterion := &ToolTrajectoryCriterion{ + OrderInsensitive: true, + } + ok, err := criterion.Match(actual, expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionMissingResponse(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{}, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool"}, + }, + }, + } + criterion := &ToolTrajectoryCriterion{} + ok, err := criterion.Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionCustomStrategy(t *testing.T) { + actual := makeInvocation( + []toolData{ + {id: "call-1", name: "custom", args: map[string]any{"k": "v"}, response: map[string]any{"r": "x"}}, + }, + ) + expected := makeInvocation( + []toolData{ + {id: "call-1", name: "custom", args: map[string]any{"k": "v"}, response: map[string]any{"r": "x"}}, + }, + ) + customStrategy := &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, + } + criterion := &ToolTrajectoryCriterion{ + ToolStrategy: map[string]*ToolTrajectoryStrategy{ + "custom": customStrategy, + }, + } + ok, err := criterion.Match(actual, expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +type toolData struct { + id string + name string + args map[string]any + response map[string]any +} + +func makeInvocation(tools []toolData) *evalset.Invocation { + toolUses := make([]*genai.FunctionCall, 0, len(tools)) + toolResponses := make([]*genai.FunctionResponse, 0, len(tools)) + for _, t := range 
tools { + toolUses = append(toolUses, &genai.FunctionCall{ + ID: t.id, + Name: t.name, + Args: t.args, + }) + toolResponses = append(toolResponses, &genai.FunctionResponse{ + ID: t.id, + Name: t.name, + Response: t.response, + }) + } + return &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: toolUses, + ToolResponses: toolResponses, + }, + } +} + +func TestToolTrajectoryCriterionIDMismatch(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "use-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "resp-1", Name: "tool"}, + }, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "use-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "use-1", Name: "tool"}, + }, + }, + } + criterion := New() + ok, err := criterion.Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionNilInvocation(t *testing.T) { + criterion := New() + ok, err := criterion.Match(nil, makeInvocation(nil)) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionNilIntermediate(t *testing.T) { + criterion := New() + ok, err := criterion.Match( + &evalset.Invocation{}, + &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}}, + ) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionEmptyToolUseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "resp-1", Name: "tool"}, + }, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "resp-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: 
"resp-1", Name: "tool"}, + }, + }, + } + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionDuplicateResponseID(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + actual.IntermediateData.ToolResponses = append(actual.IntermediateData.ToolResponses, &genai.FunctionResponse{ + ID: "call-1", + Name: "tool", + Response: map[string]any{"r": 2}, + }) + ok, err := New().Match(actual, makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + })) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionCustomCompare(t *testing.T) { + var called bool + criterion := &ToolTrajectoryCriterion{ + Compare: func(actual, expected *evalset.Invocation) (bool, error) { + called = true + return true, nil + }, + } + ok, err := criterion.Match(&evalset.Invocation{}, &evalset.Invocation{}) + assert.True(t, ok) + assert.NoError(t, err) + assert.True(t, called) +} + +func TestToolTrajectoryCriterionExpectedResponseCountMismatch(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool", Args: map[string]any{"a": 1}}, + }, + ToolResponses: []*genai.FunctionResponse{}, + }, + } + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionToolUsesCountMismatch(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: 
map[string]any{"r": 1}}, + {id: "call-2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, + }) + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionZeroTools(t *testing.T) { + actual := &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}} + expected := &evalset.Invocation{IntermediateData: &evalset.IntermediateData{}} + ok, err := New().Match(actual, expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionExpectedInvalidID(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "", Name: "tool", Args: map[string]any{"a": 1}}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool", Response: map[string]any{"r": 1}}, + }, + }, + } + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionStrategyMismatch(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool-A", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool-B", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + strategy := &ToolTrajectoryStrategy{ + Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact}, + } + criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{"tool-A": strategy})) + ok, err := criterion.Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionDuplicateToolUseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "dup", Name: "tool"}, + 
{ID: "dup", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "dup", Name: "tool"}, + {ID: "dup2", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "dup", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + {id: "dup2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, + }) + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionDuplicateToolResponseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + {ID: "call-2", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool"}, + {ID: "call-1", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + {id: "call-2", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 2}}, + }) + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionMissingResponseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "other", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolComparerOrderInsensitiveMarshalError(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool", Args: map[string]any{"bad": make(chan int)}}, + }, + 
ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool", Response: map[string]any{"r": 1}}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{}, response: map[string]any{"r": 1}}, + }) + ok, err := New(WithOrderInsensitive(true)).Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolComparerOrderInsensitiveMarshalResponseError(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool", Args: map[string]any{"a": 1}}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool", Response: map[string]any{"bad": make(chan int)}}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + ok, err := New(WithOrderInsensitive(true)).Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolComparerLessThanBranches(t *testing.T) { + left := &toolComparer{name: "a", argsOrder: "1", responseOrder: "1"} + right := &toolComparer{name: "b", argsOrder: "0", responseOrder: "0"} + assert.True(t, left.lessThan(right)) + + left2 := &toolComparer{name: "a", argsOrder: "2", responseOrder: "1"} + right2 := &toolComparer{name: "a", argsOrder: "3", responseOrder: "0"} + assert.True(t, left2.lessThan(right2)) + + left3 := &toolComparer{name: "a", argsOrder: "1", responseOrder: "2"} + right3 := &toolComparer{name: "a", argsOrder: "1", responseOrder: "3"} + assert.True(t, left3.lessThan(right3)) +} + +func TestToolTrajectoryStrategyArgumentAndResponseMismatch(t *testing.T) { + strategy := &ToolTrajectoryStrategy{ + Arguments: &criterionjson.JSONCriterion{}, + Response: &criterionjson.JSONCriterion{}, + } + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) 
+ expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 2}, response: map[string]any{"r": 3}}, + }) + criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{ + "tool": strategy, + })) + ok, err := criterion.Match(actual, expected) + assert.True(t, ok) + assert.Error(t, err) +} + +func TestGetToolComparerNilInputs(t *testing.T) { + _, err := getToolComparer(nil, &genai.FunctionResponse{}, false) + assert.Error(t, err) + _, err = getToolComparer(&genai.FunctionCall{}, nil, false) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionMissingResponseSet(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "call-1", Name: "tool"}, + }, + }, + } + expected := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "other", Name: "tool"}, + }, + }, + } + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionFallbackDefault(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + criterion := &ToolTrajectoryCriterion{ + DefaultStrategy: nil, + ToolStrategy: nil, + } + ok, err := criterion.Match(actual, expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionFallbackDefaultStrategy(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + 
{id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + criterion := &ToolTrajectoryCriterion{ + DefaultStrategy: nil, + ToolStrategy: nil, + } + ok, err := criterion.Match(actual, expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +func TestToolTrajectoryCriterionEmptyToolResponseID(t *testing.T) { + actual := &evalset.Invocation{ + IntermediateData: &evalset.IntermediateData{ + ToolUses: []*genai.FunctionCall{ + {ID: "call-1", Name: "tool"}, + }, + ToolResponses: []*genai.FunctionResponse{ + {ID: "", Name: "tool"}, + }, + }, + } + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{}, response: map[string]any{}}, + }) + ok, err := New().Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolTrajectoryCriterionStrategyLookupByExpectedName(t *testing.T) { + actual := makeInvocation([]toolData{ + {id: "call-1", name: "unknown", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "custom", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + customStrategy := &ToolTrajectoryStrategy{} + criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{ + "custom": customStrategy, + })) + ok, err := criterion.Match(actual, expected) + assert.True(t, ok) + assert.NoError(t, err) +} + +func TestToolTrajectoryStrategyResponseMismatchOnly(t *testing.T) { + strategy := &ToolTrajectoryStrategy{ + Arguments: &criterionjson.JSONCriterion{}, + Response: &criterionjson.JSONCriterion{}, + } + actual := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 1}}, + }) + expected := makeInvocation([]toolData{ + {id: "call-1", name: "tool", args: map[string]any{"a": 1}, response: map[string]any{"r": 2}}, + }) + criterion := New(WithTool(map[string]*ToolTrajectoryStrategy{ + "tool": strategy, + })) + 
ok, err := criterion.Match(actual, expected) + assert.False(t, ok) + assert.Error(t, err) +} + +func TestToolComparerLessThanEqual(t *testing.T) { + left := &toolComparer{name: "same", argsOrder: "1", responseOrder: "1"} + right := &toolComparer{name: "same", argsOrder: "1", responseOrder: "1"} + assert.False(t, left.lessThan(right)) +} + +func TestInternalTextAndMapWrappers(t *testing.T) { + txt := &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact} + ok, err := txt.Match("same", "same") + assert.True(t, ok) + assert.NoError(t, err) + + crit := &criterionjson.JSONCriterion{} + ok, err = crit.Match(map[string]any{"a": 1}, map[string]any{"a": 1}) + assert.True(t, ok) + assert.NoError(t, err) } diff --git a/examples/evaluation/inmemory/main.go b/examples/evaluation/inmemory/main.go index 678a8ad29..86f6c1d05 100644 --- a/examples/evaluation/inmemory/main.go +++ b/examples/evaluation/inmemory/main.go @@ -150,6 +150,7 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { IntermediateData: &evalset.IntermediateData{ ToolUses: []*genai.FunctionCall{ { + ID: "tool_use_1", Name: "calculator", Args: map[string]interface{}{ "operation": "add", @@ -160,6 +161,7 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { }, ToolResponses: []*genai.FunctionResponse{ { + ID: "tool_use_1", Name: "calculator", Response: map[string]interface{}{ "result": 5.0, @@ -198,6 +200,7 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { IntermediateData: &evalset.IntermediateData{ ToolUses: []*genai.FunctionCall{ { + ID: "tool_use_2", Name: "calculator", Args: map[string]interface{}{ "operation": "multiply", @@ -208,6 +211,7 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { }, ToolResponses: []*genai.FunctionResponse{ { + ID: "tool_use_2", Name: "calculator", Response: map[string]interface{}{ "result": 5.0, diff --git 
a/examples/evaluation/inmemory/server.log.txt b/examples/evaluation/inmemory/server.log.txt new file mode 100644 index 000000000..d3bb5602c --- /dev/null +++ b/examples/evaluation/inmemory/server.log.txt @@ -0,0 +1,441 @@ +{ + "evalId": "calc_add", + "conversation": [ + { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } + } + ], + "sessionInput": { + "appName": "math-eval-app", + "userId": "user" + } +} +{ + "evalId": "calc_multiply", + "conversation": [ + { + "invocationId": "calc_multiply-1", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 42" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_2", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_2", + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } + } + ], + "sessionInput": { + "appName": "math-eval-app", + "userId": "user" + } +} +{ + "metricName": "tool_trajectory_avg_score", + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } +} +✅ Evaluation completed +App: math-eval-app +Eval Set: math-basic +Overall Status: failed +Runs: 1 +Case calc_add -> failed + Metric 
tool_trajectory_avg_score: score 0.00 (threshold 1.00) => failed + +Case calc_multiply -> failed + Metric tool_trajectory_avg_score: score 0.00 (threshold 1.00) => failed + +✅ Evaluation details: +{ + "evalSetResultId": "math-eval-app_math-basic_eca4b66c-014a-42fc-90ba-b313637dcb9b", + "evalSetResultName": "math-eval-app_math-basic_eca4b66c-014a-42fc-90ba-b313637dcb9b", + "evalSetId": "math-basic", + "evalCaseResults": [ + { + "evalSetId": "math-basic", + "evalId": "calc_add", + "finalEvalStatus": 2, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "73836fae-8d1f-4992-97f6-ff098ddfde88", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "The result of 2 + 3 is **5**." 
+ } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_B9PxU7IhUhm9sBJFVYLJRoqq", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_B9PxU7IhUhm9sBJFVYLJRoqq", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "result": 5 + } + } + ] + }, + "creationTimestamp": 1763994843.1042104 + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ] + } + ], + "sessionId": "4e881454-4927-445d-ae6b-b958f874de97", + "userId": "user" + }, + { + "evalSetId": "math-basic", + "evalId": "calc_multiply", + "finalEvalStatus": 2, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + 
"invocationId": "9e5a66aa-96c0-44be-98f3-1eb682777349", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "6 multiplied by 7 equals 42." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_Yve8PrpOJrt7PpXR0LTi3Jbm", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_Yve8PrpOJrt7PpXR0LTi3Jbm", + "name": "calculator", + "response": { + "a": 6, + "b": 7, + "operation": "multiply", + "result": 42 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_multiply-1", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 42" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_2", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_2", + "name": "calculator", + "response": { + "result": 5 + } + } + ] + }, + "creationTimestamp": 1763994843.1043565 + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ] + } + ], + "sessionId": "da8a7256-2b1a-4cc6-a695-3e949df0664e", + "userId": "user" + } + ], + "creationTimestamp": 1763994850.5015793 +} diff --git a/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json b/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json index 2370ec0e4..09f4fb08d 100644 --- 
a/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json +++ b/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json @@ -3,101 +3,105 @@ "name": "math-basic", "evalCases": [ { - "evalId": "calc_add", - "conversation": [ - { - "invocationId": "calc_add-1", - "userContent": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 5" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "name": "calculator", - "response": { - "result": 5 + "evalId": "calc_add", + "conversation": [ + { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" } - } - ] + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } } + ], + "sessionInput": { + "appName": "math-eval-app", + "userId": "user" } - ], - "sessionInput": { - "appName": "math-eval-app", - "userId": "user" - } - }, - { - "evalId": "calc_multiply", - "conversation": [ - { - "invocationId": "calc_multiply-1", - "userContent": { - "parts": [ - { - "text": "calc multiply 6 7" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 42" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "args": { - "a": 6, - "b": 7, - "operation": "multiply" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "name": "calculator", - "response": { - "result": 5 + }, + { + "evalId": "calc_multiply", + "conversation": [ + { + "invocationId": 
"calc_multiply-1", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 42" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_2", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_2", + "name": "calculator", + "response": { + "result": 42 + } } - } - ] + ] + } } + ], + "sessionInput": { + "appName": "math-eval-app", + "userId": "user" } - ], - "sessionInput": { - "appName": "math-eval-app", - "userId": "user" } - } ], "creationTimestamp": 1761134484.9804401 -} \ No newline at end of file +} diff --git a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8.evalset_result.json new file mode 100644 index 000000000..c4f9d3408 --- /dev/null +++ b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8.evalset_result.json @@ -0,0 +1,304 @@ +{ + "evalSetResultId": "math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8", + "evalSetResultName": "math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8", + "evalSetId": "math-basic", + "evalCaseResults": [ + { + "evalSetId": "math-basic", + "evalId": "calc_add", + "finalEvalStatus": 2, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + 
"invocationId": "d1e270be-49cc-4e5c-ab2c-5f9ce3b8dd50", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "The result of 2 + 3 is **5**." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_Lc9bIxjl5clAR3uYhgskn9CL", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_Lc9bIxjl5clAR3uYhgskn9CL", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ] + } + ], + "sessionId": "9f022a19-534f-4b18-ad32-407432ebcc32", + "userId": "user" + }, + { + "evalSetId": "math-basic", + "evalId": "calc_multiply", + "finalEvalStatus": 2, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": 
"exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "04b38039-9d2e-4671-8799-7462836b0b32", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "6 multiplied by 7 equals 42." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_zrI87m7cYyk15EE3awVgOyNF", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_zrI87m7cYyk15EE3awVgOyNF", + "name": "calculator", + "response": { + "a": 6, + "b": 7, + "operation": "multiply", + "result": 42 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_multiply-1", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 42" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_2", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_2", + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ] + } + ], + "sessionId": "d0e25241-c79e-4b64-ba0c-6c4115124224", + "userId": "user" + } + ], + "creationTimestamp": 1763995039.3679564 +} diff --git 
a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d.evalset_result.json new file mode 100644 index 000000000..a9ce3e8dd --- /dev/null +++ b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d.evalset_result.json @@ -0,0 +1,304 @@ +{ + "evalSetResultId": "math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d", + "evalSetResultName": "math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d", + "evalSetId": "math-basic", + "evalCaseResults": [ + { + "evalSetId": "math-basic", + "evalId": "calc_add", + "finalEvalStatus": 2, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "585658a1-c131-4dcf-beb4-0245dc6e1941", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "The result of 2 + 3 is **5**." 
+ } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_EHcJm9ANKdFk42iLG5yXqquP", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_EHcJm9ANKdFk42iLG5yXqquP", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ] + } + ], + "sessionId": "fad87330-e5f2-487a-95ac-3a5d458790bf", + "userId": "user" + }, + { + "evalSetId": "math-basic", + "evalId": "calc_multiply", + "finalEvalStatus": 2, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": 
"6c66b8cc-a8e0-4f52-b40a-9cf97df72dc7", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "6 multiplied by 7 equals 42." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_8PwymQ0Vgw7hhmqgpy4jJ2Pl", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_8PwymQ0Vgw7hhmqgpy4jJ2Pl", + "name": "calculator", + "response": { + "a": 6, + "b": 7, + "operation": "multiply", + "result": 42 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_multiply-1", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 42" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_2", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_2", + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ] + } + ], + "sessionId": "a8434547-2943-4779-9199-221005869965", + "userId": "user" + } + ], + "creationTimestamp": 1763994917.7690995 +} diff --git a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json index 84f2b50e7..6798bc577 
100644 --- a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json +++ b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json @@ -113,7 +113,7 @@ { "name": "calculator", "response": { - "result": 5 + "result": 42 } } ] @@ -261,7 +261,7 @@ { "name": "calculator", "response": { - "result": 5 + "result": 42 } } ] @@ -301,4 +301,4 @@ } ], "creationTimestamp": 1763960812.6226852 -} \ No newline at end of file +} diff --git a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899.evalset_result.json new file mode 100644 index 000000000..5425b4d62 --- /dev/null +++ b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899.evalset_result.json @@ -0,0 +1,304 @@ +{ + "evalSetResultId": "math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899", + "evalSetResultName": "math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899", + "evalSetId": "math-basic", + "evalCaseResults": [ + { + "evalSetId": "math-basic", + "evalId": "calc_add", + "finalEvalStatus": 2, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "a85ab257-96ba-4650-920d-193363936d1c", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + 
"parts": [ + { + "text": "The result of 2 + 3 is **5**." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_czP9rKlVRjw9BY5uLhjcnDYX", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_czP9rKlVRjw9BY5uLhjcnDYX", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ] + } + ], + "sessionId": "78a21442-3299-4608-b40e-d6db402c5afc", + "userId": "user" + }, + { + "evalSetId": "math-basic", + "evalId": "calc_multiply", + "finalEvalStatus": 2, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + 
"actualInvocation": { + "invocationId": "29574cb6-9f23-4342-b6e6-8d2207d4d7eb", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "6 multiplied by 7 equals 42." + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_1z0xkVXrnI83JceWH9ScC9PA", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_1z0xkVXrnI83JceWH9ScC9PA", + "name": "calculator", + "response": { + "a": 6, + "b": 7, + "operation": "multiply", + "result": 42 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_multiply-1", + "userContent": { + "parts": [ + { + "text": "calc multiply 6 7" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 42" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_2", + "args": { + "a": 6, + "b": 7, + "operation": "multiply" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_2", + "name": "calculator", + "response": { + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "evalStatus": 2, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "textCriterion": { + "matchStrategy": "exact" + } + }, + "response": { + "textCriterion": { + "matchStrategy": "contains" + } + } + } + } + } + } + ] + } + ], + "sessionId": "15df54b1-8374-432e-a695-cc7199196a6b", + "userId": "user" + } + ], + "creationTimestamp": 1763995079.6665025 +} From 57b4fd91b0652b060db627f49ddd7802159a5159 Mon Sep 17 00:00:00 2001 From: hackerli Date: Mon, 24 Nov 2025 23:33:40 +0800 Subject: [PATCH 08/14] examples --- examples/evaluation/inmemory/main.go | 28 +- examples/evaluation/inmemory/server.log.txt | 441 
------------------ .../math-eval-app/math-basic.evalset.json | 6 + .../math-eval-app/math-basic.metrics.json | 8 +- ...4439-89d9-84fb9ffc21f8.evalset_result.json | 304 ------------ ...40cd-b3f8-ed8c6c15564d.evalset_result.json | 304 ------------ ...44d3-babe-035491901899.evalset_result.json | 304 ------------ ...dcf-816e-3474e85b3494.evalset_result.json} | 66 ++- 8 files changed, 51 insertions(+), 1410 deletions(-) delete mode 100644 examples/evaluation/inmemory/server.log.txt delete mode 100644 examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8.evalset_result.json delete mode 100644 examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d.evalset_result.json delete mode 100644 examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899.evalset_result.json rename examples/evaluation/local/output/math-eval-app/{math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json => math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494.evalset_result.json} (80%) diff --git a/examples/evaluation/inmemory/main.go b/examples/evaluation/inmemory/main.go index 86f6c1d05..5655bd876 100644 --- a/examples/evaluation/inmemory/main.go +++ b/examples/evaluation/inmemory/main.go @@ -16,7 +16,7 @@ import ( "trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/registry" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" - "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/maptext" + cjson "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" ctooltrajectory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" metricinmemory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/inmemory" @@ -164,7 +164,10 @@ func 
prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { ID: "tool_use_1", Name: "calculator", Response: map[string]interface{}{ - "result": 5.0, + "a": 2.0, + "b": 3.0, + "operation": "add", + "result": 5.0, }, }, }, @@ -214,7 +217,10 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { ID: "tool_use_2", Name: "calculator", Response: map[string]interface{}{ - "result": 5.0, + "a": 6.0, + "b": 7.0, + "operation": "multiply", + "result": 42.0, }, }, }, @@ -228,8 +234,6 @@ func prepareEvalSet(ctx context.Context, evalSetManager evalset.Manager) error { }, } for _, evalCase := range cases { - data, _ := json.MarshalIndent(evalCase, "", " ") - fmt.Println(string(data)) if err := evalSetManager.AddCase(ctx, appName, evalSetID, evalCase); err != nil { return err } @@ -249,15 +253,11 @@ func prepareMetric(ctx context.Context, metricManager metric.Manager) error { Name: &text.TextCriterion{ MatchStrategy: text.TextMatchStrategyExact, }, - Arguments: &maptext.MapTextCriterion{ - TextCriterion: &text.TextCriterion{ - MatchStrategy: text.TextMatchStrategyExact, - }, + Arguments: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, }, - Response: &maptext.MapTextCriterion{ - TextCriterion: &text.TextCriterion{ - MatchStrategy: text.TextMatchStrategyContains, - }, + Response: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, }, }, ), @@ -265,7 +265,5 @@ func prepareMetric(ctx context.Context, metricManager metric.Manager) error { ), ), } - data, _ := json.MarshalIndent(evalMetric, "", " ") - fmt.Println(string(data)) return metricManager.Add(ctx, appName, evalSetID, evalMetric) } diff --git a/examples/evaluation/inmemory/server.log.txt b/examples/evaluation/inmemory/server.log.txt deleted file mode 100644 index d3bb5602c..000000000 --- a/examples/evaluation/inmemory/server.log.txt +++ /dev/null @@ -1,441 +0,0 @@ -{ - "evalId": "calc_add", - "conversation": [ - { - "invocationId": "calc_add-1", 
- "userContent": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 5" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "tool_use_1", - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "tool_use_1", - "name": "calculator", - "response": { - "result": 5 - } - } - ] - } - } - ], - "sessionInput": { - "appName": "math-eval-app", - "userId": "user" - } -} -{ - "evalId": "calc_multiply", - "conversation": [ - { - "invocationId": "calc_multiply-1", - "userContent": { - "parts": [ - { - "text": "calc multiply 6 7" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 42" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "tool_use_2", - "args": { - "a": 6, - "b": 7, - "operation": "multiply" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "tool_use_2", - "name": "calculator", - "response": { - "result": 5 - } - } - ] - } - } - ], - "sessionInput": { - "appName": "math-eval-app", - "userId": "user" - } -} -{ - "metricName": "tool_trajectory_avg_score", - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } -} -✅ Evaluation completed -App: math-eval-app -Eval Set: math-basic -Overall Status: failed -Runs: 1 -Case calc_add -> failed - Metric tool_trajectory_avg_score: score 0.00 (threshold 1.00) => failed - -Case calc_multiply -> failed - Metric tool_trajectory_avg_score: score 0.00 (threshold 1.00) => failed - -✅ Evaluation details: -{ - "evalSetResultId": "math-eval-app_math-basic_eca4b66c-014a-42fc-90ba-b313637dcb9b", - "evalSetResultName": 
"math-eval-app_math-basic_eca4b66c-014a-42fc-90ba-b313637dcb9b", - "evalSetId": "math-basic", - "evalCaseResults": [ - { - "evalSetId": "math-basic", - "evalId": "calc_add", - "finalEvalStatus": 2, - "overallEvalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ], - "evalMetricResultPerInvocation": [ - { - "actualInvocation": { - "invocationId": "73836fae-8d1f-4992-97f6-ff098ddfde88", - "userContent": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "The result of 2 + 3 is **5**." - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "call_00_B9PxU7IhUhm9sBJFVYLJRoqq", - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "call_00_B9PxU7IhUhm9sBJFVYLJRoqq", - "name": "calculator", - "response": { - "a": 2, - "b": 3, - "operation": "add", - "result": 5 - } - } - ] - } - }, - "expectedInvocation": { - "invocationId": "calc_add-1", - "userContent": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 5" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "tool_use_1", - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "tool_use_1", - "name": "calculator", - "response": { - "result": 5 - } - } - ] - }, - "creationTimestamp": 1763994843.1042104 - }, - "evalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - 
"toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ] - } - ], - "sessionId": "4e881454-4927-445d-ae6b-b958f874de97", - "userId": "user" - }, - { - "evalSetId": "math-basic", - "evalId": "calc_multiply", - "finalEvalStatus": 2, - "overallEvalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ], - "evalMetricResultPerInvocation": [ - { - "actualInvocation": { - "invocationId": "9e5a66aa-96c0-44be-98f3-1eb682777349", - "userContent": { - "parts": [ - { - "text": "calc multiply 6 7" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "6 multiplied by 7 equals 42." 
- } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "call_00_Yve8PrpOJrt7PpXR0LTi3Jbm", - "args": { - "a": 6, - "b": 7, - "operation": "multiply" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "call_00_Yve8PrpOJrt7PpXR0LTi3Jbm", - "name": "calculator", - "response": { - "a": 6, - "b": 7, - "operation": "multiply", - "result": 42 - } - } - ] - } - }, - "expectedInvocation": { - "invocationId": "calc_multiply-1", - "userContent": { - "parts": [ - { - "text": "calc multiply 6 7" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 42" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "tool_use_2", - "args": { - "a": 6, - "b": 7, - "operation": "multiply" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "tool_use_2", - "name": "calculator", - "response": { - "result": 5 - } - } - ] - }, - "creationTimestamp": 1763994843.1043565 - }, - "evalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ] - } - ], - "sessionId": "da8a7256-2b1a-4cc6-a695-3e949df0664e", - "userId": "user" - } - ], - "creationTimestamp": 1763994850.5015793 -} diff --git a/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json b/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json index 09f4fb08d..c932d867f 100644 --- a/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json +++ b/examples/evaluation/local/data/math-eval-app/math-basic.evalset.json @@ -40,6 +40,9 @@ "id": "tool_use_1", "name": "calculator", "response": { + "a": 2, + "b": 3, + "operation": "add", "result": 5 } } @@ -90,6 
+93,9 @@ "id": "tool_use_2", "name": "calculator", "response": { + "a": 6, + "b": 7, + "operation": "multiply", "result": 42 } } diff --git a/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json b/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json index 5f50c1e16..7d2822b83 100644 --- a/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json +++ b/examples/evaluation/local/data/math-eval-app/math-basic.metrics.json @@ -9,14 +9,10 @@ "matchStrategy": "exact" }, "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } + "matchStrategy": "exact" }, "response": { - "textCriterion": { - "matchStrategy": "contains" - } + "matchStrategy": "exact" } } } diff --git a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8.evalset_result.json deleted file mode 100644 index c4f9d3408..000000000 --- a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8.evalset_result.json +++ /dev/null @@ -1,304 +0,0 @@ -{ - "evalSetResultId": "math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8", - "evalSetResultName": "math-eval-app_math-basic_22fca4c5-3e6c-4439-89d9-84fb9ffc21f8", - "evalSetId": "math-basic", - "evalCaseResults": [ - { - "evalSetId": "math-basic", - "evalId": "calc_add", - "finalEvalStatus": 2, - "overallEvalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ], - "evalMetricResultPerInvocation": [ - { - "actualInvocation": { - "invocationId": 
"d1e270be-49cc-4e5c-ab2c-5f9ce3b8dd50", - "userContent": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "The result of 2 + 3 is **5**." - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "call_00_Lc9bIxjl5clAR3uYhgskn9CL", - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "call_00_Lc9bIxjl5clAR3uYhgskn9CL", - "name": "calculator", - "response": { - "a": 2, - "b": 3, - "operation": "add", - "result": 5 - } - } - ] - } - }, - "expectedInvocation": { - "invocationId": "calc_add-1", - "userContent": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 5" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "tool_use_1", - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "tool_use_1", - "name": "calculator", - "response": { - "result": 5 - } - } - ] - } - }, - "evalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ] - } - ], - "sessionId": "9f022a19-534f-4b18-ad32-407432ebcc32", - "userId": "user" - }, - { - "evalSetId": "math-basic", - "evalId": "calc_multiply", - "finalEvalStatus": 2, - "overallEvalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, 
- "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ], - "evalMetricResultPerInvocation": [ - { - "actualInvocation": { - "invocationId": "04b38039-9d2e-4671-8799-7462836b0b32", - "userContent": { - "parts": [ - { - "text": "calc multiply 6 7" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "6 multiplied by 7 equals 42." - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "call_00_zrI87m7cYyk15EE3awVgOyNF", - "args": { - "a": 6, - "b": 7, - "operation": "multiply" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "call_00_zrI87m7cYyk15EE3awVgOyNF", - "name": "calculator", - "response": { - "a": 6, - "b": 7, - "operation": "multiply", - "result": 42 - } - } - ] - } - }, - "expectedInvocation": { - "invocationId": "calc_multiply-1", - "userContent": { - "parts": [ - { - "text": "calc multiply 6 7" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 42" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "tool_use_2", - "args": { - "a": 6, - "b": 7, - "operation": "multiply" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "tool_use_2", - "name": "calculator", - "response": { - "result": 5 - } - } - ] - } - }, - "evalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ] - } - ], - "sessionId": "d0e25241-c79e-4b64-ba0c-6c4115124224", - "userId": "user" - } - ], - "creationTimestamp": 1763995039.3679564 -} diff --git 
a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d.evalset_result.json deleted file mode 100644 index a9ce3e8dd..000000000 --- a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d.evalset_result.json +++ /dev/null @@ -1,304 +0,0 @@ -{ - "evalSetResultId": "math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d", - "evalSetResultName": "math-eval-app_math-basic_67e39166-a96d-40cd-b3f8-ed8c6c15564d", - "evalSetId": "math-basic", - "evalCaseResults": [ - { - "evalSetId": "math-basic", - "evalId": "calc_add", - "finalEvalStatus": 2, - "overallEvalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ], - "evalMetricResultPerInvocation": [ - { - "actualInvocation": { - "invocationId": "585658a1-c131-4dcf-beb4-0245dc6e1941", - "userContent": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "The result of 2 + 3 is **5**." 
- } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "call_00_EHcJm9ANKdFk42iLG5yXqquP", - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "call_00_EHcJm9ANKdFk42iLG5yXqquP", - "name": "calculator", - "response": { - "a": 2, - "b": 3, - "operation": "add", - "result": 5 - } - } - ] - } - }, - "expectedInvocation": { - "invocationId": "calc_add-1", - "userContent": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 5" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "tool_use_1", - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "tool_use_1", - "name": "calculator", - "response": { - "result": 5 - } - } - ] - } - }, - "evalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ] - } - ], - "sessionId": "fad87330-e5f2-487a-95ac-3a5d458790bf", - "userId": "user" - }, - { - "evalSetId": "math-basic", - "evalId": "calc_multiply", - "finalEvalStatus": 2, - "overallEvalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ], - "evalMetricResultPerInvocation": [ - { - "actualInvocation": { - "invocationId": 
"6c66b8cc-a8e0-4f52-b40a-9cf97df72dc7", - "userContent": { - "parts": [ - { - "text": "calc multiply 6 7" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "6 multiplied by 7 equals 42." - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "call_00_8PwymQ0Vgw7hhmqgpy4jJ2Pl", - "args": { - "a": 6, - "b": 7, - "operation": "multiply" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "call_00_8PwymQ0Vgw7hhmqgpy4jJ2Pl", - "name": "calculator", - "response": { - "a": 6, - "b": 7, - "operation": "multiply", - "result": 42 - } - } - ] - } - }, - "expectedInvocation": { - "invocationId": "calc_multiply-1", - "userContent": { - "parts": [ - { - "text": "calc multiply 6 7" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 42" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "tool_use_2", - "args": { - "a": 6, - "b": 7, - "operation": "multiply" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "tool_use_2", - "name": "calculator", - "response": { - "result": 5 - } - } - ] - } - }, - "evalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ] - } - ], - "sessionId": "a8434547-2943-4779-9199-221005869965", - "userId": "user" - } - ], - "creationTimestamp": 1763994917.7690995 -} diff --git a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899.evalset_result.json deleted file mode 100644 index 
5425b4d62..000000000 --- a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899.evalset_result.json +++ /dev/null @@ -1,304 +0,0 @@ -{ - "evalSetResultId": "math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899", - "evalSetResultName": "math-eval-app_math-basic_cd3fd79f-898b-44d3-babe-035491901899", - "evalSetId": "math-basic", - "evalCaseResults": [ - { - "evalSetId": "math-basic", - "evalId": "calc_add", - "finalEvalStatus": 2, - "overallEvalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ], - "evalMetricResultPerInvocation": [ - { - "actualInvocation": { - "invocationId": "a85ab257-96ba-4650-920d-193363936d1c", - "userContent": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "The result of 2 + 3 is **5**." 
- } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "call_00_czP9rKlVRjw9BY5uLhjcnDYX", - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "call_00_czP9rKlVRjw9BY5uLhjcnDYX", - "name": "calculator", - "response": { - "a": 2, - "b": 3, - "operation": "add", - "result": 5 - } - } - ] - } - }, - "expectedInvocation": { - "invocationId": "calc_add-1", - "userContent": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 5" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "tool_use_1", - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "tool_use_1", - "name": "calculator", - "response": { - "result": 5 - } - } - ] - } - }, - "evalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ] - } - ], - "sessionId": "78a21442-3299-4608-b40e-d6db402c5afc", - "userId": "user" - }, - { - "evalSetId": "math-basic", - "evalId": "calc_multiply", - "finalEvalStatus": 2, - "overallEvalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ], - "evalMetricResultPerInvocation": [ - { - "actualInvocation": { - "invocationId": 
"29574cb6-9f23-4342-b6e6-8d2207d4d7eb", - "userContent": { - "parts": [ - { - "text": "calc multiply 6 7" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "6 multiplied by 7 equals 42." - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "call_00_1z0xkVXrnI83JceWH9ScC9PA", - "args": { - "a": 6, - "b": 7, - "operation": "multiply" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "call_00_1z0xkVXrnI83JceWH9ScC9PA", - "name": "calculator", - "response": { - "a": 6, - "b": 7, - "operation": "multiply", - "result": 42 - } - } - ] - } - }, - "expectedInvocation": { - "invocationId": "calc_multiply-1", - "userContent": { - "parts": [ - { - "text": "calc multiply 6 7" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 42" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "id": "tool_use_2", - "args": { - "a": 6, - "b": 7, - "operation": "multiply" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "id": "tool_use_2", - "name": "calculator", - "response": { - "result": 5 - } - } - ] - } - }, - "evalMetricResults": [ - { - "metricName": "tool_trajectory_avg_score", - "evalStatus": 2, - "threshold": 1, - "criterion": { - "toolTrajectory": { - "defaultStrategy": { - "name": { - "matchStrategy": "exact" - }, - "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } - }, - "response": { - "textCriterion": { - "matchStrategy": "contains" - } - } - } - } - } - } - ] - } - ], - "sessionId": "15df54b1-8374-432e-a695-cc7199196a6b", - "userId": "user" - } - ], - "creationTimestamp": 1763995079.6665025 -} diff --git a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494.evalset_result.json similarity index 80% rename 
from examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json rename to examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494.evalset_result.json index 6798bc577..09a53b9ff 100644 --- a/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973.evalset_result.json +++ b/examples/evaluation/local/output/math-eval-app/math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494.evalset_result.json @@ -1,6 +1,6 @@ { - "evalSetResultId": "math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973", - "evalSetResultName": "math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973", + "evalSetResultId": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", + "evalSetResultName": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", "evalSetId": "math-basic", "evalCaseResults": [ { @@ -20,14 +20,10 @@ "matchStrategy": "exact" }, "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } + "matchStrategy": "exact" }, "response": { - "textCriterion": { - "matchStrategy": "contains" - } + "matchStrategy": "exact" } } } @@ -37,7 +33,7 @@ "evalMetricResultPerInvocation": [ { "actualInvocation": { - "invocationId": "49ff84cf-ad89-42ab-be07-1fffc4dc78f2", + "invocationId": "53845847-16e0-4960-9d00-d3abf0ab1807", "userContent": { "parts": [ { @@ -57,7 +53,7 @@ "intermediateData": { "toolUses": [ { - "id": "call_00_pCwL67NPbNQAJEvZjvxuthX6", + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", "args": { "a": 2, "b": 3, @@ -68,7 +64,7 @@ ], "toolResponses": [ { - "id": "call_00_pCwL67NPbNQAJEvZjvxuthX6", + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", "name": "calculator", "response": { "a": 2, @@ -101,6 +97,7 @@ "intermediateData": { "toolUses": [ { + "id": "tool_use_1", "args": { "a": 2, "b": 3, @@ -111,9 +108,13 @@ ], "toolResponses": [ { + "id": "tool_use_1", "name": 
"calculator", "response": { - "result": 42 + "a": 2, + "b": 3, + "operation": "add", + "result": 5 } } ] @@ -132,14 +133,10 @@ "matchStrategy": "exact" }, "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } + "matchStrategy": "exact" }, "response": { - "textCriterion": { - "matchStrategy": "contains" - } + "matchStrategy": "exact" } } } @@ -148,7 +145,7 @@ ] } ], - "sessionId": "007a49f9-5a2c-49ba-a6ae-b0657d50aafb", + "sessionId": "e9cc851f-8c89-45f4-b430-7c54991c7dda", "userId": "user" }, { @@ -168,14 +165,10 @@ "matchStrategy": "exact" }, "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } + "matchStrategy": "exact" }, "response": { - "textCriterion": { - "matchStrategy": "contains" - } + "matchStrategy": "exact" } } } @@ -185,7 +178,7 @@ "evalMetricResultPerInvocation": [ { "actualInvocation": { - "invocationId": "353274d3-694d-4de4-8dd8-e2cdde2ad5f5", + "invocationId": "bfc1f3ef-7b7c-4d36-ac50-6fa3a9991abb", "userContent": { "parts": [ { @@ -205,7 +198,7 @@ "intermediateData": { "toolUses": [ { - "id": "call_00_sgCNfRj0X4wDh6PqfuUUu5NC", + "id": "call_00_xcGeTAsvZJxhKMA8oT478nMP", "args": { "a": 6, "b": 7, @@ -216,7 +209,7 @@ ], "toolResponses": [ { - "id": "call_00_sgCNfRj0X4wDh6PqfuUUu5NC", + "id": "call_00_xcGeTAsvZJxhKMA8oT478nMP", "name": "calculator", "response": { "a": 6, @@ -249,6 +242,7 @@ "intermediateData": { "toolUses": [ { + "id": "tool_use_2", "args": { "a": 6, "b": 7, @@ -259,8 +253,12 @@ ], "toolResponses": [ { + "id": "tool_use_2", "name": "calculator", "response": { + "a": 6, + "b": 7, + "operation": "multiply", "result": 42 } } @@ -280,14 +278,10 @@ "matchStrategy": "exact" }, "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } + "matchStrategy": "exact" }, "response": { - "textCriterion": { - "matchStrategy": "contains" - } + "matchStrategy": "exact" } } } @@ -296,9 +290,9 @@ ] } ], - "sessionId": "a47948ed-4bb0-4c2b-a1a0-05d101dfe3e1", + "sessionId": "bd844ee7-066c-43b0-adfa-34e1d8bffeb6", 
"userId": "user" } ], - "creationTimestamp": 1763960812.6226852 + "creationTimestamp": 1763997862.5581782 } From f06da4e8a855578b5334b133fc6af83b1767f657 Mon Sep 17 00:00:00 2001 From: hackerli Date: Tue, 25 Nov 2025 09:56:45 +0800 Subject: [PATCH 09/14] ignore --- evaluation/metric/criterion/json/json.go | 6 ++++-- evaluation/metric/criterion/text/text.go | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/evaluation/metric/criterion/json/json.go b/evaluation/metric/criterion/json/json.go index 2fbf56fde..f1e1c196e 100644 --- a/evaluation/metric/criterion/json/json.go +++ b/evaluation/metric/criterion/json/json.go @@ -17,6 +17,8 @@ import ( // JSONCriterion compares two JSON objects using exact matching. type JSONCriterion struct { + // Ignore skips comparison when true. + Ignore bool `json:"ignore,omitempty"` // MatchStrategy selects the comparison rule. MatchStrategy JSONMatchStrategy `json:"matchStrategy,omitempty"` // Compare overrides default comparison when provided. @@ -33,8 +35,8 @@ const ( // Match compares two JSON objects using custom logic or deep equality. func (j *JSONCriterion) Match(actual, expected map[string]any) (bool, error) { - if j == nil { - return false, fmt.Errorf("json criterion is nil") + if j.Ignore { + return true, nil } if j.Compare != nil { return j.Compare(actual, expected) diff --git a/evaluation/metric/criterion/text/text.go b/evaluation/metric/criterion/text/text.go index 031d0bae8..356bc361f 100644 --- a/evaluation/metric/criterion/text/text.go +++ b/evaluation/metric/criterion/text/text.go @@ -42,12 +42,12 @@ const ( // Match compares source and target using the configured strategy. 
func (t *TextCriterion) Match(source, target string) (bool, error) { - if t.Compare != nil { - return t.Compare(source, target) - } if t.Ignore { return true, nil } + if t.Compare != nil { + return t.Compare(source, target) + } if t.CaseInsensitive { source = strings.ToLower(source) target = strings.ToLower(target) From 4877edd8f73875e1b0b145933e952850116dfcc3 Mon Sep 17 00:00:00 2001 From: hackerli Date: Tue, 25 Nov 2025 11:19:45 +0800 Subject: [PATCH 10/14] docs --- docs/mkdocs/zh/evaluation.md | 345 +++++++++++++++++++++++++++-------- 1 file changed, 264 insertions(+), 81 deletions(-) diff --git a/docs/mkdocs/zh/evaluation.md b/docs/mkdocs/zh/evaluation.md index f4a435396..3d7ff982e 100644 --- a/docs/mkdocs/zh/evaluation.md +++ b/docs/mkdocs/zh/evaluation.md @@ -62,53 +62,58 @@ if err != nil { "name": "math-basic", "evalCases": [ { - "evalId": "calc_add", - "conversation": [ - { - "invocationId": "calc_add-1", - "userContent": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "finalResponse": { - "parts": [ - { - "text": "calc result: 5" - } - ], - "role": "assistant" - }, - "intermediateData": { - "toolUses": [ - { - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ], - "toolResponses": [ - { - "name": "calculator", - "response": { - "result": 5 + "evalId": "calc_add", + "conversation": [ + { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" } - } - ] + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } } + ], + "sessionInput": { + "appName": "math-eval-app", + "userId": 
"user" } - ], - "sessionInput": { - "appName": "math-eval-app", - "userId": "user" } - } ], "creationTimestamp": 1761134484.9804401 } @@ -128,14 +133,10 @@ if err != nil { "matchStrategy": "exact" }, "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } + "matchStrategy": "exact" }, "response": { - "textCriterion": { - "matchStrategy": "contains" - } + "matchStrategy": "exact" } } } @@ -148,8 +149,8 @@ if err != nil { ```json { - "evalSetResultId": "math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973", - "evalSetResultName": "math-eval-app_math-basic_c95c08af-c85c-43c4-ac7b-ee12870fa973", + "evalSetResultId": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", + "evalSetResultName": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", "evalSetId": "math-basic", "evalCaseResults": [ { @@ -169,14 +170,10 @@ if err != nil { "matchStrategy": "exact" }, "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } + "matchStrategy": "exact" }, "response": { - "textCriterion": { - "matchStrategy": "contains" - } + "matchStrategy": "exact" } } } @@ -186,7 +183,7 @@ if err != nil { "evalMetricResultPerInvocation": [ { "actualInvocation": { - "invocationId": "49ff84cf-ad89-42ab-be07-1fffc4dc78f2", + "invocationId": "53845847-16e0-4960-9d00-d3abf0ab1807", "userContent": { "parts": [ { @@ -206,7 +203,7 @@ if err != nil { "intermediateData": { "toolUses": [ { - "id": "call_00_pCwL67NPbNQAJEvZjvxuthX6", + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", "args": { "a": 2, "b": 3, @@ -217,7 +214,7 @@ if err != nil { ], "toolResponses": [ { - "id": "call_00_pCwL67NPbNQAJEvZjvxuthX6", + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", "name": "calculator", "response": { "a": 2, @@ -250,6 +247,7 @@ if err != nil { "intermediateData": { "toolUses": [ { + "id": "tool_use_1", "args": { "a": 2, "b": 3, @@ -260,8 +258,12 @@ if err != nil { ], "toolResponses": [ { + "id": "tool_use_1", "name": "calculator", "response": { + "a": 2, + "b": 3, + 
"operation": "add", "result": 5 } } @@ -281,14 +283,10 @@ if err != nil { "matchStrategy": "exact" }, "arguments": { - "textCriterion": { - "matchStrategy": "exact" - } + "matchStrategy": "exact" }, "response": { - "textCriterion": { - "matchStrategy": "contains" - } + "matchStrategy": "exact" } } } @@ -297,11 +295,11 @@ if err != nil { ] } ], - "sessionId": "007a49f9-5a2c-49ba-a6ae-b0657d50aafb", + "sessionId": "e9cc851f-8c89-45f4-b430-7c54991c7dda", "userId": "user" } ], - "creationTimestamp": 1763960812.6226852 + "creationTimestamp": 1763997862.5581782 } ``` @@ -397,6 +395,7 @@ cases := []*evalset.EvalCase{ IntermediateData: &evalset.IntermediateData{ ToolUses: []*genai.FunctionCall{ { + ID: "tool_use_1", Name: "calculator", Args: map[string]interface{}{ "operation": "add", @@ -407,9 +406,13 @@ cases := []*evalset.EvalCase{ }, ToolResponses: []*genai.FunctionResponse{ { + ID: "tool_use_1", Name: "calculator", Response: map[string]interface{}{ - "result": 5.0, + "a": 2.0, + "b": 3.0, + "operation": "add", + "result": 5.0, }, }, }, @@ -445,15 +448,11 @@ evalMetric := &metric.EvalMetric{ Name: &text.TextCriterion{ MatchStrategy: text.TextMatchStrategyExact, }, - Arguments: &maptext.MapTextCriterion{ - TextCriterion: &text.TextCriterion{ - MatchStrategy: text.TextMatchStrategyExact, - }, + Arguments: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, }, - Response: &maptext.MapTextCriterion{ - TextCriterion: &text.TextCriterion{ - MatchStrategy: text.TextMatchStrategyContains, - }, + Response: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, }, }, ), @@ -561,18 +560,29 @@ type Manager interface { ### 评估指标 -- Metric -Metric 表示一个评估指标,用于衡量 EvalSet 的某一方面表现。 +Metric 表示一个评估指标,用于衡量 EvalSet 的某一方面表现,每个评估指标包含指标名、评估准则和评分阈值。 -每个指标包含指标名和评分阈值: +评估过程中,评估器会根据配置的评估准则对实际会话与预期会话进行比较,计算出该指标的评估得分,并与阈值进行对比: - 当评估得分低于阈值时,指标判定为未通过。 - 当评估得分达到或超过阈值时,指标判定为通过。 ```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + 
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + // EvalMetric 表示用于评估 EvalCase 的单项指标 type EvalMetric struct { - MetricName string // 指标名称 - Threshold float64 // 评分阈值 + MetricName string // 指标名称 + Threshold float64 // 评分阈值 + Criterion *criterion.Criterion // 评估准则 +} + +// Criterion 聚合各类评估准则 +type Criterion struct { + ToolTrajectory *tooltrajectory.ToolTrajectoryCriterion // 工具轨迹评估准则 } ``` @@ -1157,3 +1167,176 @@ func (l *customLocator) List(baseDir, appName string) ([]string, error) { return results, nil } ``` + +### 评估准则 + +评估准则描述具体的评估方式,可按需组合使用。 + +框架内置了以下评估准则类型: + +| 准则类型 | 适用对象 | +|-------------------------|--------------------------------------| +| TextCriterion | 文本字符串 | +| JSONCriterion | JSON 对象,通常用于比较 map[string]any | +| ToolTrajectoryCriterion | 工具调用轨迹 | +| Criterion | 多种准则的聚合 | + +#### TextCriterion + +TextCriterion 用于字符串匹配,可配置是否忽略大小写和具体的匹配策略。 + +```go +// TextCriterion 定义字符串的匹配方式。 +type TextCriterion struct { + Ignore bool // 是否跳过匹配 + CaseInsensitive bool // 是否大小写不敏感 + MatchStrategy TextMatchStrategy // 匹配策略 + Compare func(actual, expected string) (bool, error) // 自定义比较 +} +``` + +TextMatchStrategy 取值说明: + +| TextMatchStrategy 取值 | 说明 | +|-----------------------|------------------------------| +| exact | 实际字符串与预期字符串完全一致(默认)。 | +| contains | 实际字符串包含预期字符串。 | +| regex | 实际字符串满足预期字符串作为正则表达式。 | + +#### JSONCriterion + +JSONCriterion 用于对比结构化 JSON 数据,可配置是否忽略比较以及具体的匹配策略。 + +```go +// JSONCriterion 定义 JSON 对象的匹配方式。 +type JSONCriterion struct { + Ignore bool // 是否跳过匹配 + MatchStrategy JSONMatchStrategy // 匹配策略 + Compare func(actual, expected map[string]any) (bool, error) // 自定义比较 +} +``` + +JSONMatchStrategy 取值说明: + +| JSONMatchStrategy 取值 | 说明 | +|-----------------------|------------------------------| +| exact | 实际 JSON 与预期 JSON 完全一致(默认)。 | + +#### ToolTrajectoryCriterion + +ToolTrajectoryCriterion 用于配置工具调用与响应的评估准则,可设置默认策略、按工具名定制策略以及是否忽略调用顺序。 + +```go +// ToolTrajectoryCriterion 定义工具调用与响应的评估准则。 +type ToolTrajectoryCriterion 
struct { + DefaultStrategy *ToolTrajectoryStrategy // 默认策略 + ToolStrategy map[string]*ToolTrajectoryStrategy // 按工具名定制策略 + OrderInsensitive bool // 是否忽略调用顺序 + Compare func(actual, expected *evalset.Invocation) (bool, error) // 自定义比较 +} + +// ToolTrajectoryStrategy 定义单个工具的匹配策略。 +type ToolTrajectoryStrategy struct { + Name *TextCriterion // 工具名匹配 + Arguments *JSONCriterion // 调用参数匹配 + Response *JSONCriterion // 工具响应匹配 +} +``` + +DefaultStrategy 用于配置全局默认评估准则,适用于所有工具。 + +ToolStrategy 按工具名覆盖特定工具的评估准则,未设置 ToolStrategy 时所有工具调用都使用 DefaultStrategy。 + +若未设置任何评估准则,框架会使用默认评估准则:工具名按 TextCriterion 的 exact 策略比较,参数和响应按 JSONCriterion 的 exact 策略比较,保证工具轨迹评估始终有合理的兜底行为。 + +下面的示例展示了一个典型场景,大部分工具希望严格对齐工具调用和结果,但 current_time 这类时间相关工具的响应值本身不稳定,因此只需要检查是否按预期调用了正确的工具和参数,而不要求时间值本身完全一致。 + +```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +criterion := criterion.New( + criterion.WithToolTrajectory( + tooltrajectory.New( + tooltrajectory.WithDefault( + &tooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + Response: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + }, + ), + tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{ + "current_time": { + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + Response: &json.JSONCriterion{ + Ignore: true, // 忽略该工具响应的匹配. 
+ }, + }, + }), + ), + ), +) +``` + +默认情况下,工具调用是按出现顺序逐条比对的,实际调用工具序列与预期工具调用序列在长度、顺序以及每一步的工具名、参数和响应上都需要匹配,若调用顺序不同则会被判定为评估不通过。 + +OrderInsensitive 用于控制是否对工具调用顺序不敏感。开启后,评估逻辑会先为每一次工具调用生成一个排序键(由工具名以及参数和响应的规范化表示共同构成),再分别对实际调用序列和预期调用序列按照这一键进行排序,得到两个具有稳定顺序的调用列表;随后按排序后的顺序逐一比对对应位置的调用,并根据配置的评估准则判断这些调用是否匹配。简单来说,只要两侧包含的工具调用在调用内容上完全一致,即使原始调用顺序不同,也不会因为顺序差异而导致评估不通过,示例如下: + +```go +criterion := criterion.New( + criterion.WithToolTrajectory( + ctooltrajectory.New( + ctooltrajectory.WithOrderInsensitive(true), + ), + ), +) +``` + +### 评估器 + +#### 工具轨迹评估器 + +工具轨迹评估器对应的指标名称为 `tool_trajectory_avg_score`,用于评估 Agent 在多次会话中对工具的使用是否符合预期。 + +在单次会话中,评估器会使用 `ToolTrajectoryCriterion` 对实际工具调用轨迹与预期轨迹进行比较: + +- 若整条工具调用轨迹满足评估准则,则该会话在此指标上的得分为 1。 +- 若任意一步调用不满足评估准则,则该会话在此指标上的得分为 0。 + +在多次会话的场景下,评估器会对所有会话在该指标上的得分取平均值,作为最终的 `tool_trajectory_avg_score`,并与 `EvalMetric.Threshold` 比较,得到通过/未通过的判定结果。 + +工具轨迹评估器与 Metric、Criterion 的典型组合方式如下: + +```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + ctooltrajectory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +evalMetric := &metric.EvalMetric{ + MetricName: "tool_trajectory_avg_score", + Threshold: 1.0, + Criterion: criterion.New( + criterion.WithToolTrajectory( + // 使用默认评估准则,工具的名称、参数和响应需严格一致 + ctooltrajectory.New(), + ), + ), +} +``` From aea09fe3161c39a270c8a8223b942591069dd7ab Mon Sep 17 00:00:00 2001 From: hackerli Date: Tue, 25 Nov 2025 11:31:38 +0800 Subject: [PATCH 11/14] docs --- docs/mkdocs/en/evaluation.md | 501 +++++++++++++++++++++++++++++++---- 1 file changed, 455 insertions(+), 46 deletions(-) diff --git a/docs/mkdocs/en/evaluation.md b/docs/mkdocs/en/evaluation.md index f7f1eb5e4..4ff86f38d 100644 --- a/docs/mkdocs/en/evaluation.md +++ b/docs/mkdocs/en/evaluation.md @@ -58,53 +58,64 @@ if err != nil { ```json { - "eval_set_id": "math-basic", + "evalSetId": "math-basic", "name": "math-basic", - "eval_cases": [ + "evalCases": [ { 
- "eval_id": "calc_add", - "conversation": [ - { - "invocation_id": "calc_add-1", - "user_content": { - "parts": [ - { - "text": "calc add 2 3" - } - ], - "role": "user" - }, - "final_response": { - "parts": [ - { - "text": "calc result: 5" - } - ], - "role": "assistant" - }, - "intermediate_data": { - "tool_uses": [ - { - "args": { - "a": 2, - "b": 3, - "operation": "add" - }, - "name": "calculator" - } - ] - }, - "creation_timestamp": 1761134484.981062 + "evalId": "calc_add", + "conversation": [ + { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + } + ], + "sessionInput": { + "appName": "math-eval-app", + "userId": "user" } - ], - "session_input": { - "app_name": "math-eval-app", - "user_id": "user" - }, - "creation_timestamp": 1761134484.981062 - }, + } ], - "creation_timestamp": 1761134484.9804401 + "creationTimestamp": 1761134484.9804401 } ``` @@ -113,8 +124,23 @@ if err != nil { ```json [ { - "metric_name": "tool_trajectory_avg_score", - "threshold": 1 + "metricName": "tool_trajectory_avg_score", + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } } ] ``` @@ -122,7 +148,159 @@ if err != nil { #### Evaluation Result File Example ```json 
-"{\"eval_set_result_id\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_result_name\":\"math-eval-app_math-basic_76798060-dcc3-41e9-b20e-06f23aa3cdbc\",\"eval_set_id\":\"math-basic\",\"eval_case_results\":[{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_add\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"8b205b3f-682e-409a-b751-89ef805d0221\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"The result of adding 2 and 3 is **5**.\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_j75SIh8A9xSlG61OrC1ARIab\",\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_add-1\",\"user_content\":{\"parts\":[{\"text\":\"calc add 2 3\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 5\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":2,\"b\":3,\"operation\":\"add\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.981062},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"74252944-b1a7-4c17-8f39-4a5809395d1d\",\"user_id\":\"user\"},{\"eval_set_id\":\"math-basic\",\"eval_id\":\"calc_multiply\",\"final_eval_status\":1,\"overall_eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}],\"eval_metric_result_per_invocation\":[{\"actual_invocation\":{\"invocation_id\":\"65226930-d45c-43ae-ab88-9c35f3abce70\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"6 × 7 = 
42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"id\":\"call_00_b3Gj4Y3fJu9Blkbl6H0MLquO\",\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]}},\"expected_invocation\":{\"invocation_id\":\"calc_multiply-1\",\"user_content\":{\"parts\":[{\"text\":\"calc multiply 6 7\"}],\"role\":\"user\"},\"final_response\":{\"parts\":[{\"text\":\"calc result: 42\"}],\"role\":\"assistant\"},\"intermediate_data\":{\"tool_uses\":[{\"args\":{\"a\":6,\"b\":7,\"operation\":\"multiply\"},\"name\":\"calculator\"}]},\"creation_timestamp\":1761134484.9812014},\"eval_metric_results\":[{\"metric_name\":\"tool_trajectory_avg_score\",\"score\":1,\"eval_status\":1,\"threshold\":1}]}],\"session_id\":\"6393fabd-ab50-49b7-8656-59fcb0a29758\",\"user_id\":\"user\"}],\"creation_timestamp\":1761134849.3572516}" +{ + "evalSetResultId": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", + "evalSetResultName": "math-eval-app_math-basic_d545562e-f2fa-4dcf-816e-3474e85b3494", + "evalSetId": "math-basic", + "evalCaseResults": [ + { + "evalSetId": "math-basic", + "evalId": "calc_add", + "finalEvalStatus": 1, + "overallEvalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } + } + ], + "evalMetricResultPerInvocation": [ + { + "actualInvocation": { + "invocationId": "53845847-16e0-4960-9d00-d3abf0ab1807", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "The result of 2 + 3 is **5**." 
+ } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "call_00_J7WhW8PJurYtWji3J5H7ITN9", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "expectedInvocation": { + "invocationId": "calc_add-1", + "userContent": { + "parts": [ + { + "text": "calc add 2 3" + } + ], + "role": "user" + }, + "finalResponse": { + "parts": [ + { + "text": "calc result: 5" + } + ], + "role": "assistant" + }, + "intermediateData": { + "toolUses": [ + { + "id": "tool_use_1", + "args": { + "a": 2, + "b": 3, + "operation": "add" + }, + "name": "calculator" + } + ], + "toolResponses": [ + { + "id": "tool_use_1", + "name": "calculator", + "response": { + "a": 2, + "b": 3, + "operation": "add", + "result": 5 + } + } + ] + } + }, + "evalMetricResults": [ + { + "metricName": "tool_trajectory_avg_score", + "score": 1, + "evalStatus": 1, + "threshold": 1, + "criterion": { + "toolTrajectory": { + "defaultStrategy": { + "name": { + "matchStrategy": "exact" + }, + "arguments": { + "matchStrategy": "exact" + }, + "response": { + "matchStrategy": "exact" + } + } + } + } + } + ] + } + ], + "sessionId": "e9cc851f-8c89-45f4-b430-7c54991c7dda", + "userId": "user" + } + ], + "creationTimestamp": 1763997862.5581782 +} ``` ### inmemory @@ -217,6 +395,7 @@ cases := []*evalset.EvalCase{ IntermediateData: &evalset.IntermediateData{ ToolUses: []*genai.FunctionCall{ { + ID: "tool_use_1", Name: "calculator", Args: map[string]interface{}{ "operation": "add", @@ -225,6 +404,18 @@ cases := []*evalset.EvalCase{ }, }, }, + ToolResponses: []*genai.FunctionResponse{ + { + ID: "tool_use_1", + Name: "calculator", + Response: map[string]interface{}{ + "a": 2.0, + "b": 3.0, + "operation": "add", + "result": 5.0, + }, + }, + }, }, }, }, @@ -249,6 +440,25 @@ import 
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric" evalMetric := &metric.EvalMetric{ MetricName: "tool_trajectory_avg_score", Threshold: 1.0, + Criterion: criterion.New( + criterion.WithToolTrajectory( + ctooltrajectory.New( + ctooltrajectory.WithDefault( + &ctooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, + }, + Response: &cjson.JSONCriterion{ + MatchStrategy: cjson.JSONMatchStrategyExact, + }, + }, + ), + ), + ), + ), } metricManager.Add(ctx, appName, evalSetID, evalMetric) ``` @@ -358,6 +568,32 @@ The framework provides two implementations of the EvalSet Manager: ### Metric +Metric represents an evaluation indicator used to measure a certain aspect of EvalSet’s performance. Each evaluation indicator includes the metric name, evaluation criterion, and score threshold. + +During the evaluation process, the evaluator compares the actual conversation with the expected conversation according to the configured evaluation criterion, calculates the evaluation score for this metric, and compares it with the threshold: + +* When the evaluation score is lower than the threshold, the metric is determined as not passed. +* When the evaluation score reaches or exceeds the threshold, the metric is determined as passed. + +```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +// EvalMetric represents a single metric used to evaluate an EvalCase. +type EvalMetric struct { + MetricName string // Metric name. + Threshold float64 // Score threshold. + Criterion *criterion.Criterion // Evaluation criterion. +} + +// Criterion aggregates various evaluation criteria. +type Criterion struct { + ToolTrajectory *tooltrajectory.ToolTrajectoryCriterion // Tool trajectory evaluation criterion. 
+} +``` + A Metric represents an evaluation metric used to measure a specific aspect of an EvalSet's performance. Each metric consists of a metric name and a scoring threshold: @@ -961,3 +1197,176 @@ func (l *customLocator) List(baseDir, appName string) ([]string, error) { return results, nil } ``` + +### Evaluation Criterion + +The evaluation criterion describes the specific evaluation method and can be combined as needed. + +The framework has the following built-in types of evaluation criteria: + +| Criterion Type | Applicable Object | +| ----------------------- | ----------------------------------------------------- | +| TextCriterion | Text string | +| JSONCriterion | JSON object, usually used to compare `map[string]any` | +| ToolTrajectoryCriterion | Tool invocation trajectory | +| Criterion | Aggregation of multiple criteria | + +#### TextCriterion + +TextCriterion is used for string matching and can be configured to ignore case and to use a specific matching strategy. + +```go +// TextCriterion defines the matching method for strings. +type TextCriterion struct { + Ignore bool // Whether to skip matching. + CaseInsensitive bool // Whether case-insensitive. + MatchStrategy TextMatchStrategy // Matching strategy. + Compare func(actual, expected string) (bool, error) // Custom comparison. +} +``` + +Explanation of TextMatchStrategy values: + +| TextMatchStrategy Value | Description | +| ----------------------- | ----------------------------------------------------------------------- | +| exact | The actual string is exactly the same as the expected string (default). | +| contains | The actual string contains the expected string. | +| regex | The actual string matches the expected string as a regular expression. | + +#### JSONCriterion + +JSONCriterion is used to compare structured JSON data. You can configure whether to ignore the comparison and choose a specific matching strategy. + +```go +// JSONCriterion defines the matching method for JSON objects. 
+type JSONCriterion struct { + Ignore bool // Whether to skip matching. + MatchStrategy JSONMatchStrategy // Matching strategy. + Compare func(actual, expected map[string]any) (bool, error) // Custom comparison. +} +``` + +Explanation of JSONMatchStrategy values: + +| JSONMatchStrategy Value | Description | +| ----------------------- | ------------------------------------------------------------------- | +| exact | The actual JSON is exactly the same as the expected JSON (default). | + +#### ToolTrajectoryCriterion + +ToolTrajectoryCriterion is used to configure the evaluation criteria for tool invocations and responses. You can set default strategies, customize strategies by tool name, and control whether to ignore the invocation order. + +```go +// ToolTrajectoryCriterion defines the evaluation criteria for tool invocations and responses. +type ToolTrajectoryCriterion struct { + DefaultStrategy *ToolTrajectoryStrategy // Default strategy. + ToolStrategy map[string]*ToolTrajectoryStrategy // Customized strategies by tool name. + OrderInsensitive bool // Whether to ignore invocation order. + Compare func(actual, expected *evalset.Invocation) (bool, error) // Custom comparison. +} + +// ToolTrajectoryStrategy defines the matching strategy for a single tool. +type ToolTrajectoryStrategy struct { + Name *TextCriterion // Tool name matching. + Arguments *JSONCriterion // Invocation arguments matching. + Response *JSONCriterion // Tool response matching. +} +``` + +DefaultStrategy is used to configure the global default evaluation criterion and applies to all tools. + +ToolStrategy overrides the evaluation criterion for specific tools by tool name. When ToolStrategy is not set, all tool invocations use DefaultStrategy. 
+ +If no evaluation criterion is configured, the framework uses the default evaluation criterion: tool names are compared using TextCriterion with the `exact` strategy, and arguments and responses are compared using JSONCriterion with the `exact` strategy. This ensures that tool trajectory evaluation always has a reasonable fallback behavior. + +The following example illustrates a typical scenario: for most tools you want strict alignment of tool invocations and results, but for time-related tools such as `current_time`, the response value itself is unstable. Therefore, you only need to check whether the correct tool and arguments were invoked as expected, without requiring the time value itself to be exactly the same. + +```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +criterion := criterion.New( + criterion.WithToolTrajectory( + tooltrajectory.New( + tooltrajectory.WithDefault( + &tooltrajectory.ToolTrajectoryStrategy{ + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + Response: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + }, + ), + tooltrajectory.WithTool(map[string]*tooltrajectory.ToolTrajectoryStrategy{ + "current_time": { + Name: &text.TextCriterion{ + MatchStrategy: text.TextMatchStrategyExact, + }, + Arguments: &json.JSONCriterion{ + MatchStrategy: json.JSONMatchStrategyExact, + }, + Response: &json.JSONCriterion{ + Ignore: true, // Ignore matching of this tool's response. + }, + }, + }), + ), + ), +) +``` + +By default, tool invocations are compared one by one in the order in which they appear. 
The actual tool invocation sequence and the expected tool invocation sequence must match in length, order, and in the tool name, arguments, and response at each step. If the invocation order differs, the evaluation is considered failed. + +OrderInsensitive controls whether the tool invocation order is ignored. When enabled, the evaluation logic first generates a sorting key for each tool invocation (composed of the tool name and the normalized representation of arguments and response). It then sorts the actual invocation sequence and the expected invocation sequence by this key, producing two invocation lists with stable order. Next, it compares the corresponding invocations in the sorted lists one by one, and determines whether these invocations match according to the configured evaluation criteria. Put simply, as long as the tool invocations on both sides are completely identical in content, the evaluation will not fail due to differences in the original invocation order. For example: + +```go +criterion := criterion.New( + criterion.WithToolTrajectory( + ctooltrajectory.New( + ctooltrajectory.WithOrderInsensitive(true), + ), + ), +) +``` + +### Evaluator + +#### Tool Trajectory Evaluator + +The metric name corresponding to the tool trajectory evaluator is `tool_trajectory_avg_score`. It is used to evaluate whether the Agent’s use of tools across multiple conversations conforms to expectations. + +In a single conversation, the evaluator compares the actual tool invocation trajectory with the expected trajectory using `ToolTrajectoryCriterion`: + +* If the entire tool invocation trajectory satisfies the evaluation criterion, the score of this conversation on this metric is 1. +* If any step of the invocation does not satisfy the evaluation criterion, the score of this conversation on this metric is 0. 
+ +In the scenario of multiple conversations, the evaluator takes the average of the scores of all conversations on this metric as the final `tool_trajectory_avg_score`, and compares it with `EvalMetric.Threshold` to determine whether the result is pass or fail. + +A typical way to combine the tool trajectory evaluator with Metric and Criterion is as follows: + +```go +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric" + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + ctooltrajectory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory" +) + +evalMetric := &metric.EvalMetric{ + MetricName: "tool_trajectory_avg_score", + Threshold: 1.0, + Criterion: criterion.New( + criterion.WithToolTrajectory( + // Use the default evaluation criterion; tool name, arguments, and response must be strictly identical. + ctooltrajectory.New(), + ), + ), +} +``` From 69444a3c5ff756ba1bc156d26c0b2f8135e3a191 Mon Sep 17 00:00:00 2001 From: hackerli Date: Tue, 25 Nov 2025 11:50:20 +0800 Subject: [PATCH 12/14] test --- evaluation/metric/criterion/json/json_test.go | 25 +++++++++++++++++++ .../tooltrajectory/tooltrajectory_test.go | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/evaluation/metric/criterion/json/json_test.go b/evaluation/metric/criterion/json/json_test.go index eb6616169..b3eea033f 100644 --- a/evaluation/metric/criterion/json/json_test.go +++ b/evaluation/metric/criterion/json/json_test.go @@ -42,3 +42,28 @@ func TestMapCriterionDeepEqualSuccess(t *testing.T) { assert.True(t, ok) assert.NoError(t, err) } + +func TestJSONCriterionIgnoreSkipsCompare(t *testing.T) { + called := false + criterion := &JSONCriterion{ + Ignore: true, + Compare: func(actual, expected map[string]any) (bool, error) { + called = true + return false, nil + }, + MatchStrategy: JSONMatchStrategyExact, + } + ok, err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "diff"}) + assert.True(t, ok) + assert.NoError(t, err) 
+ assert.False(t, called) +} + +func TestJSONCriterionInvalidMatchStrategy(t *testing.T) { + criterion := &JSONCriterion{ + MatchStrategy: JSONMatchStrategy("invalid"), + } + ok, err := criterion.Match(map[string]any{"k": "v"}, map[string]any{"k": "v"}) + assert.False(t, ok) + assert.Error(t, err) +} diff --git a/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go b/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go index 02c4d8ecf..344dd241f 100644 --- a/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go +++ b/evaluation/metric/criterion/tooltrajectory/tooltrajectory_test.go @@ -508,7 +508,7 @@ func TestToolTrajectoryStrategyArgumentAndResponseMismatch(t *testing.T) { "tool": strategy, })) ok, err := criterion.Match(actual, expected) - assert.True(t, ok) + assert.False(t, ok) assert.Error(t, err) } From 729a16836b9064352dbf66c2b568c24cc03b88a6 Mon Sep 17 00:00:00 2001 From: hackerli Date: Tue, 25 Nov 2025 11:54:14 +0800 Subject: [PATCH 13/14] docs --- docs/mkdocs/en/evaluation.md | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/docs/mkdocs/en/evaluation.md b/docs/mkdocs/en/evaluation.md index 4ff86f38d..daf1afaee 100644 --- a/docs/mkdocs/en/evaluation.md +++ b/docs/mkdocs/en/evaluation.md @@ -572,8 +572,8 @@ Metric represents an evaluation indicator used to measure a certain aspect of Ev During the evaluation process, the evaluator compares the actual conversation with the expected conversation according to the configured evaluation criterion, calculates the evaluation score for this metric, and compares it with the threshold: -* When the evaluation score is lower than the threshold, the metric is determined as not passed. -* When the evaluation score reaches or exceeds the threshold, the metric is determined as passed. +- When the evaluation score is lower than the threshold, the metric is considered failed. 
+- When the evaluation score reaches or exceeds the threshold, the metric is considered passed. ```go import ( @@ -594,21 +594,6 @@ type Criterion struct { } ``` -A Metric represents an evaluation metric used to measure a specific aspect of an EvalSet's performance. - -Each metric consists of a metric name and a scoring threshold: - -- When the evaluation score falls below the threshold, the metric is considered failed. -- When the evaluation score reaches or exceeds the threshold, the metric is considered passed. - -```go -// EvalMetric represents a single metric used to evaluate an EvalCase. -type EvalMetric struct { - MetricName string // metric name. - Threshold float64 // scoring threshold. -} -``` - The Metric Manager is responsible for managing evaluation metrics. Each EvalSet can have multiple evaluation metrics, identified by `MetricName`. From bf5285f138aedb30c888733d2051918312785018 Mon Sep 17 00:00:00 2001 From: hackerli Date: Tue, 25 Nov 2025 12:00:40 +0800 Subject: [PATCH 14/14] docs --- docs/mkdocs/en/evaluation.md | 16 ++++++++++------ docs/mkdocs/zh/evaluation.md | 16 ++++++++++------ .../tooltrajectory/tooltrajectory_test.go | 9 +++++++++ 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/docs/mkdocs/en/evaluation.md b/docs/mkdocs/en/evaluation.md index daf1afaee..d7f39a18c 100644 --- a/docs/mkdocs/en/evaluation.md +++ b/docs/mkdocs/en/evaluation.md @@ -747,15 +747,19 @@ type EvalCaseResult struct { EvalMetricResult represents the evaluation result of a specific metric, including the score, status, threshold, and additional information. ```go -import "trpc.group/trpc-go/trpc-agent-go/evaluation/status" +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/status" +) // EvalMetricResult represents the evaluation result of a single metric. type EvalMetricResult struct { - MetricName string // Metric name. - Score float64 // Actual score. 
- EvalStatus status.EvalStatus // Evaluation status. - Threshold float64 // Score threshold. - Details map[string]any // Additional information, such as scoring process, error description, etc. + MetricName string // Metric name. + Score float64 // Actual score. + EvalStatus status.EvalStatus // Evaluation status. + Threshold float64 // Score threshold. + Criterion *criterion.Criterion // Evaluation criterion. + Details map[string]any // Additional information, such as scoring process, error description, etc. } ``` diff --git a/docs/mkdocs/zh/evaluation.md b/docs/mkdocs/zh/evaluation.md index 3d7ff982e..79fc98f57 100644 --- a/docs/mkdocs/zh/evaluation.md +++ b/docs/mkdocs/zh/evaluation.md @@ -735,15 +735,19 @@ type EvalCaseResult struct { EvalMetricResult 表示某一指标的评估结果,包括得分、状态、阈值及附加信息。 ```go -import "trpc.group/trpc-go/trpc-agent-go/evaluation/status" +import ( + "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion" + "trpc.group/trpc-go/trpc-agent-go/evaluation/status" +) // EvalMetricResult 表示单项指标的评估结果 type EvalMetricResult struct { - MetricName string // 指标名称 - Score float64 // 实际得分 - EvalStatus status.EvalStatus // 评测状态 - Threshold float64 // 阈值 - Details map[string]any // 额外信息,如评分过程、错误描述等 + MetricName string // 指标名称 + Score float64 // 实际得分 + EvalStatus status.EvalStatus // 评测状态 + Threshold float64 // 阈值 + Criterion *criterion.Criterion // 评估准则 + Details map[string]any // 额外信息,如评分过程、错误描述等 } ``` diff --git a/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go b/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go index aa91997ae..086194ffc 100644 --- a/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go +++ b/evaluation/evaluator/tooltrajectory/tooltrajectory_test.go @@ -1,3 +1,12 @@ +// +// Tencent is pleased to support the open source community by making trpc-agent-go available. +// +// Copyright (C) 2025 Tencent. All rights reserved. +// +// trpc-agent-go is licensed under the Apache License Version 2.0. 
+// +// + package tooltrajectory import (