Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
514 changes: 456 additions & 58 deletions docs/mkdocs/en/evaluation.md

Large diffs are not rendered by default.

510 changes: 454 additions & 56 deletions docs/mkdocs/zh/evaluation.md

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions evaluation/evalresult/evalresult.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (

"trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime"
"trpc.group/trpc-go/trpc-agent-go/evaluation/evalset"
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion"
"trpc.group/trpc-go/trpc-agent-go/evaluation/status"
)

Expand Down Expand Up @@ -63,6 +64,8 @@ type EvalMetricResult struct {
EvalStatus status.EvalStatus `json:"evalStatus,omitempty"`
// Threshold that was used.
Threshold float64 `json:"threshold,omitempty"`
// Criterion contains the criterion used for this metric evaluation.
Criterion *criterion.Criterion `json:"criterion,omitempty"`
// Details contains additional metric-specific information.
Details map[string]any `json:"details,omitempty"`
}
Expand Down
25 changes: 12 additions & 13 deletions evaluation/evalresult/local/local.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,18 +123,21 @@ func (m *manager) evalSetResultPath(appName, evalSetResultID string) string {
// load loads the EvalSetResult from the file system.
func (m *manager) load(appName, evalSetResultID string) (*evalresult.EvalSetResult, error) {
path := m.evalSetResultPath(appName, evalSetResultID)
f, err := os.Open(path)
data, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("open file %s: %w", path, err)
}
defer f.Close()
var payload string
if err := json.NewDecoder(f).Decode(&payload); err != nil {
var res evalresult.EvalSetResult
if err := json.Unmarshal(data, &res); err == nil {
return &res, nil
}
// Keep backward compatibility with legacy string-wrapped results.
var legacy string
if err := json.Unmarshal(data, &legacy); err != nil {
return nil, fmt.Errorf("decode file %s: %w", path, err)
}
var res evalresult.EvalSetResult
if err := json.Unmarshal([]byte(payload), &res); err != nil {
return nil, fmt.Errorf("unmarshal eval set result %s: %w", path, err)
if err := json.Unmarshal([]byte(legacy), &res); err != nil {
return nil, fmt.Errorf("decode legacy content in file %s: %w", path, err)
}
return &res, nil
}
Expand All @@ -154,13 +157,9 @@ func (m *manager) store(appName string, evalSetResult *evalresult.EvalSetResult)
if err != nil {
return fmt.Errorf("open file %s: %w", tmp, err)
}
data, err := json.Marshal(evalSetResult)
if err != nil {
file.Close()
return fmt.Errorf("json marshal: %w", err)
}
encoder := json.NewEncoder(file)
if err := encoder.Encode(string(data)); err != nil {
encoder.SetIndent("", " ")
if err := encoder.Encode(evalSetResult); err != nil {
file.Close()
os.Remove(tmp)
return fmt.Errorf("encode file %s: %w", tmp, err)
Expand Down
4 changes: 4 additions & 0 deletions evaluation/evaluation.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/registry"
istatus "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/status"
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric"
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion"
"trpc.group/trpc-go/trpc-agent-go/evaluation/service"
"trpc.group/trpc-go/trpc-agent-go/evaluation/service/local"
"trpc.group/trpc-go/trpc-agent-go/evaluation/status"
Expand Down Expand Up @@ -195,6 +196,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu
count int
score float64
threshold float64
criterion *criterion.Criterion
}
// Group metrics results by metric name.
aggregatedMetrics := make(map[string]*aggregatedMetric)
Expand All @@ -208,6 +210,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu
}
aggregatedMetrics[metric.MetricName].count++
aggregatedMetrics[metric.MetricName].score += metric.Score
aggregatedMetrics[metric.MetricName].criterion = metric.Criterion
}
}
// Aggregate metrics results by metric name.
Expand All @@ -223,6 +226,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu
Score: average,
EvalStatus: evalStatus,
Threshold: aggregatedMetric.threshold,
Criterion: aggregatedMetric.criterion,
})
}
status, err := istatus.SummarizeMetricsStatus(metricsResults)
Expand Down
37 changes: 14 additions & 23 deletions evaluation/evaluator/tooltrajectory/tooltrajectory.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@ package tooltrajectory
import (
"context"
"fmt"
"reflect"

"google.golang.org/genai"
"trpc.group/trpc-go/trpc-agent-go/evaluation/evalset"
"trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator"
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric"
ctooltrajectory "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory"
"trpc.group/trpc-go/trpc-agent-go/evaluation/status"
"trpc.group/trpc-go/trpc-agent-go/log"
)

// toolTrajectoryEvaluator is a tool trajectory evaluator implementation for evaluator.
Expand Down Expand Up @@ -53,10 +53,11 @@ func (e *toolTrajectoryEvaluator) Evaluate(ctx context.Context, actuals, expecte
for i := range len(actuals) {
actual := actuals[i]
expected := expecteds[i]
actualCalls := getToolCalls(actual)
expectedCalls := getToolCalls(expected)
score := 0.0
if toolCallsEqual(actualCalls, expectedCalls) {
ok, err := toolCallsMatch(actual, expected, evalMetric.Criterion.ToolTrajectory)
if err != nil {
log.Errorf("tool trajectory mismatch: %v", err)
} else if ok {
score = 1.0
}
status := e.statusForScore(score, evalMetric)
Expand Down Expand Up @@ -88,24 +89,14 @@ func (e *toolTrajectoryEvaluator) statusForScore(score float64, evalMetric *metr
return status.EvalStatusFailed
}

func getToolCalls(invocation *evalset.Invocation) []*genai.FunctionCall {
if invocation == nil || invocation.IntermediateData == nil {
return nil
func toolCallsMatch(actual, expected *evalset.Invocation,
criterion *ctooltrajectory.ToolTrajectoryCriterion) (bool, error) {
if criterion == nil {
return false, fmt.Errorf("criterion is nil")
}
return invocation.IntermediateData.ToolUses
}

func toolCallsEqual(actual, expected []*genai.FunctionCall) bool {
if len(actual) != len(expected) {
return false
}
for i := range actual {
if actual[i].Name != expected[i].Name {
return false
}
if !reflect.DeepEqual(actual[i].Args, expected[i].Args) {
return false
}
ok, err := criterion.Match(actual, expected)
if err != nil {
return false, fmt.Errorf("tool trajectory mismatch: %w", err)
}
return true
return ok, nil
}
104 changes: 27 additions & 77 deletions evaluation/evaluator/tooltrajectory/tooltrajectory_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,89 +10,39 @@
package tooltrajectory

import (
"context"
"encoding/json"
"testing"

"github.com/stretchr/testify/assert"
"google.golang.org/genai"
"trpc.group/trpc-go/trpc-agent-go/evaluation/evalset"
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric"
"trpc.group/trpc-go/trpc-agent-go/evaluation/status"
criterionjson "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/json"
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/text"
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory"
)

func TestToolTrajectoryEvaluateSuccess(t *testing.T) {
inst := New()
assert.NotEmpty(t, inst.Description())
assert.Equal(t, "tool_trajectory_avg_score", inst.Name())

e := inst.(*toolTrajectoryEvaluator)
actual := makeInvocation([]*genai.FunctionCall{
{Name: "lookup", Args: map[string]any{"id": 1}},
})
expected := makeInvocation([]*genai.FunctionCall{
{Name: "lookup", Args: map[string]any{"id": 1}},
})

result, err := e.Evaluate(context.Background(), []*evalset.Invocation{actual}, []*evalset.Invocation{expected}, &metric.EvalMetric{Threshold: 0.5})
assert.NoError(t, err)
assert.Equal(t, 1.0, result.OverallScore)
assert.Equal(t, status.EvalStatusPassed, result.OverallStatus)
assert.Len(t, result.PerInvocationResults, 1)
assert.Equal(t, actual, result.PerInvocationResults[0].ActualInvocation)
assert.Equal(t, expected, result.PerInvocationResults[0].ExpectedInvocation)
assert.Equal(t, status.EvalStatusPassed, result.PerInvocationResults[0].Status)
}

func TestToolTrajectoryEvaluateMismatch(t *testing.T) {
e := New().(*toolTrajectoryEvaluator)
_, err := e.Evaluate(context.Background(), []*evalset.Invocation{}, []*evalset.Invocation{makeInvocation(nil)}, &metric.EvalMetric{Threshold: 1})
assert.Error(t, err)
assert.Contains(t, err.Error(), "count mismatch")
}

func TestToolTrajectoryEvaluateFailureStatus(t *testing.T) {
e := New().(*toolTrajectoryEvaluator)
actual := makeInvocation([]*genai.FunctionCall{
{Name: "lookup", Args: map[string]any{"id": 1}},
})
expected := makeInvocation([]*genai.FunctionCall{
{Name: "lookup", Args: map[string]any{"id": 2}},
})

result, err := e.Evaluate(context.Background(), []*evalset.Invocation{actual}, []*evalset.Invocation{expected}, &metric.EvalMetric{Threshold: 0.9})
func TestConfigJSONRoundTrip(t *testing.T) {
cfg := &tooltrajectory.ToolTrajectoryCriterion{
DefaultStrategy: &tooltrajectory.ToolTrajectoryStrategy{
Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyExact},
Arguments: &criterionjson.JSONCriterion{MatchStrategy: criterionjson.JSONMatchStrategyExact},
Response: &criterionjson.JSONCriterion{MatchStrategy: criterionjson.JSONMatchStrategyExact},
},
ToolStrategy: map[string]*tooltrajectory.ToolTrajectoryStrategy{
"custom": {
Name: &text.TextCriterion{MatchStrategy: text.TextMatchStrategyRegex},
},
},
OrderInsensitive: true,
}
data, err := json.Marshal(cfg)
assert.NoError(t, err)
assert.Zero(t, result.OverallScore)
assert.Equal(t, status.EvalStatusFailed, result.OverallStatus)
assert.Equal(t, status.EvalStatusFailed, result.PerInvocationResults[0].Status)
}
assert.Contains(t, string(data), `"orderInsensitive":true`)
assert.Contains(t, string(data), `"custom"`)

func TestToolTrajectoryEvaluateNotEvaluated(t *testing.T) {
e := New().(*toolTrajectoryEvaluator)
result, err := e.Evaluate(context.Background(), []*evalset.Invocation{}, []*evalset.Invocation{}, &metric.EvalMetric{Threshold: 1})
var decoded tooltrajectory.ToolTrajectoryCriterion
err = json.Unmarshal(data, &decoded)
assert.NoError(t, err)
assert.Equal(t, status.EvalStatusNotEvaluated, result.OverallStatus)
assert.Nil(t, result.PerInvocationResults)
}

func TestGetToolCallsAndEqual(t *testing.T) {
assert.Nil(t, getToolCalls(nil))
assert.Nil(t, getToolCalls(&evalset.Invocation{}))

callA := []*genai.FunctionCall{{Name: "a", Args: map[string]any{"x": 1}}}
callB := []*genai.FunctionCall{{Name: "a", Args: map[string]any{"x": 1}}}
assert.True(t, toolCallsEqual(callA, callB))

callNameDiff := []*genai.FunctionCall{{Name: "b", Args: map[string]any{"x": 1}}}
callArgsDiff := []*genai.FunctionCall{{Name: "a", Args: map[string]any{"x": 2}}}
assert.False(t, toolCallsEqual(callA, callNameDiff))
assert.False(t, toolCallsEqual(callA, callArgsDiff))
assert.False(t, toolCallsEqual(callA, []*genai.FunctionCall{}))
}

func makeInvocation(calls []*genai.FunctionCall) *evalset.Invocation {
return &evalset.Invocation{
IntermediateData: &evalset.IntermediateData{
ToolUses: calls,
},
}
assert.True(t, decoded.OrderInsensitive)
assert.NotNil(t, decoded.DefaultStrategy)
assert.NotNil(t, decoded.ToolStrategy["custom"])
assert.Equal(t, text.TextMatchStrategyRegex, decoded.ToolStrategy["custom"].Name.MatchStrategy)
}
27 changes: 27 additions & 0 deletions evaluation/metric/criterion/criterion.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
//
// Tencent is pleased to support the open source community by making trpc-agent-go available.
//
// Copyright (C) 2025 Tencent. All rights reserved.
//
// trpc-agent-go is licensed under the Apache License Version 2.0.
//
//

// Package criterion provides configurable evaluation criteria.
package criterion

import "trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory"

// Criterion groups the evaluation criteria that can be attached to a metric.
// At present it carries a single criterion, ToolTrajectory; the type exists so
// further criteria can be added as sibling fields.
type Criterion struct {
// ToolTrajectory configures checks for tool call and response sequences.
// It is serialized under the "toolTrajectory" JSON key and, because of
// omitempty, is left out of the encoded output when the pointer is nil.
ToolTrajectory *tooltrajectory.ToolTrajectoryCriterion `json:"toolTrajectory,omitempty"`
}

// New builds a Criterion from the supplied functional options.
//
// The options are resolved by newOptions, and the resulting ToolTrajectory
// criterion is copied onto the returned value. The returned pointer is never
// nil.
func New(opt ...Option) *Criterion {
	resolved := newOptions(opt...)
	c := new(Criterion)
	c.ToolTrajectory = resolved.ToolTrajectory
	return c
}
42 changes: 42 additions & 0 deletions evaluation/metric/criterion/criterion_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
//
// Tencent is pleased to support the open source community by making trpc-agent-go available.
//
// Copyright (C) 2025 Tencent. All rights reserved.
//
// trpc-agent-go is licensed under the Apache License Version 2.0.
//
//

package criterion

import (
"encoding/json"
"testing"

"github.com/stretchr/testify/assert"
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion/tooltrajectory"
)

// TestCriterionNewDefaults checks that New, called without options, still
// yields a Criterion whose ToolTrajectory criterion is populated.
func TestCriterionNewDefaults(t *testing.T) {
	got := New().ToolTrajectory
	assert.NotNil(t, got)
}

// TestCriterionWithToolTrajectory checks that a tool trajectory criterion
// passed through WithToolTrajectory ends up on the constructed Criterion.
func TestCriterionWithToolTrajectory(t *testing.T) {
	want := tooltrajectory.New()
	got := New(WithToolTrajectory(want))
	assert.Equal(t, want, got.ToolTrajectory)
}

// TestCriterionJSONRoundTrip marshals a Criterion to JSON, decodes it back,
// and checks that the ToolTrajectory criterion survives the round trip.
func TestCriterionJSONRoundTrip(t *testing.T) {
	original := &Criterion{ToolTrajectory: tooltrajectory.New()}

	encoded, marshalErr := json.Marshal(original)
	assert.NoError(t, marshalErr)

	var roundTripped Criterion
	unmarshalErr := json.Unmarshal(encoded, &roundTripped)
	assert.NoError(t, unmarshalErr)
	assert.NotNil(t, roundTripped.ToolTrajectory)
}
Loading
Loading