Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
514 changes: 456 additions & 58 deletions docs/mkdocs/en/evaluation.md

Large diffs are not rendered by default.

624 changes: 565 additions & 59 deletions docs/mkdocs/zh/evaluation.md

Large diffs are not rendered by default.

16 changes: 15 additions & 1 deletion evaluation/evalresult/evalresult.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (

"trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime"
"trpc.group/trpc-go/trpc-agent-go/evaluation/evalset"
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion"
"trpc.group/trpc-go/trpc-agent-go/evaluation/status"
)

Expand Down Expand Up @@ -63,8 +64,10 @@ type EvalMetricResult struct {
EvalStatus status.EvalStatus `json:"evalStatus,omitempty"`
// Threshold that was used.
Threshold float64 `json:"threshold,omitempty"`
// Criterion contains the criterion used for this metric evaluation.
Criterion *criterion.Criterion `json:"criterion,omitempty"`
// Details contains additional metric-specific information.
Details map[string]any `json:"details,omitempty"`
Details *EvalMetricResultDetails `json:"details,omitempty"`
}

// EvalMetricResultPerInvocation represents metric results for a single invocation.
Expand All @@ -78,6 +81,17 @@ type EvalMetricResultPerInvocation struct {
EvalMetricResults []*EvalMetricResult `json:"evalMetricResults,omitempty"`
}

// ScoreResult holds the score produced by a single metric evaluation.
// NOTE(review): the description mentions a rationale as well, but only the score is declared here.
// It mirrors the schema used by ADK Web, with JSON field names in camelCase to align with that format.
type ScoreResult struct {
// Score is the numeric score; the omitempty tag drops it from the JSON output when it is zero.
Score float64 `json:"score,omitempty"`
}

// EvalMetricResultDetails contains additional metric-specific information.
// It currently declares no fields, so it acts as a typed placeholder for metric details.
// It mirrors the schema used by ADK Web, with JSON field names in camelCase to align with that format.
type EvalMetricResultDetails struct {
}

// Manager defines the interface for managing evaluation results.
type Manager interface {
// Save stores an evaluation result.
Expand Down
12 changes: 2 additions & 10 deletions evaluation/evalresult/evalresult_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,7 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) {
"metricName": "tool_trajectory_avg_score",
"score": 0.9,
"evalStatus": 1,
"threshold": 0.8,
"details": {
"comment": "trajectory matched"
}
"threshold": 0.8
}
],
"evalMetricResultPerInvocation": [
Expand Down Expand Up @@ -131,10 +128,7 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) {
"metricName": "tool_trajectory_avg_score",
"score": 0.9,
"evalStatus": 1,
"threshold": 0.8,
"details": {
"comment": "per invocation matched"
}
"threshold": 0.8
}
]
}
Expand Down Expand Up @@ -169,7 +163,6 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) {
assert.Equal(t, 0.9, overallMetric.Score)
assert.Equal(t, status.EvalStatusPassed, overallMetric.EvalStatus)
assert.Equal(t, 0.8, overallMetric.Threshold)
assert.Equal(t, "trajectory matched", overallMetric.Details["comment"])

perInvocation := caseResult.EvalMetricResultPerInvocation[0]
assert.NotNil(t, perInvocation.ActualInvocation)
Expand All @@ -183,7 +176,6 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) {
assert.Equal(t, 0.9, perMetric.Score)
assert.Equal(t, status.EvalStatusPassed, perMetric.EvalStatus)
assert.Equal(t, 0.8, perMetric.Threshold)
assert.Equal(t, "per invocation matched", perMetric.Details["comment"])

encoded, marshalErr := json.Marshal(result)
assert.NoError(t, marshalErr)
Expand Down
25 changes: 12 additions & 13 deletions evaluation/evalresult/local/local.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,18 +123,21 @@ func (m *manager) evalSetResultPath(appName, evalSetResultID string) string {
// load loads the EvalSetResult from the file system.
func (m *manager) load(appName, evalSetResultID string) (*evalresult.EvalSetResult, error) {
path := m.evalSetResultPath(appName, evalSetResultID)
f, err := os.Open(path)
data, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("open file %s: %w", path, err)
}
defer f.Close()
var payload string
if err := json.NewDecoder(f).Decode(&payload); err != nil {
var res evalresult.EvalSetResult
if err := json.Unmarshal(data, &res); err == nil {
return &res, nil
}
// Keep backward compatibility with legacy string-wrapped results.
var legacy string
if err := json.Unmarshal(data, &legacy); err != nil {
return nil, fmt.Errorf("decode file %s: %w", path, err)
}
var res evalresult.EvalSetResult
if err := json.Unmarshal([]byte(payload), &res); err != nil {
return nil, fmt.Errorf("unmarshal eval set result %s: %w", path, err)
if err := json.Unmarshal([]byte(legacy), &res); err != nil {
return nil, fmt.Errorf("decode legacy content in file %s: %w", path, err)
}
return &res, nil
}
Expand All @@ -154,13 +157,9 @@ func (m *manager) store(appName string, evalSetResult *evalresult.EvalSetResult)
if err != nil {
return fmt.Errorf("open file %s: %w", tmp, err)
}
data, err := json.Marshal(evalSetResult)
if err != nil {
file.Close()
return fmt.Errorf("json marshal: %w", err)
}
encoder := json.NewEncoder(file)
if err := encoder.Encode(string(data)); err != nil {
encoder.SetIndent("", " ")
if err := encoder.Encode(evalSetResult); err != nil {
file.Close()
os.Remove(tmp)
return fmt.Errorf("encode file %s: %w", tmp, err)
Expand Down
4 changes: 4 additions & 0 deletions evaluation/evaluation.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/registry"
istatus "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/status"
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric"
"trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion"
"trpc.group/trpc-go/trpc-agent-go/evaluation/service"
"trpc.group/trpc-go/trpc-agent-go/evaluation/service/local"
"trpc.group/trpc-go/trpc-agent-go/evaluation/status"
Expand Down Expand Up @@ -195,6 +196,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu
count int
score float64
threshold float64
criterion *criterion.Criterion
}
// Group metrics results by metric name.
aggregatedMetrics := make(map[string]*aggregatedMetric)
Expand All @@ -208,6 +210,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu
}
aggregatedMetrics[metric.MetricName].count++
aggregatedMetrics[metric.MetricName].score += metric.Score
aggregatedMetrics[metric.MetricName].criterion = metric.Criterion
}
}
// Aggregate metrics results by metric name.
Expand All @@ -223,6 +226,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu
Score: average,
EvalStatus: evalStatus,
Threshold: aggregatedMetric.threshold,
Criterion: aggregatedMetric.criterion,
})
}
status, err := istatus.SummarizeMetricsStatus(metricsResults)
Expand Down
10 changes: 5 additions & 5 deletions evaluation/evaluator/evaluator.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,20 @@ type Evaluator interface {
// It mirrors the schema used by ADK Web, with JSON field names in camelCase to align with that format.
type EvaluateResult struct {
// OverallScore is the overall score for this evaluation.
OverallScore float64 `json:"overall_score,omitempty"`
OverallScore float64 `json:"overallScore,omitempty"`
// OverallStatus represents pass/fail/not-evaluated for the evaluation run.
OverallStatus status.EvalStatus `json:"overall_status,omitempty"`
OverallStatus status.EvalStatus `json:"overallStatus,omitempty"`
// PerInvocationResults contains results for each invocation.
PerInvocationResults []PerInvocationResult `json:"per_invocation_results,omitempty"`
PerInvocationResults []*PerInvocationResult `json:"perInvocationResults,omitempty"`
}

// PerInvocationResult represents the evaluation result for a single invocation.
// It mirrors the schema used by ADK Web, with JSON field names in camelCase to align with that format.
type PerInvocationResult struct {
// ActualInvocation is the invocation generated by the agent.
ActualInvocation *evalset.Invocation `json:"actual_invocation,omitempty"`
ActualInvocation *evalset.Invocation `json:"actualInvocation,omitempty"`
// ExpectedInvocation is the expected invocation.
ExpectedInvocation *evalset.Invocation `json:"expected_invocation,omitempty"`
ExpectedInvocation *evalset.Invocation `json:"expectedInvocation,omitempty"`
// Score is the evaluator's score for this invocation.
Score float64 `json:"score,omitempty"`
// Status indicates the evaluation status of the invocation.
Expand Down
Loading