trpc-group · Flash-LHR · Nov 20, 2025 · Nov 20, 2025 · Nov 20, 2025 · Nov 21, 2025
diff --git a/docs/mkdocs/en/evaluation.md b/docs/mkdocs/en/evaluation.md
diff --git a/docs/mkdocs/zh/evaluation.md b/docs/mkdocs/zh/evaluation.md
diff --git a/evaluation/evalresult/evalresult.go b/evaluation/evalresult/evalresult.go
@@ -15,6 +15,7 @@ import (
 
 	"trpc.group/trpc-go/trpc-agent-go/evaluation/epochtime"
 	"trpc.group/trpc-go/trpc-agent-go/evaluation/evalset"
+	"trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion"
 	"trpc.group/trpc-go/trpc-agent-go/evaluation/status"
 )
 
@@ -63,8 +64,10 @@ type EvalMetricResult struct {
 	EvalStatus status.EvalStatus `json:"evalStatus,omitempty"`
 	// Threshold that was used.
 	Threshold float64 `json:"threshold,omitempty"`
+	// Criterion contains the criterion used for this metric evaluation.
+	Criterion *criterion.Criterion `json:"criterion,omitempty"`
 	// Details contains additional metric-specific information.
-	Details map[string]any `json:"details,omitempty"`
+	Details *EvalMetricResultDetails `json:"details,omitempty"`
 }
 
 // EvalMetricResultPerInvocation represents metric results for a single invocation.
@@ -78,6 +81,17 @@ type EvalMetricResultPerInvocation struct {
 	EvalMetricResults []*EvalMetricResult `json:"evalMetricResults,omitempty"`
 }
 
+// ScoreResult represents the score for a single metric evaluation.
+// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format.
+type ScoreResult struct {
+	Score float64 `json:"score,omitempty"`
+}
+
+// EvalMetricResultDetails contains additional metric-specific information.
+// It mirrors the schema used by ADK Web, with field names in camelCase to align with the JSON format.
+type EvalMetricResultDetails struct {
+}
+
 // Manager defines the interface for managing evaluation results.
 type Manager interface {
 	// Save stores an evaluation result.

diff --git a/evaluation/evalresult/evalresult_test.go b/evaluation/evalresult/evalresult_test.go
@@ -33,10 +33,7 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) {
           "metricName": "tool_trajectory_avg_score",
           "score": 0.9,
           "evalStatus": 1,
-          "threshold": 0.8,
-          "details": {
-            "comment": "trajectory matched"
-          }
+          "threshold": 0.8
         }
       ],
       "evalMetricResultPerInvocation": [
@@ -131,10 +128,7 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) {
               "metricName": "tool_trajectory_avg_score",
               "score": 0.9,
               "evalStatus": 1,
-              "threshold": 0.8,
-              "details": {
-                "comment": "per invocation matched"
-              }
+              "threshold": 0.8
             }
           ]
         }
@@ -169,7 +163,6 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) {
 	assert.Equal(t, 0.9, overallMetric.Score)
 	assert.Equal(t, status.EvalStatusPassed, overallMetric.EvalStatus)
 	assert.Equal(t, 0.8, overallMetric.Threshold)
-	assert.Equal(t, "trajectory matched", overallMetric.Details["comment"])
 
 	perInvocation := caseResult.EvalMetricResultPerInvocation[0]
 	assert.NotNil(t, perInvocation.ActualInvocation)
@@ -183,7 +176,6 @@ func TestEvalSetResultJSONRoundTrip(t *testing.T) {
 	assert.Equal(t, 0.9, perMetric.Score)
 	assert.Equal(t, status.EvalStatusPassed, perMetric.EvalStatus)
 	assert.Equal(t, 0.8, perMetric.Threshold)
-	assert.Equal(t, "per invocation matched", perMetric.Details["comment"])
 
 	encoded, marshalErr := json.Marshal(result)
 	assert.NoError(t, marshalErr)

diff --git a/evaluation/evalresult/local/local.go b/evaluation/evalresult/local/local.go
@@ -123,18 +123,26 @@ func (m *manager) evalSetResultPath(appName, evalSetResultID string) string {
 // load loads the EvalSetResult from the file system.
+// It first decodes the file as a plain JSON object and, if that fails, falls
+// back to the legacy layout where the object is wrapped in a JSON string.
 func (m *manager) load(appName, evalSetResultID string) (*evalresult.EvalSetResult, error) {
 	path := m.evalSetResultPath(appName, evalSetResultID)
-	f, err := os.Open(path)
+	data, err := os.ReadFile(path)
 	if err != nil {
 		return nil, fmt.Errorf("open file %s: %w", path, err)
 	}
-	defer f.Close()
-	var payload string
-	if err := json.NewDecoder(f).Decode(&payload); err != nil {
-		return nil, fmt.Errorf("decode file %s: %w", path, err)
-	}
-	var res evalresult.EvalSetResult
-	if err := json.Unmarshal([]byte(payload), &res); err != nil {
-		return nil, fmt.Errorf("unmarshal eval set result %s: %w", path, err)
+	var res evalresult.EvalSetResult
+	directErr := json.Unmarshal(data, &res)
+	if directErr == nil {
+		return &res, nil
+	}
+	// Keep backward compatibility with legacy string-wrapped results.
+	var legacy string
+	if err := json.Unmarshal(data, &legacy); err != nil {
+		// The file is not a legacy payload either, so report the direct decode
+		// error: it names the real problem rather than a string type mismatch.
+		return nil, fmt.Errorf("decode file %s: %w", path, directErr)
+	}
+	if err := json.Unmarshal([]byte(legacy), &res); err != nil {
+		return nil, fmt.Errorf("decode legacy content in file %s: %w", path, err)
 	}
 	return &res, nil
 }
@@ -154,13 +157,9 @@ func (m *manager) store(appName string, evalSetResult *evalresult.EvalSetResult)
 	if err != nil {
 		return fmt.Errorf("open file %s: %w", tmp, err)
 	}
-	data, err := json.Marshal(evalSetResult)
-	if err != nil {
-		file.Close()
-		return fmt.Errorf("json marshal: %w", err)
-	}
 	encoder := json.NewEncoder(file)
-	if err := encoder.Encode(string(data)); err != nil {
+	encoder.SetIndent("", "  ")
+	if err := encoder.Encode(evalSetResult); err != nil {
 		file.Close()
 		os.Remove(tmp)
 		return fmt.Errorf("encode file %s: %w", tmp, err)

diff --git a/evaluation/evaluation.go b/evaluation/evaluation.go
@@ -20,6 +20,7 @@ import (
 	"trpc.group/trpc-go/trpc-agent-go/evaluation/evaluator/registry"
 	istatus "trpc.group/trpc-go/trpc-agent-go/evaluation/internal/status"
 	"trpc.group/trpc-go/trpc-agent-go/evaluation/metric"
+	"trpc.group/trpc-go/trpc-agent-go/evaluation/metric/criterion"
 	"trpc.group/trpc-go/trpc-agent-go/evaluation/service"
 	"trpc.group/trpc-go/trpc-agent-go/evaluation/service/local"
 	"trpc.group/trpc-go/trpc-agent-go/evaluation/status"
@@ -195,6 +196,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu
 		count     int
 		score     float64
 		threshold float64
+		criterion *criterion.Criterion
 	}
 	// Group metrics results by metric name.
 	aggregatedMetrics := make(map[string]*aggregatedMetric)
@@ -208,6 +210,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu
 			}
 			aggregatedMetrics[metric.MetricName].count++
 			aggregatedMetrics[metric.MetricName].score += metric.Score
+			aggregatedMetrics[metric.MetricName].criterion = metric.Criterion
 		}
 	}
 	// Aggregate metrics results by metric name.
@@ -223,6 +226,7 @@ func aggregateCaseRuns(caseID string, runs []*evalresult.EvalCaseResult) (*Evalu
 			Score:      average,
 			EvalStatus: evalStatus,
 			Threshold:  aggregatedMetric.threshold,
+			Criterion:  aggregatedMetric.criterion,
 		})
 	}
 	status, err := istatus.SummarizeMetricsStatus(metricsResults)

diff --git a/evaluation/evaluator/evaluator.go b/evaluation/evaluator/evaluator.go
@@ -33,20 +33,20 @@ type Evaluator interface {
 // It mirrors the schema used by ADK Web, with field names in camel to align with the JSON format.
 type EvaluateResult struct {
 	// OverallScore is the overall score for this evaluation.
-	OverallScore float64 `json:"overall_score,omitempty"`
+	OverallScore float64 `json:"overallScore,omitempty"`
 	// OverallStatus represents pass/fail/not-evaluated for the evaluation run.
-	OverallStatus status.EvalStatus `json:"overall_status,omitempty"`
+	OverallStatus status.EvalStatus `json:"overallStatus,omitempty"`
 	// PerInvocationResults contains results for each invocation.
-	PerInvocationResults []PerInvocationResult `json:"per_invocation_results,omitempty"`
+	PerInvocationResults []*PerInvocationResult `json:"perInvocationResults,omitempty"`
 }
 
 // PerInvocationResult represents the evaluation result for a single invocation.
 // It mirrors the schema used by ADK Web, with field names in camel to align with the JSON format.
 type PerInvocationResult struct {
 	// ActualInvocation is the invocation generated by the agent.
-	ActualInvocation *evalset.Invocation `json:"actual_invocation,omitempty"`
+	ActualInvocation *evalset.Invocation `json:"actualInvocation,omitempty"`
 	// ExpectedInvocation is the expected invocation.
-	ExpectedInvocation *evalset.Invocation `json:"expected_invocation,omitempty"`
+	ExpectedInvocation *evalset.Invocation `json:"expectedInvocation,omitempty"`
 	// Score is the evaluator's score for this invocation.
 	Score float64 `json:"score,omitempty"`
 	// Status indicates the evaluation status of the invocation.