diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go index 4e09a80d9..cbfe213ef 100644 --- a/pkg/evaluation/eval.go +++ b/pkg/evaluation/eval.go @@ -90,6 +90,7 @@ func Evaluate(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName st Name: runName, Timestamp: startTime, Duration: duration, + Config: cfg, Results: results, Summary: summary, } @@ -356,12 +357,18 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res if r.judge != nil && len(evals.Relevance) > 0 { // Use transcript for relevance checking to preserve temporal ordering transcript := buildTranscript(events) - passed, failed, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance) + results, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance) if err != nil { return result, fmt.Errorf("relevance check failed: %w", err) } - result.RelevancePassed = float64(passed) - result.FailedRelevance = failed + var passed float64 + for _, rr := range results { + if rr.Passed { + passed++ + } + } + result.RelevancePassed = passed + result.RelevanceResults = results } slog.Debug("Evaluation complete", "title", evalSess.Title, "duration", time.Since(startTime)) diff --git a/pkg/evaluation/eval_test.go b/pkg/evaluation/eval_test.go index d5ce06e47..29f38cf2c 100644 --- a/pkg/evaluation/eval_test.go +++ b/pkg/evaluation/eval_test.go @@ -196,7 +196,7 @@ func TestResultCheckResults(t *testing.T) { }, { name: "relevance failures listed", - result: Result{RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}}, + result: Result{RelevanceExpected: 2, RelevancePassed: 0, RelevanceResults: []RelevanceResult{{Criterion: "check A", Passed: false, Reason: "reason A"}, {Criterion: "check B", Passed: false, Reason: "reason B"}}}, wantSuccess: nil, wantFailures: []string{"relevance: check A (reason: reason A)", "relevance: check B (reason: reason B)"}, }, @@ -658,7 +658,7 @@ func TestProgressBarPrintResult(t *testing.T) { Size: "S", RelevanceExpected: 2, RelevancePassed: 1, - FailedRelevance: []RelevanceResult{{Criterion: "check failed", Reason: "did not meet criteria"}}, + RelevanceResults: []RelevanceResult{{Criterion: "check failed", Passed: false, Reason: "did not meet criteria"}}, }, wantContains: []string{ "✗ mixed-session", // overall failed diff --git a/pkg/evaluation/judge.go b/pkg/evaluation/judge.go index 6062132ae..38ae652fd 100644 --- a/pkg/evaluation/judge.go +++ b/pkg/evaluation/judge.go @@ -97,17 +97,18 @@ func (j *Judge) Validate(ctx context.Context) error { // RelevanceResult contains the result of a single relevance check. type RelevanceResult struct { Criterion string `json:"criterion"` + Passed bool `json:"passed"` Reason string `json:"reason"` } // CheckRelevance runs all relevance checks concurrently with the configured concurrency. -// It returns the number of passed checks, a slice of failed results with reasons, and an error -// if any check encountered an error (e.g. judge model misconfiguration). Errors cause a hard -// failure so that configuration issues are surfaced immediately rather than silently producing -// zero-relevance results. -func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (passed int, failed []RelevanceResult, err error) { +// It returns a result for every criterion (both passed and failed, each with a reason from +// the judge model), and an error if any check encountered an error (e.g. judge model +// misconfiguration). Errors cause a hard failure so that configuration issues are surfaced +// immediately rather than silently producing zero-relevance results. +func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (results []RelevanceResult, err error) { if len(criteria) == 0 { - return 0, nil, nil + return nil, nil } // Create work channel @@ -122,23 +123,23 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria [] close(work) // Results slice preserves order - type result struct { + type rawResult struct { passed bool reason string err error } - results := make([]result, len(criteria)) + rawResults := make([]rawResult, len(criteria)) var wg sync.WaitGroup for range j.concurrency { wg.Go(func() { for item := range work { if ctx.Err() != nil { - results[item.index] = result{err: fmt.Errorf("context cancelled: %w", ctx.Err())} + rawResults[item.index] = rawResult{err: fmt.Errorf("context cancelled: %w", ctx.Err())} continue } pass, reason, checkErr := j.checkSingle(ctx, response, item.criterion) - results[item.index] = result{passed: pass, reason: reason, err: checkErr} + rawResults[item.index] = rawResult{passed: pass, reason: reason, err: checkErr} } }) } @@ -147,26 +148,24 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria [] // Aggregate results. Any error is fatal — return it immediately so the // caller can fail fast on judge misconfiguration. var errs []error - for i, r := range results { + results = make([]RelevanceResult, len(criteria)) + for i := range results { + results[i].Criterion = criteria[i] + } + for i, r := range rawResults { if r.err != nil { errs = append(errs, fmt.Errorf("checking %q: %w", criteria[i], r.err)) continue } - if r.passed { - passed++ - } else { - failed = append(failed, RelevanceResult{ - Criterion: criteria[i], - Reason: r.reason, - }) - } + results[i].Passed = r.passed + results[i].Reason = r.reason } if len(errs) > 0 { - return passed, failed, errors.Join(errs...) + return results, errors.Join(errs...) } - return passed, failed, nil + return results, nil } // checkSingle checks a single relevance criterion against the response. diff --git a/pkg/evaluation/judge_test.go b/pkg/evaluation/judge_test.go index 4b54ca64e..c52b313a6 100644 --- a/pkg/evaluation/judge_test.go +++ b/pkg/evaluation/judge_test.go @@ -47,10 +47,9 @@ func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) { t.Parallel() judge := NewJudge(nil, 1) - passed, failed, err := judge.CheckRelevance(t.Context(), "some response", nil) + results, err := judge.CheckRelevance(t.Context(), "some response", nil) - assert.Equal(t, 0, passed) - assert.Empty(t, failed) + assert.Empty(t, results) assert.NoError(t, err) } @@ -63,11 +62,10 @@ func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) { cancel() // Cancel immediately criteria := []string{"criterion1", "criterion2", "criterion3"} - passed, failed, err := judge.CheckRelevance(ctx, "some response", criteria) + results, err := judge.CheckRelevance(ctx, "some response", criteria) // All should have errors due to context cancellation - assert.Equal(t, 0, passed) - assert.Empty(t, failed) + assert.Len(t, results, len(criteria)) require.Error(t, err) assert.Contains(t, err.Error(), "context cancelled") } diff --git a/pkg/evaluation/save.go b/pkg/evaluation/save.go index 5177b2ea2..011507dfe 100644 --- a/pkg/evaluation/save.go +++ b/pkg/evaluation/save.go @@ -374,11 +374,15 @@ func SaveRunJSON(run *EvalRun, outputDir string) (string, error) { return saveJSON(run, filepath.Join(outputDir, run.Name+".json")) } -// SaveRunSessionsJSON saves all eval sessions to a single JSON file. -// Each session includes its eval criteria in the "evals" field. -// This complements SaveRunSessions which saves to SQLite, providing a -// human-readable format for inspection. +// SaveRunSessionsJSON saves the full evaluation run output to a JSON file. +// The output includes run metadata (config, summary) and all sessions with +// their eval criteria and scoring results (pass/fail, judge reasoning, errors). func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) { + // Populate eval results on each session + for i := range run.Results { + populateEvalResult(&run.Results[i]) + } + // Collect all sessions from results var sessions []*session.Session for i := range run.Results { @@ -387,8 +391,79 @@ func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) { } } + output := RunOutput{ + Name: run.Name, + Timestamp: run.Timestamp, + Duration: run.Duration.Round(time.Millisecond).String(), + Config: RunOutputConfig{ + Agent: run.Config.AgentFilename, + JudgeModel: run.Config.JudgeModel, + Concurrency: run.Config.Concurrency, + EvalsDir: run.Config.EvalsDir, + BaseImage: run.Config.BaseImage, + }, + Summary: run.Summary, + Sessions: sessions, + } + outputPath := filepath.Join(outputDir, run.Name+".json") - return saveJSON(sessions, outputPath) + return saveJSON(output, outputPath) +} + +// populateEvalResult copies scoring data from a Result to its Session's EvalResult field. +func populateEvalResult(result *Result) { + if result.Session == nil { + return + } + + successes, failures := result.checkResults() + + evalResult := &session.EvalResult{ + Passed: len(failures) == 0, + Successes: successes, + Failures: failures, + Error: result.Error, + Cost: result.Cost, + OutputTokens: result.OutputTokens, + } + + // Populate size check if size was evaluated + if result.SizeExpected != "" { + evalResult.Checks.Size = &session.SizeCheck{ + Passed: result.Size == result.SizeExpected, + Actual: result.Size, + Expected: result.SizeExpected, + } + } + + // Populate tool calls check if tool calls were evaluated + if result.ToolCallsExpected > 0 { + evalResult.Checks.ToolCalls = &session.ToolCallsCheck{ + Passed: result.ToolCallsScore >= 1.0, + Score: result.ToolCallsScore, + } + } + + // Populate relevance check if relevance was evaluated + if result.RelevanceExpected > 0 { + results := make([]session.RelevanceCriterionResult, 0, len(result.RelevanceResults)) + for _, rr := range result.RelevanceResults { + results = append(results, session.RelevanceCriterionResult{ + Criterion: rr.Criterion, + Passed: rr.Passed, + Reason: rr.Reason, + }) + } + + evalResult.Checks.Relevance = &session.RelevanceCheck{ + Passed: result.RelevancePassed >= result.RelevanceExpected, + PassedCount: result.RelevancePassed, + Total: result.RelevanceExpected, + Results: results, + } + } + + result.Session.EvalResult = evalResult } func Save(sess *session.Session, filename string) (string, error) { diff --git a/pkg/evaluation/save_test.go b/pkg/evaluation/save_test.go index aa90c9943..42223757b 100644 --- a/pkg/evaluation/save_test.go +++ b/pkg/evaluation/save_test.go @@ -131,6 +131,9 @@ func TestSaveRunSessionsJSON(t *testing.T) { sess1.InputTokens = 100 sess1.OutputTokens = 50 sess1.Cost = 0.01 + sess1.Evals = &session.EvalCriteria{ + Relevance: []string{"mentions Paris", "mentions France"}, + } sess2 := session.New( session.WithTitle("eval-json-2"), @@ -139,23 +142,54 @@ func TestSaveRunSessionsJSON(t *testing.T) { sess2.InputTokens = 80 sess2.OutputTokens = 30 sess2.Cost = 0.005 + sess2.Evals = &session.EvalCriteria{ + Relevance: []string{"gives the correct answer", "explains the math"}, + } // Create an eval run with sessions and eval criteria run := &EvalRun{ Name: "test-json-001", Timestamp: time.Now(), + Duration: 42 * time.Second, + Config: Config{ + AgentFilename: "./test-agent.yaml", + JudgeModel: "anthropic/claude-opus-4-5", + Concurrency: 4, + EvalsDir: "./evals", + }, + Summary: Summary{ + TotalEvals: 3, + FailedEvals: 1, + TotalCost: 0.015, + }, Results: []Result{ { - Title: "eval-json-1", - Question: "What is the capital of France?", - Response: "Paris is the capital of France.", - Session: sess1, + Title: "eval-json-1", + Question: "What is the capital of France?", + Response: "Paris is the capital of France.", + Cost: 0.01, + OutputTokens: 50, + RelevancePassed: 2, + RelevanceExpected: 2, + RelevanceResults: []RelevanceResult{ + {Criterion: "mentions Paris", Passed: true, Reason: "response includes Paris"}, + {Criterion: "mentions France", Passed: true, Reason: "response includes France"}, + }, + Session: sess1, }, { - Title: "eval-json-2", - Question: "What is 2+2?", - Response: "4", - Session: sess2, + Title: "eval-json-2", + Question: "What is 2+2?", + Response: "4", + Cost: 0.005, + OutputTokens: 30, + RelevancePassed: 1, + RelevanceExpected: 2, + RelevanceResults: []RelevanceResult{ + {Criterion: "gives the correct answer", Passed: true, Reason: "the response says 4"}, + {Criterion: "explains the math", Passed: false, Reason: "no explanation given"}, + }, + Session: sess2, }, { // Result without a session (error case) @@ -176,16 +210,29 @@ func TestSaveRunSessionsJSON(t *testing.T) { data, err := os.ReadFile(sessionsPath) require.NoError(t, err) - var loadedSessions []*session.Session - err = json.Unmarshal(data, &loadedSessions) + var output RunOutput + err = json.Unmarshal(data, &output) require.NoError(t, err) + // Verify run-level metadata + assert.Equal(t, "test-json-001", output.Name) + assert.Equal(t, "42s", output.Duration) + assert.Equal(t, "./test-agent.yaml", output.Config.Agent) + assert.Equal(t, "anthropic/claude-opus-4-5", output.Config.JudgeModel) + assert.Equal(t, 4, output.Config.Concurrency) + assert.Equal(t, "./evals", output.Config.EvalsDir) + + // Verify summary + assert.Equal(t, 3, output.Summary.TotalEvals) + assert.Equal(t, 1, output.Summary.FailedEvals) + assert.InDelta(t, 0.015, output.Summary.TotalCost, 0.0001) + // Should have 2 sessions (excluding the error case) - assert.Len(t, loadedSessions, 2) + assert.Len(t, output.Sessions, 2) // Verify session content titles := make(map[string]*session.Session) - for _, sess := range loadedSessions { + for _, sess := range output.Sessions { titles[sess.Title] = sess } @@ -198,10 +245,49 @@ func TestSaveRunSessionsJSON(t *testing.T) { assert.Equal(t, int64(50), sess1Loaded.OutputTokens) assert.InDelta(t, 0.01, sess1Loaded.Cost, 0.0001) + // Verify eval results are populated + require.NotNil(t, sess1Loaded.EvalResult) + assert.True(t, sess1Loaded.EvalResult.Passed) + assert.NotEmpty(t, sess1Loaded.EvalResult.Successes) + assert.Empty(t, sess1Loaded.EvalResult.Failures) + assert.InDelta(t, 0.01, sess1Loaded.EvalResult.Cost, 0.0001) + assert.Equal(t, int64(50), sess1Loaded.EvalResult.OutputTokens) + + // Verify structured relevance check + require.NotNil(t, sess1Loaded.EvalResult.Checks.Relevance) + assert.True(t, sess1Loaded.EvalResult.Checks.Relevance.Passed) + assert.InDelta(t, 2, sess1Loaded.EvalResult.Checks.Relevance.PassedCount, 0.01) + assert.InDelta(t, 2, sess1Loaded.EvalResult.Checks.Relevance.Total, 0.01) + + // No size or tool calls checks were configured + assert.Nil(t, sess1Loaded.EvalResult.Checks.Size) + assert.Nil(t, sess1Loaded.EvalResult.Checks.ToolCalls) + sess2Loaded := titles["eval-json-2"] assert.Equal(t, int64(80), sess2Loaded.InputTokens) assert.Equal(t, int64(30), sess2Loaded.OutputTokens) assert.InDelta(t, 0.005, sess2Loaded.Cost, 0.0001) + + // Verify failed eval result + require.NotNil(t, sess2Loaded.EvalResult) + assert.False(t, sess2Loaded.EvalResult.Passed) + assert.NotEmpty(t, sess2Loaded.EvalResult.Failures) + + // Verify structured relevance check with per-criterion results + require.NotNil(t, sess2Loaded.EvalResult.Checks.Relevance) + assert.False(t, sess2Loaded.EvalResult.Checks.Relevance.Passed) + assert.InDelta(t, 1, sess2Loaded.EvalResult.Checks.Relevance.PassedCount, 0.01) + assert.InDelta(t, 2, sess2Loaded.EvalResult.Checks.Relevance.Total, 0.01) + require.Len(t, sess2Loaded.EvalResult.Checks.Relevance.Results, 2) + + // First criterion should be passed with reason + assert.True(t, sess2Loaded.EvalResult.Checks.Relevance.Results[0].Passed) + assert.Equal(t, "the response says 4", sess2Loaded.EvalResult.Checks.Relevance.Results[0].Reason) + + // Second criterion should be failed with reason + assert.False(t, sess2Loaded.EvalResult.Checks.Relevance.Results[1].Passed) + assert.Equal(t, "explains the math", sess2Loaded.EvalResult.Checks.Relevance.Results[1].Criterion) + assert.Equal(t, "no explanation given", sess2Loaded.EvalResult.Checks.Relevance.Results[1].Reason) } func TestSaveRunSessionsWithCost(t *testing.T) { diff --git a/pkg/evaluation/types.go b/pkg/evaluation/types.go index 5b6483987..7d2c012c0 100644 --- a/pkg/evaluation/types.go +++ b/pkg/evaluation/types.go @@ -28,7 +28,7 @@ type Result struct { ToolCallsExpected float64 `json:"tool_calls_score_expected"` RelevancePassed float64 `json:"relevance"` RelevanceExpected float64 `json:"relevance_expected"` - FailedRelevance []RelevanceResult `json:"failed_relevance,omitempty"` + RelevanceResults []RelevanceResult `json:"relevance_results,omitempty"` Error string `json:"error,omitempty"` RawOutput []map[string]any `json:"raw_output,omitempty"` Session *session.Session `json:"-"` // Full session for database storage (not in JSON) @@ -63,11 +63,13 @@ func (r *Result) checkResults() (successes, failures []string) { if r.RelevancePassed >= r.RelevanceExpected { successes = append(successes, fmt.Sprintf("relevance %.0f/%.0f", r.RelevancePassed, r.RelevanceExpected)) } else { - for _, result := range r.FailedRelevance { - if result.Reason != "" { - failures = append(failures, fmt.Sprintf("relevance: %s (reason: %s)", result.Criterion, result.Reason)) - } else { - failures = append(failures, "relevance: "+result.Criterion) + for _, result := range r.RelevanceResults { + if !result.Passed { + if result.Reason != "" { + failures = append(failures, fmt.Sprintf("relevance: %s (reason: %s)", result.Criterion, result.Reason)) + } else { + failures = append(failures, "relevance: "+result.Criterion) + } } } } @@ -94,10 +96,30 @@ type EvalRun struct { Name string `json:"name"` Timestamp time.Time `json:"timestamp"` Duration time.Duration `json:"duration"` + Config Config `json:"-"` // Used to build RunOutput, not serialized directly Results []Result `json:"results"` Summary Summary `json:"summary"` } +// RunOutput is the top-level structure for the evaluation run JSON output. +type RunOutput struct { + Name string `json:"name"` + Timestamp time.Time `json:"timestamp"` + Duration string `json:"duration"` + Config RunOutputConfig `json:"config"` + Summary Summary `json:"summary"` + Sessions []*session.Session `json:"sessions"` +} + +// RunOutputConfig captures the evaluation run configuration. +type RunOutputConfig struct { + Agent string `json:"agent"` + JudgeModel string `json:"judge_model,omitempty"` + Concurrency int `json:"concurrency"` + EvalsDir string `json:"evals_dir"` + BaseImage string `json:"base_image,omitempty"` +} + // Config holds configuration for evaluation runs. type Config struct { AgentFilename string // Path to the agent configuration file diff --git a/pkg/session/session.go b/pkg/session/session.go index 522c95d0a..808075ca0 100644 --- a/pkg/session/session.go +++ b/pkg/session/session.go @@ -74,6 +74,9 @@ type Session struct { // Evals contains evaluation criteria for this session (used by eval framework) Evals *EvalCriteria `json:"evals,omitempty"` + // EvalResult contains the evaluation scoring outcome (populated after eval run). + EvalResult *EvalResult `json:"eval_result,omitempty"` + // Messages holds the conversation history (messages and sub-sessions) Messages []Item `json:"messages"` @@ -229,6 +232,53 @@ func NewSubSessionItem(subSession *Session) Item { return Item{SubSession: subSession} } +// EvalResult contains the evaluation scoring outcome for a session. +type EvalResult struct { + Passed bool `json:"passed"` + Successes []string `json:"successes,omitempty"` + Failures []string `json:"failures,omitempty"` + Error string `json:"error,omitempty"` + Cost float64 `json:"cost"` + OutputTokens int64 `json:"output_tokens"` + Checks EvalResultChecks `json:"checks"` +} + +// EvalResultChecks groups the individual check results. +// Only checks that were evaluated will be present (omitted if nil). +type EvalResultChecks struct { + Size *SizeCheck `json:"size,omitempty"` + ToolCalls *ToolCallsCheck `json:"tool_calls,omitempty"` + Relevance *RelevanceCheck `json:"relevance,omitempty"` +} + +// SizeCheck contains the result of the response size check. +type SizeCheck struct { + Passed bool `json:"passed"` + Actual string `json:"actual"` + Expected string `json:"expected"` +} + +// ToolCallsCheck contains the result of the tool calls F1 score check. +type ToolCallsCheck struct { + Passed bool `json:"passed"` + Score float64 `json:"score"` +} + +// RelevanceCheck contains the result of the LLM judge relevance check. +type RelevanceCheck struct { + Passed bool `json:"passed"` + PassedCount float64 `json:"passed_count"` + Total float64 `json:"total"` + Results []RelevanceCriterionResult `json:"results"` +} + +// RelevanceCriterionResult contains the judge's verdict on a single relevance criterion. +type RelevanceCriterionResult struct { + Criterion string `json:"criterion"` + Passed bool `json:"passed"` + Reason string `json:"reason,omitempty"` +} + // EvalCriteria contains the evaluation criteria for a session. type EvalCriteria struct { Relevance []string `json:"relevance"` // Statements that should be true about the response