Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions pkg/evaluation/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ func Evaluate(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName st
Name: runName,
Timestamp: startTime,
Duration: duration,
Config: cfg,
Results: results,
Summary: summary,
}
Expand Down Expand Up @@ -356,12 +357,18 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
if r.judge != nil && len(evals.Relevance) > 0 {
// Use transcript for relevance checking to preserve temporal ordering
transcript := buildTranscript(events)
passed, failed, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
results, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
if err != nil {
return result, fmt.Errorf("relevance check failed: %w", err)
}
result.RelevancePassed = float64(passed)
result.FailedRelevance = failed
var passed float64
for _, rr := range results {
if rr.Passed {
passed++
}
}
result.RelevancePassed = passed
result.RelevanceResults = results
}

slog.Debug("Evaluation complete", "title", evalSess.Title, "duration", time.Since(startTime))
Expand Down
4 changes: 2 additions & 2 deletions pkg/evaluation/eval_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ func TestResultCheckResults(t *testing.T) {
},
{
name: "relevance failures listed",
result: Result{RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}},
result: Result{RelevanceExpected: 2, RelevancePassed: 0, RelevanceResults: []RelevanceResult{{Criterion: "check A", Passed: false, Reason: "reason A"}, {Criterion: "check B", Passed: false, Reason: "reason B"}}},
wantSuccess: nil,
wantFailures: []string{"relevance: check A (reason: reason A)", "relevance: check B (reason: reason B)"},
},
Expand Down Expand Up @@ -658,7 +658,7 @@ func TestProgressBarPrintResult(t *testing.T) {
Size: "S",
RelevanceExpected: 2,
RelevancePassed: 1,
FailedRelevance: []RelevanceResult{{Criterion: "check failed", Reason: "did not meet criteria"}},
RelevanceResults: []RelevanceResult{{Criterion: "check failed", Passed: false, Reason: "did not meet criteria"}},
},
wantContains: []string{
"✗ mixed-session", // overall failed
Expand Down
41 changes: 20 additions & 21 deletions pkg/evaluation/judge.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,17 +97,18 @@ func (j *Judge) Validate(ctx context.Context) error {
// RelevanceResult contains the result of a single relevance check.
type RelevanceResult struct {
	// Criterion is the relevance criterion that was evaluated.
	Criterion string `json:"criterion"`
	// Passed reports whether the check succeeded for this criterion.
	Passed bool `json:"passed"`
	// Reason is the explanation returned by the judge for the verdict,
	// recorded for both passed and failed checks.
	Reason string `json:"reason"`
}

// CheckRelevance runs all relevance checks concurrently with the configured concurrency.
// It returns the number of passed checks, a slice of failed results with reasons, and an error
// if any check encountered an error (e.g. judge model misconfiguration). Errors cause a hard
// failure so that configuration issues are surfaced immediately rather than silently producing
// zero-relevance results.
func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (passed int, failed []RelevanceResult, err error) {
// It returns a result for every criterion (both passed and failed, each with a reason from
// the judge model), and an error if any check encountered an error (e.g. judge model
// misconfiguration). Errors cause a hard failure so that configuration issues are surfaced
// immediately rather than silently producing zero-relevance results.
func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (results []RelevanceResult, err error) {
if len(criteria) == 0 {
return 0, nil, nil
return nil, nil
}

// Create work channel
Expand All @@ -122,23 +123,23 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
close(work)

// Results slice preserves order
type result struct {
type rawResult struct {
passed bool
reason string
err error
}
results := make([]result, len(criteria))
rawResults := make([]rawResult, len(criteria))

var wg sync.WaitGroup
for range j.concurrency {
wg.Go(func() {
for item := range work {
if ctx.Err() != nil {
results[item.index] = result{err: fmt.Errorf("context cancelled: %w", ctx.Err())}
rawResults[item.index] = rawResult{err: fmt.Errorf("context cancelled: %w", ctx.Err())}
continue
}
pass, reason, checkErr := j.checkSingle(ctx, response, item.criterion)
results[item.index] = result{passed: pass, reason: reason, err: checkErr}
rawResults[item.index] = rawResult{passed: pass, reason: reason, err: checkErr}
}
})
}
Expand All @@ -147,26 +148,24 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
// Aggregate results. Any error is fatal — return it immediately so the
// caller can fail fast on judge misconfiguration.
var errs []error
for i, r := range results {
results = make([]RelevanceResult, len(criteria))
for i := range results {
results[i].Criterion = criteria[i]
}
for i, r := range rawResults {
if r.err != nil {
errs = append(errs, fmt.Errorf("checking %q: %w", criteria[i], r.err))
continue
}
if r.passed {
passed++
} else {
failed = append(failed, RelevanceResult{
Criterion: criteria[i],
Reason: r.reason,
})
}
results[i].Passed = r.passed
results[i].Reason = r.reason
}

if len(errs) > 0 {
return passed, failed, errors.Join(errs...)
return results, errors.Join(errs...)
}

return passed, failed, nil
return results, nil
}

// checkSingle checks a single relevance criterion against the response.
Expand Down
10 changes: 4 additions & 6 deletions pkg/evaluation/judge_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,9 @@ func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) {
t.Parallel()

judge := NewJudge(nil, 1)
passed, failed, err := judge.CheckRelevance(t.Context(), "some response", nil)
results, err := judge.CheckRelevance(t.Context(), "some response", nil)

assert.Equal(t, 0, passed)
assert.Empty(t, failed)
assert.Empty(t, results)
assert.NoError(t, err)
}

Expand All @@ -63,11 +62,10 @@ func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) {
cancel() // Cancel immediately

criteria := []string{"criterion1", "criterion2", "criterion3"}
passed, failed, err := judge.CheckRelevance(ctx, "some response", criteria)
results, err := judge.CheckRelevance(ctx, "some response", criteria)

// All should have errors due to context cancellation
assert.Equal(t, 0, passed)
assert.Empty(t, failed)
assert.Len(t, results, len(criteria))
require.Error(t, err)
assert.Contains(t, err.Error(), "context cancelled")
}
85 changes: 80 additions & 5 deletions pkg/evaluation/save.go
Original file line number Diff line number Diff line change
Expand Up @@ -374,11 +374,15 @@ func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
return saveJSON(run, filepath.Join(outputDir, run.Name+".json"))
}

// SaveRunSessionsJSON saves all eval sessions to a single JSON file.
// Each session includes its eval criteria in the "evals" field.
// This complements SaveRunSessions which saves to SQLite, providing a
// human-readable format for inspection.
// SaveRunSessionsJSON saves the full evaluation run output to a JSON file.
// The output includes run metadata (config, summary) and all sessions with
// their eval criteria and scoring results (pass/fail, judge reasoning, errors).
func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
// Populate eval results on each session
for i := range run.Results {
populateEvalResult(&run.Results[i])
}

// Collect all sessions from results
var sessions []*session.Session
for i := range run.Results {
Expand All @@ -387,8 +391,79 @@ func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
}
}

output := RunOutput{
Name: run.Name,
Timestamp: run.Timestamp,
Duration: run.Duration.Round(time.Millisecond).String(),
Config: RunOutputConfig{
Agent: run.Config.AgentFilename,
JudgeModel: run.Config.JudgeModel,
Concurrency: run.Config.Concurrency,
EvalsDir: run.Config.EvalsDir,
BaseImage: run.Config.BaseImage,
},
Summary: run.Summary,
Sessions: sessions,
}

outputPath := filepath.Join(outputDir, run.Name+".json")
return saveJSON(sessions, outputPath)
return saveJSON(output, outputPath)
}

// populateEvalResult copies scoring data from a Result to its Session's EvalResult field.
// It is a no-op when the Result carries no Session. Each optional check (size, tool
// calls, relevance) is attached only when that dimension was actually evaluated.
func populateEvalResult(result *Result) {
	sess := result.Session
	if sess == nil {
		return
	}

	successes, failures := result.checkResults()

	er := &session.EvalResult{
		Passed:       len(failures) == 0,
		Successes:    successes,
		Failures:     failures,
		Error:        result.Error,
		Cost:         result.Cost,
		OutputTokens: result.OutputTokens,
	}

	// A non-empty expectation means size was part of this eval.
	if result.SizeExpected != "" {
		er.Checks.Size = &session.SizeCheck{
			Passed:   result.Size == result.SizeExpected,
			Actual:   result.Size,
			Expected: result.SizeExpected,
		}
	}

	// A positive expected count means tool calls were scored.
	if result.ToolCallsExpected > 0 {
		er.Checks.ToolCalls = &session.ToolCallsCheck{
			Passed: result.ToolCallsScore >= 1.0,
			Score:  result.ToolCallsScore,
		}
	}

	// A positive expected count means relevance criteria were judged.
	if result.RelevanceExpected > 0 {
		criterionResults := make([]session.RelevanceCriterionResult, len(result.RelevanceResults))
		for i, rr := range result.RelevanceResults {
			criterionResults[i] = session.RelevanceCriterionResult{
				Criterion: rr.Criterion,
				Passed:    rr.Passed,
				Reason:    rr.Reason,
			}
		}

		er.Checks.Relevance = &session.RelevanceCheck{
			Passed:      result.RelevancePassed >= result.RelevanceExpected,
			PassedCount: result.RelevancePassed,
			Total:       result.RelevanceExpected,
			Results:     criterionResults,
		}
	}

	sess.EvalResult = er
}

func Save(sess *session.Session, filename string) (string, error) {
Expand Down
Loading
Loading