eval: add optional setup script support for eval sessions

dgageot · dgageot · commit 6105efbc88b5 · 2026-02-10T08:18:00.000+01:00
Add a 'setup' property to EvalCriteria that allows defining a sh script
to run inside the eval container before cagent exec. The script is
written to a temp file, bind-mounted into the container, and executed
as part of the entrypoint chain. This is useful for preparing the
container environment (e.g., installing packages, initializing repos)
before the agent runs.

Assisted-By: cagent
diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
@@ -309,7 +309,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
 		return result, fmt.Errorf("building eval image: %w", err)
 	}
 
-	events, err := r.runCagentInContainer(ctx, imageID, getUserMessages(evalSess.Session))
+	events, err := r.runCagentInContainer(ctx, imageID, getUserMessages(evalSess.Session), evals.Setup)
 	if err != nil {
 		return result, fmt.Errorf("running cagent in container: %w", err)
 	}
@@ -346,7 +346,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
 	return result, nil
 }
 
-func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, questions []string) ([]map[string]any, error) {
+func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, questions []string, setup string) ([]map[string]any, error) {
 	agentDir := r.agentSource.ParentDir()
 	agentFile := filepath.Base(r.agentSource.Name())
 	containerName := fmt.Sprintf("cagent-eval-%d", uuid.New().ID())
@@ -396,7 +396,31 @@ func (r *Runner) runCagentInContainer(ctx context.Context, imageID string, quest
 		}
 	}
 
-	args = append(args, imageID, "/configs/"+agentFile)
+	// When a setup script is provided, mount it into the container and
+	// override the entrypoint to run it before cagent exec.
+	// The default entrypoint is: /run.sh /cagent exec --yolo --json
+	// /run.sh starts dockerd then exec's "$@".
+	if setup != "" {
+		setupFile := filepath.Join(os.TempDir(), fmt.Sprintf("cagent-eval-setup-%d.sh", uuid.New().ID()))
+		if err := os.WriteFile(setupFile, []byte(setup), 0o600); err != nil {
+			return nil, fmt.Errorf("writing setup script: %w", err)
+		}
+		defer os.Remove(setupFile)
+
+		args = append(args,
+			"-v", setupFile+":/setup.sh:ro",
+			"--entrypoint", "/run.sh",
+		)
+	}
+
+	args = append(args, imageID)
+
+	if setup != "" {
+		// Run setup script, then cagent exec with the original arguments.
+		args = append(args, "sh", "-c", "sh /setup.sh && exec /cagent exec --yolo --json \"$@\"", "--", "/configs/"+agentFile)
+	} else {
+		args = append(args, "/configs/"+agentFile)
+	}
 	args = append(args, questions...)
 
 	cmd := exec.CommandContext(ctx, "docker", args...)
diff --git a/pkg/session/session.go b/pkg/session/session.go
@@ -207,6 +207,7 @@ type EvalCriteria struct {
 	Relevance  []string `json:"relevance"`             // Statements that should be true about the response
 	WorkingDir string   `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
 	Size       string   `json:"size,omitempty"`        // Expected response size: S, M, L, XL
+	Setup      string   `json:"setup,omitempty"`       // Optional sh script to run in the container before cagent exec
 }
 
 // Session helper methods

Original file line number	Diff line number	Diff line change
`@@ -207,6 +207,7 @@ type EvalCriteria struct {`
`207`	`207`	Relevance []string `json:"relevance"` // Statements that should be true about the response
`208`	`208`	WorkingDir string `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
`209`	`209`	Size string `json:"size,omitempty"` // Expected response size: S, M, L, XL
	`210`	+ Setup string `json:"setup,omitempty"` // Optional sh script to run in the container before cagent exec
`210`	`211`	`}`
`211`	`212`
`212`	`213`	`// Session helper methods`