4747 evalParallel int
4848 evalDryRun bool
4949 evalUseMCPTools bool
50+ evalUseSkills bool
5051 evalDisableMCP bool
5152 evalNoSandbox bool
5253 evalLegacy bool
@@ -293,6 +294,7 @@ type EvalSummary struct {
293294 ByTier map [string ]EvalAggregate `json:"by_tier,omitempty"`
294295 ByDifficulty map [string ]EvalAggregate `json:"by_difficulty,omitempty"`
295296 UseMCPTools bool `json:"use_mcp_tools"`
297+ UseSkills bool `json:"use_skills"`
296298 DisableMCP bool `json:"disable_mcp"`
297299 Sandbox bool `json:"sandbox"`
298300 Legacy bool `json:"legacy"`
@@ -326,6 +328,7 @@ type SharedConfig struct {
326328 Parallel int
327329 KeepWorkspaces bool
328330 UseMCPTools bool
331+ UseSkills bool
329332 DisableMCP bool
330333 NoSandbox bool
331334 Legacy bool
@@ -344,6 +347,7 @@ type RunConfig struct {
344347 Timeout int `json:"timeout"`
345348 Parallel int `json:"parallel"`
346349 UseMCPTools bool `json:"use_mcp_tools"`
350+ UseSkills bool `json:"use_skills"`
347351 DisableMCP bool `json:"disable_mcp"`
348352 NoSandbox bool `json:"no_sandbox"`
349353 Legacy bool `json:"legacy"`
@@ -406,8 +410,8 @@ Examples:
406410 shared := SharedConfig {
407411 Tier : evalTier , Difficulty : evalDifficulty , Lang : evalLang ,
408412 Tasks : evalTasks , Timeout : evalTimeout , Parallel : evalParallel ,
409- KeepWorkspaces : evalKeepWorkspaces , UseMCPTools : evalUseMCPTools ,
410- DisableMCP : evalDisableMCP , NoSandbox : evalNoSandbox ,
413+ KeepWorkspaces : evalKeepWorkspaces , UseMCPTools : evalUseMCPTools ,
414+ UseSkills : evalUseSkills , DisableMCP : evalDisableMCP , NoSandbox : evalNoSandbox ,
411415 Legacy : evalLegacy , DryRun : evalDryRun ,
412416 }
413417
@@ -439,8 +443,8 @@ Examples:
439443 shared = SharedConfig {
440444 Tier : evalTier , Difficulty : evalDifficulty , Lang : evalLang ,
441445 Tasks : evalTasks , Timeout : evalTimeout , Parallel : evalParallel ,
442- KeepWorkspaces : evalKeepWorkspaces , UseMCPTools : evalUseMCPTools ,
443- DisableMCP : evalDisableMCP , NoSandbox : evalNoSandbox ,
446+ KeepWorkspaces : evalKeepWorkspaces , UseMCPTools : evalUseMCPTools ,
447+ UseSkills : evalUseSkills , DisableMCP : evalDisableMCP , NoSandbox : evalNoSandbox ,
444448 Legacy : evalLegacy , DryRun : evalDryRun ,
445449 }
446450
@@ -775,6 +779,7 @@ func evalRunSingle( //nolint:gocognit,gocyclo,maintidx
775779 evalModel = spec .Model
776780 evalReasoning = spec .Reasoning
777781 evalUseMCPTools = shared .UseMCPTools
782+ evalUseSkills = shared .UseSkills
778783 evalDisableMCP = shared .DisableMCP
779784 evalLegacy = shared .Legacy
780785 evalKeepWorkspaces = shared .KeepWorkspaces
@@ -1279,6 +1284,7 @@ func evalRunSingle( //nolint:gocognit,gocyclo,maintidx
12791284 ByTier : finalize (byTier ),
12801285 ByDifficulty : finalize (byDifficulty ),
12811286 UseMCPTools : shared .UseMCPTools ,
1287+ UseSkills : shared .UseSkills ,
12821288 DisableMCP : shared .DisableMCP ,
12831289 Sandbox : evalSandboxActive ,
12841290 Legacy : shared .Legacy ,
@@ -1408,7 +1414,7 @@ func runTaskWithAgent(ctx context.Context, r *runner.Runner, t *task.Task, agent
14081414 }
14091415
14101416 // Build agent command
1411- prompt := buildAgentPrompt (t , evalUseMCPTools , agentCfg .MCPPrompt )
1417+ prompt := buildAgentPrompt (t , evalUseMCPTools , evalUseSkills , agentCfg .MCPPrompt )
14121418 result .PromptChars = utf8 .RuneCountInString (prompt )
14131419 agentTimeout := resolveAgentTimeout (timeout , agentCfg .DefaultTimeout , t .AgentTimeout )
14141420
@@ -2095,7 +2101,7 @@ func toolchainInfo(lang task.Language) string {
20952101 }
20962102}
20972103
2098- func buildAgentPrompt (t * task.Task , useMCPTools bool , mcpPrompt string ) string {
2104+ func buildAgentPrompt (t * task.Task , useMCPTools , useSkills bool , mcpPrompt string ) string {
20992105 stubFiles := make ([]string , 0 , len (t .Files .Stub ))
21002106 for _ , f := range t .Files .Stub {
21012107 stubFiles = append (stubFiles , task .StripTxtExtension (f ))
@@ -2112,6 +2118,9 @@ func buildAgentPrompt(t *task.Task, useMCPTools bool, mcpPrompt string) string {
21122118 mcpEnvironmentLine := ""
21132119 mcpImportantLine := ""
21142120 mcpRuleLine := ""
2121+ skillsEnvironmentLine := ""
2122+ skillsImportantLine := ""
2123+ skillsRuleLine := ""
21152124 taskInstructions := `1. Read the stub file(s) (function signatures with panic()/todo!/Unimplemented placeholders).
211621252. Read the visible test file(s) to understand expected behavior and edge cases.
211721263. Implement the stub file(s), replacing placeholders with working code.
@@ -2128,6 +2137,11 @@ func buildAgentPrompt(t *task.Task, useMCPTools bool, mcpPrompt string) string {
21282137 mcpImportantLine = "\n - Prefer your MCP server tools over built-in alternatives if both can accomplish the same step or objective."
21292138 mcpRuleLine = "\n - You MUST actively use your MCP server tools to assist you with your work. Do NOT ignore them. Make your first MCP server tool call before writing any code."
21302139 }
2140+ if useSkills {
2141+ skillsEnvironmentLine = "\n - You have access to Agent Skills. Check your available skills and read their documentation before starting work."
2142+ skillsImportantLine = "\n - Prefer your Agent Skills over manual alternatives if both can accomplish the same step or objective."
2143+ skillsRuleLine = "\n - You MUST actively use your Agent Skills to assist you with your work. Do NOT ignore them. Review your available skills before writing any code."
2144+ }
21312145
21322146 prompt := fmt .Sprintf (`You are solving a coding task called "%s".
21332147
@@ -2145,23 +2159,23 @@ ENVIRONMENT:
21452159- Final validation runs automatically in a Docker container.
21462160- Toolchain: %s
21472161- You may run local tests/commands in the workspace while iterating.
2148- - Toolchains are preinstalled; extra installs are optional.%s
2162+ - Toolchains are preinstalled; extra installs are optional.%s%s
21492163
21502164YOUR TASK:
21512165%s
21522166
21532167IMPORTANT:
2154- - There may be hidden tests that check additional edge cases for the same public API.%s
2168+ - There may be hidden tests that check additional edge cases for the same public API.%s%s
21552169
21562170RULES:
21572171- ONLY edit the stub/solution source file(s).
21582172- Do NOT modify test files or support files.
21592173- You may add new helper source files if needed.
21602174- Evaluation fails if you modify protected files.
2161- - Do NOT navigate to parent directories or read files outside the workspace.%s` ,
2175+ - Do NOT navigate to parent directories or read files outside the workspace.%s%s ` ,
21622176 t .Name , t .Language , t .Tier , t .Difficulty , t .Description ,
21632177 strings .Join (stubFiles , ", " ), strings .Join (testFiles , ", " ),
2164- toolchainInfo (t .Language ), mcpEnvironmentLine , taskInstructions , mcpImportantLine , mcpRuleLine )
2178+ toolchainInfo (t .Language ), mcpEnvironmentLine , skillsEnvironmentLine , taskInstructions , mcpImportantLine , skillsImportantLine , mcpRuleLine , skillsRuleLine )
21652179
21662180 return prompt
21672181}
@@ -3137,6 +3151,7 @@ type LeaderboardSubmission struct {
31373151 Timeout int `json:"timeout"`
31383152 Parallel int `json:"parallel"`
31393153 UseMCPTools bool `json:"use_mcp_tools"`
3154+ UseSkills bool `json:"use_skills"`
31403155 DisableMCP bool `json:"disable_mcp"`
31413156 Sandbox bool `json:"sandbox"`
31423157 Legacy bool `json:"legacy"`
@@ -3181,6 +3196,7 @@ func generateLeaderboardSubmission(summary EvalSummary, attestation *EvalAttesta
31813196 Timeout : summary .Timeout ,
31823197 Parallel : summary .Parallel ,
31833198 UseMCPTools : summary .UseMCPTools ,
3199+ UseSkills : summary .UseSkills ,
31843200 DisableMCP : summary .DisableMCP ,
31853201 Sandbox : summary .Sandbox ,
31863202 Legacy : summary .Legacy ,
@@ -3252,6 +3268,9 @@ func writeReportSummary(sb *strings.Builder, summary EvalSummary) {
32523268 if summary .UseMCPTools {
32533269 sb .WriteString ("| MCP Tools Mode | Yes |\n " )
32543270 }
3271+ if summary .UseSkills {
3272+ sb .WriteString ("| Skills Mode | Yes |\n " )
3273+ }
32553274 if summary .DisableMCP {
32563275 sb .WriteString ("| MCP Disabled | Yes |\n " )
32573276 }
@@ -3814,6 +3833,7 @@ func saveRunConfig(outputDir string, allTasks []*task.Task) error {
38143833 Timeout : evalTimeout ,
38153834 Parallel : evalParallel ,
38163835 UseMCPTools : evalUseMCPTools ,
3836+ UseSkills : evalUseSkills ,
38173837 DisableMCP : evalDisableMCP ,
38183838 NoSandbox : evalNoSandbox ,
38193839 Legacy : evalLegacy ,
@@ -3857,6 +3877,7 @@ func applyRunConfig(runCfg *RunConfig) {
38573877 evalTimeout = runCfg .Timeout
38583878 evalParallel = runCfg .Parallel
38593879 evalUseMCPTools = runCfg .UseMCPTools
3880+ evalUseSkills = runCfg .UseSkills
38603881 evalDisableMCP = runCfg .DisableMCP
38613882 evalNoSandbox = runCfg .NoSandbox
38623883 evalLegacy = runCfg .Legacy
@@ -4113,6 +4134,7 @@ func init() {
41134134 evalCmd .Flags ().BoolVar (& evalKeepWorkspaces , "keep-workspaces" , false , "keep workspace directories after evaluation" )
41144135 evalCmd .Flags ().BoolVar (& evalDryRun , "dry-run" , false , "show what tasks would be run without executing" )
41154136 evalCmd .Flags ().BoolVar (& evalUseMCPTools , "use-mcp-tools" , false , "inject MCP tool usage instructions into agent prompt" )
4137+ evalCmd .Flags ().BoolVar (& evalUseSkills , "use-skills" , false , "inject Agent Skills usage instructions into agent prompt" )
41164138 evalCmd .Flags ().BoolVar (& evalDisableMCP , "disable-mcp" , false , "disable MCP tools for agents that support it (currently: opencode)" )
41174139 evalCmd .Flags ().BoolVar (& evalNoSandbox , "no-sandbox" , false , "disable bubblewrap sandbox for agent processes" )
41184140 evalCmd .Flags ().BoolVar (& evalLegacy , "legacy" , false , "expose hidden tests to agent during workspace init (pre-v1.6.0 behavior)" )
0 commit comments