66 - ' skills/**'
77 - ' evals/**'
88 workflow_dispatch :
9- inputs :
10- agent :
11- description : ' Agent to run evals against (copilot or gemini)'
12- required : true
13- default : ' copilot'
14- type : choice
15- options :
16- - copilot
17- - gemini
18- trials :
19- description : ' Number of trials per task'
20- required : false
21- default : ' 1'
22- type : string
239
2410permissions :
2511 contents : read
2612 pull-requests : write
2713
2814jobs :
29- # Job 1: Always validate graders against reference solutions
15+ # Job 1: Validate graders against reference solutions
3016 validate_graders :
3117 runs-on : ubuntu-latest
3218 timeout-minutes : 10
5238 path : evals/results/
5339 retention-days : 30
5440
55- # Job 2: Run evals against an AI agent (copilot or gemini)
56- # Triggered manually via workflow_dispatch, or can be called from other workflows
57- agent_eval :
41+ # Job 2: Run evals against the Copilot agent
42+ agent_eval_copilot :
5843 runs-on : ubuntu-latest
5944 timeout-minutes : 60
6045
@@ -68,43 +53,80 @@ jobs:
6853 node-version : ' 22'
6954
7055 - name : Install Copilot CLI
71- if : inputs.agent == 'copilot'
7256 run : npm install -g @github/copilot
7357
58+ - name : Run eval against Copilot
59+ working-directory : evals
60+ env :
61+ GITHUB_TOKEN : ${{ secrets.GITHUB_TOKEN }}
62+ run : bash run-eval.sh --all --agent copilot
63+
64+ - name : Upload Copilot eval results
65+ if : always()
66+ uses : actions/upload-artifact@v4
67+ with :
68+ name : skill-eval-agent-copilot-results
69+ path : evals/results/
70+ retention-days : 30
71+
72+ # Job 3: Run evals against the Gemini agent
73+ agent_eval_gemini :
74+ runs-on : ubuntu-latest
75+ timeout-minutes : 60
76+
77+ steps :
78+ - name : Checkout repository
79+ uses : actions/checkout@v4
80+
81+ - name : Set up Node.js
82+ uses : actions/setup-node@v4
83+ with :
84+ node-version : ' 22'
85+
7486 - name : Install Gemini CLI
75- if : inputs.agent == 'gemini'
7687 run : npm install -g @google/gemini-cli
7788
78- - name : Run agent-based eval
89+ - name : Run eval against Gemini
7990 working-directory : evals
8091 env :
81- GITHUB_TOKEN : ${{ secrets.GITHUB_TOKEN }}
8292 GEMINI_API_KEY : ${{ secrets.GEMINI_API_KEY }}
83- run : |
84- bash run-eval.sh --all \
85- --agent ${{ inputs.agent }} \
86- --trials ${{ inputs.trials || '1' }}
93+ run : bash run-eval.sh --all --agent gemini
8794
88- - name : Upload agent eval results
95+ - name : Upload Gemini eval results
8996 if : always()
9097 uses : actions/upload-artifact@v4
9198 with :
92- name : skill-eval-agent-${{ inputs.agent }}-results
99+ name : skill-eval-agent-gemini-results
93100 path : evals/results/
94101 retention-days : 30
95102
96- # Job 3: Post summary comment on PRs
103+ # Job 4: Post combined summary comment on PRs
97104 post_summary :
98105 if : always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false
99- needs : [validate_graders]
106+ needs : [validate_graders, agent_eval_copilot, agent_eval_gemini]
100107 runs-on : ubuntu-latest
101108
102109 steps :
103110 - name : Download validation results
104111 uses : actions/download-artifact@v4
105112 with :
106113 name : skill-eval-validation-results
107- path : evals/results/
114+ path : evals/results/validation
115+ continue-on-error : true
116+
117+ - name : Download Copilot results
118+ uses : actions/download-artifact@v4
119+ with :
120+ name : skill-eval-agent-copilot-results
121+ path : evals/results/copilot
122+ continue-on-error : true
123+
124+ - name : Download Gemini results
125+ uses : actions/download-artifact@v4
126+ with :
127+ name : skill-eval-agent-gemini-results
128+ path : evals/results/gemini
129+ continue-on-error : true
108130
109131 - name : Post summary comment
110132 uses : actions/github-script@v7
@@ -113,40 +135,69 @@ jobs:
113135 const fs = require('fs');
114136 const path = require('path');
115137
116- const resultsDir = 'evals/results';
117- let summary = '## 📊 Skill Eval Results\n\n';
118-
119- try {
120- const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json') && f !== 'baseline.json');
121- if (files.length === 0) {
122- summary += '> ⚠️ No eval results found. The eval run may have failed.\n';
123- } else {
124- summary += '| Task | Agent | Pass Rate | pass@k | Status |\n';
125- summary += '|---|---|---|---|---|\n';
126-
138+ function readResults(dir) {
139+ const results = [];
140+ try {
141+ if (!fs.existsSync(dir)) return results;
142+ const files = fs.readdirSync(dir).filter(f => f.endsWith('.json') && f !== 'baseline.json');
127143 for (const file of files) {
128144 try {
129- const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8'));
130- const taskName = data.task || file.replace('.json', '');
131- const agent = data.agent || 'reference';
132- const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A';
133- const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A';
134- const status = data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? '⚠️' : '❌';
135- summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`;
145+ results.push(JSON.parse(fs.readFileSync(path.join(dir, file), 'utf8')));
136146 } catch (e) {
137- summary += `| ${ file} | — | Error | Error | ❌ |\n` ;
147+ results.push({ task: file.replace('.json', ''), error: true }) ;
138148 }
139149 }
150+ } catch (e) { /* dir doesn't exist */ }
151+ return results;
152+ }
153+
154+ let summary = '## 📊 Skill Eval Results\n\n';
155+
156+ // --- Validation results ---
157+ const validation = readResults('evals/results/validation');
158+ if (validation.length > 0) {
159+ summary += '### Grader Validation (reference solutions)\n\n';
160+ summary += '| Task | Pass Rate | Status |\n';
161+ summary += '|---|---|---|\n';
162+ for (const r of validation) {
163+ if (r.error) { summary += `| ${r.task} | Error | ❌ |\n`; continue; }
164+ const passRate = r.passRate != null ? `${(r.passRate * 100).toFixed(0)}%` : 'N/A';
165+ const status = r.passRate >= 1.0 ? '✅' : '❌';
166+ summary += `| ${r.task} | ${passRate} | ${status} |\n`;
167+ }
168+ summary += '\n';
169+ }
140170
141- summary += '\n### Thresholds\n';
142- summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n';
143- summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n';
144- summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n';
171+ // --- Agent results ---
172+ const copilot = readResults('evals/results/copilot');
173+ const gemini = readResults('evals/results/gemini');
174+
175+ if (copilot.length > 0 || gemini.length > 0) {
176+ summary += '### Agent Evaluation\n\n';
177+ summary += '| Task | Agent | Pass Rate | pass@k | Status |\n';
178+ summary += '|---|---|---|---|---|\n';
179+
180+ for (const r of [...copilot, ...gemini]) {
181+ if (r.error) { summary += `| ${r.task} | — | Error | Error | ❌ |\n`; continue; }
182+ const taskName = r.task || 'unknown';
183+ const agent = r.agent || 'unknown';
184+ const passRate = r.passRate != null ? `${(r.passRate * 100).toFixed(0)}%` : 'N/A';
185+ const passAtK = r.passAtK != null ? `${(r.passAtK * 100).toFixed(0)}%` : 'N/A';
186+ const status = r.passAtK >= 0.8 ? '✅' : r.passAtK >= 0.6 ? '⚠️' : '❌';
187+ summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`;
145188 }
146- } catch (e) {
147- summary += `> ⚠️ Could not read results: ${e.message}\n`;
189+ summary += '\n';
148190 }
149191
192+ if (validation.length === 0 && copilot.length === 0 && gemini.length === 0) {
193+ summary += '> ⚠️ No eval results found. The eval runs may have failed.\n';
194+ }
195+
196+ summary += '### Thresholds\n';
197+ summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n';
198+ summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n';
199+ summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n';
200+
150201 await github.rest.issues.createComment({
151202 owner: context.repo.owner,
152203 repo: context.repo.repo,
0 commit comments