Skip to content

Commit 1330989

Browse files
Copilot and kdinev committed
rework CI workflow to always run against both copilot and gemini agents
Co-authored-by: kdinev <1472513+kdinev@users.noreply.github.com>
1 parent b3fa973 commit 1330989

File tree

2 files changed

+119
-73
lines changed

2 files changed

+119
-73
lines changed

.github/workflows/skill-eval.yml

Lines changed: 107 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -6,27 +6,13 @@ on:
66
- 'skills/**'
77
- 'evals/**'
88
workflow_dispatch:
9-
inputs:
10-
agent:
11-
description: 'Agent to run evals against (copilot or gemini)'
12-
required: true
13-
default: 'copilot'
14-
type: choice
15-
options:
16-
- copilot
17-
- gemini
18-
trials:
19-
description: 'Number of trials per task'
20-
required: false
21-
default: '1'
22-
type: string
239

2410
permissions:
2511
contents: read
2612
pull-requests: write
2713

2814
jobs:
29-
# Job 1: Always validate graders against reference solutions
15+
# Job 1: Validate graders against reference solutions
3016
validate_graders:
3117
runs-on: ubuntu-latest
3218
timeout-minutes: 10
@@ -52,9 +38,8 @@ jobs:
5238
path: evals/results/
5339
retention-days: 30
5440

55-
# Job 2: Run evals against an AI agent (copilot or gemini)
56-
# Triggered manually via workflow_dispatch, or can be called from other workflows
57-
agent_eval:
41+
# Job 2: Run evals against the Copilot agent
42+
agent_eval_copilot:
5843
runs-on: ubuntu-latest
5944
timeout-minutes: 60
6045

@@ -68,43 +53,80 @@ jobs:
6853
node-version: '22'
6954

7055
- name: Install Copilot CLI
71-
if: inputs.agent == 'copilot'
7256
run: npm install -g @github/copilot
7357

58+
- name: Run eval against Copilot
59+
working-directory: evals
60+
env:
61+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
62+
run: bash run-eval.sh --all --agent copilot
63+
64+
- name: Upload Copilot eval results
65+
if: always()
66+
uses: actions/upload-artifact@v4
67+
with:
68+
name: skill-eval-agent-copilot-results
69+
path: evals/results/
70+
retention-days: 30
71+
72+
# Job 3: Run evals against the Gemini agent
73+
agent_eval_gemini:
74+
runs-on: ubuntu-latest
75+
timeout-minutes: 60
76+
77+
steps:
78+
- name: Checkout repository
79+
uses: actions/checkout@v4
80+
81+
- name: Set up Node.js
82+
uses: actions/setup-node@v4
83+
with:
84+
node-version: '22'
85+
7486
- name: Install Gemini CLI
75-
if: inputs.agent == 'gemini'
7687
run: npm install -g @google/gemini-cli
7788

78-
- name: Run agent-based eval
89+
- name: Run eval against Gemini
7990
working-directory: evals
8091
env:
81-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
8292
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
83-
run: |
84-
bash run-eval.sh --all \
85-
--agent ${{ inputs.agent }} \
86-
--trials ${{ inputs.trials || '1' }}
93+
run: bash run-eval.sh --all --agent gemini
8794

88-
- name: Upload agent eval results
95+
- name: Upload Gemini eval results
8996
if: always()
9097
uses: actions/upload-artifact@v4
9198
with:
92-
name: skill-eval-agent-${{ inputs.agent }}-results
99+
name: skill-eval-agent-gemini-results
93100
path: evals/results/
94101
retention-days: 30
95102

96-
# Job 3: Post summary comment on PRs
103+
# Job 4: Post combined summary comment on PRs
97104
post_summary:
98105
if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false
99-
needs: [validate_graders]
106+
needs: [validate_graders, agent_eval_copilot, agent_eval_gemini]
100107
runs-on: ubuntu-latest
101108

102109
steps:
103110
- name: Download validation results
104111
uses: actions/download-artifact@v4
105112
with:
106113
name: skill-eval-validation-results
107-
path: evals/results/
114+
path: evals/results/validation
115+
continue-on-error: true
116+
117+
- name: Download Copilot results
118+
uses: actions/download-artifact@v4
119+
with:
120+
name: skill-eval-agent-copilot-results
121+
path: evals/results/copilot
122+
continue-on-error: true
123+
124+
- name: Download Gemini results
125+
uses: actions/download-artifact@v4
126+
with:
127+
name: skill-eval-agent-gemini-results
128+
path: evals/results/gemini
129+
continue-on-error: true
108130

109131
- name: Post summary comment
110132
uses: actions/github-script@v7
@@ -113,40 +135,69 @@ jobs:
113135
const fs = require('fs');
114136
const path = require('path');
115137
116-
const resultsDir = 'evals/results';
117-
let summary = '## 📊 Skill Eval Results\n\n';
118-
119-
try {
120-
const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json') && f !== 'baseline.json');
121-
if (files.length === 0) {
122-
summary += '> ⚠️ No eval results found. The eval run may have failed.\n';
123-
} else {
124-
summary += '| Task | Agent | Pass Rate | pass@k | Status |\n';
125-
summary += '|---|---|---|---|---|\n';
126-
138+
function readResults(dir) {
139+
const results = [];
140+
try {
141+
if (!fs.existsSync(dir)) return results;
142+
const files = fs.readdirSync(dir).filter(f => f.endsWith('.json') && f !== 'baseline.json');
127143
for (const file of files) {
128144
try {
129-
const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8'));
130-
const taskName = data.task || file.replace('.json', '');
131-
const agent = data.agent || 'reference';
132-
const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A';
133-
const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A';
134-
const status = data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? '⚠️' : '❌';
135-
summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`;
145+
results.push(JSON.parse(fs.readFileSync(path.join(dir, file), 'utf8')));
136146
} catch (e) {
137-
summary += `| ${file} | — | Error | Error | ❌ |\n`;
147+
results.push({ task: file.replace('.json', ''), error: true });
138148
}
139149
}
150+
} catch (e) { /* dir doesn't exist */ }
151+
return results;
152+
}
153+
154+
let summary = '## 📊 Skill Eval Results\n\n';
155+
156+
// --- Validation results ---
157+
const validation = readResults('evals/results/validation');
158+
if (validation.length > 0) {
159+
summary += '### Grader Validation (reference solutions)\n\n';
160+
summary += '| Task | Pass Rate | Status |\n';
161+
summary += '|---|---|---|\n';
162+
for (const r of validation) {
163+
if (r.error) { summary += `| ${r.task} | Error | ❌ |\n`; continue; }
164+
const passRate = r.passRate != null ? `${(r.passRate * 100).toFixed(0)}%` : 'N/A';
165+
const status = r.passRate >= 1.0 ? '✅' : '❌';
166+
summary += `| ${r.task} | ${passRate} | ${status} |\n`;
167+
}
168+
summary += '\n';
169+
}
140170
141-
summary += '\n### Thresholds\n';
142-
summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n';
143-
summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n';
144-
summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n';
171+
// --- Agent results ---
172+
const copilot = readResults('evals/results/copilot');
173+
const gemini = readResults('evals/results/gemini');
174+
175+
if (copilot.length > 0 || gemini.length > 0) {
176+
summary += '### Agent Evaluation\n\n';
177+
summary += '| Task | Agent | Pass Rate | pass@k | Status |\n';
178+
summary += '|---|---|---|---|---|\n';
179+
180+
for (const r of [...copilot, ...gemini]) {
181+
if (r.error) { summary += `| ${r.task} | — | Error | Error | ❌ |\n`; continue; }
182+
const taskName = r.task || 'unknown';
183+
const agent = r.agent || 'unknown';
184+
const passRate = r.passRate != null ? `${(r.passRate * 100).toFixed(0)}%` : 'N/A';
185+
const passAtK = r.passAtK != null ? `${(r.passAtK * 100).toFixed(0)}%` : 'N/A';
186+
const status = r.passAtK >= 0.8 ? '✅' : r.passAtK >= 0.6 ? '⚠️' : '❌';
187+
summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`;
145188
}
146-
} catch (e) {
147-
summary += `> ⚠️ Could not read results: ${e.message}\n`;
189+
summary += '\n';
148190
}
149191
192+
if (validation.length === 0 && copilot.length === 0 && gemini.length === 0) {
193+
summary += '> ⚠️ No eval results found. The eval runs may have failed.\n';
194+
}
195+
196+
summary += '### Thresholds\n';
197+
summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n';
198+
summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n';
199+
summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n';
200+
150201
await github.rest.issues.createComment({
151202
owner: context.repo.owner,
152203
repo: context.repo.repo,

evals/README.md

Lines changed: 12 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -180,23 +180,18 @@ Following [Anthropic's recommendations](https://www.anthropic.com/engineering/de
180180

181181
## CI Integration
182182

183-
The GitHub Actions workflow at `.github/workflows/skill-eval.yml` provides two
184-
evaluation modes:
185-
186-
### Automatic (on PR)
187-
Runs on every PR that modifies `skills/**` or `evals/**`:
188-
1. Validates all graders against their reference solutions
189-
2. Uploads results as an artifact
190-
3. Posts a summary comment on the PR
191-
192-
### Manual (workflow_dispatch)
193-
Triggered manually from the Actions tab to run agent-based evaluation:
194-
1. Select the agent (`copilot` or `gemini`) and number of trials
195-
2. Installs the selected agent CLI
196-
3. Runs all tasks against the agent
197-
4. Uploads results as an artifact
198-
199-
**Secrets required for agent-based CI:**
183+
The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs
184+
both on PRs (that modify `skills/**` or `evals/**`) and via manual
185+
`workflow_dispatch`. Every run executes three parallel jobs:
186+
187+
1. **Grader validation** — applies reference solutions, verifies graders score 100%
188+
2. **Copilot agent eval** — installs `@github/copilot`, runs all tasks against Copilot CLI
189+
3. **Gemini agent eval** — installs `@google/gemini-cli`, runs all tasks against Gemini CLI
190+
191+
A fourth summary job collects results from all three and posts a combined
192+
PR comment showing pass rates per task per agent.
193+
194+
**Secrets required:**
200195
- `GITHUB_TOKEN` — automatically available (for Copilot)
201196
- `GEMINI_API_KEY` — must be added as a repository secret (for Gemini)
202197

0 commit comments

Comments
 (0)