-
Notifications
You must be signed in to change notification settings - Fork 160
156 lines (134 loc) · 5 KB
/
skill-eval.yml
File metadata and controls
156 lines (134 loc) · 5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Skill Eval workflow: trigger configuration and default token permissions.
name: Skill Eval

# `on` is the GitHub Actions trigger key (GitHub's loader treats it as a
# plain string key, not a YAML 1.1 boolean).
on:
  # Run automatically for pull requests that touch skills or eval files.
  pull_request:
    paths:
      - 'skills/**'
      - 'evals/**'

  # Manual run: choose the agent and the number of trials per task.
  workflow_dispatch:
    inputs:
      agent:
        description: 'Agent to run evals against (copilot or gemini)'
        type: choice
        options:
          - copilot
          - gemini
        default: 'copilot'
        required: true
      trials:
        description: 'Number of trials per task'
        # Kept as a string input; the default is the string '1'.
        type: string
        default: '1'
        required: false

# Workflow-level GITHUB_TOKEN scopes: read repository contents and write to
# pull requests.
permissions:
  contents: read
  pull-requests: write
jobs:
  # Job 1: validate graders against the reference solutions.
  # Has no `if:` guard, so it runs for every event that triggers the workflow.
  validate_graders:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    # Least privilege: the steps below only check out and read the repository.
    # NOTE(review): assumes run-eval.sh does not write to the PR - confirm.
    permissions:
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      - name: Validate graders against reference solutions
        working-directory: evals
        run: bash run-eval.sh --all --validate

      # always(): keep whatever results exist even when validation failed,
      # because post_summary downloads this artifact.
      - name: Upload validation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-validation-results
          path: evals/results/
          retention-days: 30

  # Job 2: run evals against an AI agent (copilot or gemini).
  # Only runs when the workflow is started manually via workflow_dispatch.
  agent_eval:
    if: github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    timeout-minutes: 60
    # Least privilege: this job runs a third-party agent CLI, so its token
    # must not be able to write to pull requests.
    # NOTE(review): assumes run-eval.sh / the agent need no write scope - confirm.
    permissions:
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      # Only the CLI for the selected agent is installed.
      - name: Install Copilot CLI
        if: inputs.agent == 'copilot'
        run: npm install -g @github/copilot

      - name: Install Gemini CLI
        if: inputs.agent == 'gemini'
        run: npm install -g @google/gemini-cli

      - name: Run agent-based eval
        working-directory: evals
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          # Inputs are passed through the environment instead of being
          # expanded into the script text, so a crafted `trials` value cannot
          # inject shell commands.
          EVAL_AGENT: ${{ inputs.agent }}
          EVAL_TRIALS: ${{ inputs.trials || '1' }}
        run: |
          # `trials` is a free-form string input: reject anything but digits.
          case "$EVAL_TRIALS" in
            ''|*[!0-9]*)
              echo "::error::trials must be a whole number, got '${EVAL_TRIALS}'"
              exit 1
              ;;
          esac
          bash run-eval.sh --all \
            --agent "$EVAL_AGENT" \
            --trials "$EVAL_TRIALS"

      - name: Upload agent eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-agent-${{ inputs.agent }}-results
          path: evals/results/
          retention-days: 30

  # Job 3: post a summary comment on pull requests.
  post_summary:
    # always(): report even when validate_graders failed. Limited to
    # pull_request events whose head branch is not in a fork.
    if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false
    needs: [validate_graders]
    runs-on: ubuntu-latest
    timeout-minutes: 5
    # Only needs to create a PR comment; nothing is checked out here.
    permissions:
      pull-requests: write
    steps:
      # If validate_graders failed before uploading, there is no artifact to
      # download. Do not abort the job: the script below already reports
      # missing or unreadable results in the comment.
      - name: Download validation results
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          name: skill-eval-validation-results
          path: evals/results/

      - name: Post summary comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');

            const resultsDir = 'evals/results';
            let summary = '## 📊 Skill Eval Results\n\n';

            try {
              // One JSON file per result; baseline.json is not a result.
              // Sorted so the table rows have a stable order across runs.
              const files = fs
                .readdirSync(resultsDir)
                .filter(f => f.endsWith('.json') && f !== 'baseline.json')
                .sort();

              if (files.length === 0) {
                summary += '> ⚠️ No eval results found. The eval run may have failed.\n';
              } else {
                summary += '| Task | Agent | Pass Rate | pass@k | Status |\n';
                summary += '|---|---|---|---|---|\n';

                for (const file of files) {
                  try {
                    const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8'));
                    // Fall back to the file name without its .json suffix.
                    const taskName = data.task || path.basename(file, '.json');
                    const agent = data.agent || 'reference';
                    const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A';
                    const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A';
                    // A missing passAtK compares false on both checks and is shown as ❌.
                    const status = data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? '⚠️' : '❌';
                    summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`;
                  } catch (e) {
                    // An unreadable or malformed result file becomes an error row
                    // instead of failing the whole comment.
                    summary += `| ${file} | — | Error | Error | ❌ |\n`;
                  }
                }

                summary += '\n### Thresholds\n';
                summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n';
                summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n';
                summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n';
              }
            } catch (e) {
              // Reached when the results directory does not exist or cannot be read.
              summary += `> ⚠️ Could not read results: ${e.message}\n`;
            }

            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: summary,
            });