-
Notifications
You must be signed in to change notification settings - Fork 160
89 lines (75 loc) · 3.06 KB
/
skill-eval.yml
File metadata and controls
89 lines (75 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Workflow display name shown in the Actions tab and on PR checks.
name: Skill Eval

# Run on pull requests, but only when the PR changes files matching one
# of the path globs below.
on:
  pull_request:
    paths:
      - 'skills/**'
      - 'evals/**'
# Workflow-level GITHUB_TOKEN scopes: read-only repository contents, plus
# write access on pull requests, which the final step of the eval job
# needs in order to post its summary comment.
permissions:
  contents: read
  pull-requests: write
jobs:
  # Single job: install the eval harness, run it, publish the result files
  # as an artifact, and summarize them in a PR comment.
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          # Quoted so the version stays a string rather than a number.
          node-version: '20'

      # --ignore-scripts stops npm from running package lifecycle scripts
      # during installation.
      - name: Install eval dependencies
        working-directory: evals
        run: npm install --ignore-scripts

      # NOTE(review): run-eval.sh is not visible here; the later steps
      # assume it writes its JSON output under evals/results/ - confirm.
      - name: Validate graders against reference solutions
        working-directory: evals
        run: bash run-eval.sh --all --validate

      # always() keeps the results available even when the eval step fails.
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-results
          path: evals/results/
          retention-days: 30

      # Only comment when the PR branch lives in this repository. Comparing
      # full names (rather than testing `head.repo.fork`) also behaves
      # correctly when this repository is itself a fork of another one.
      - name: Post summary comment
        if: >-
          always() &&
          github.event_name == 'pull_request' &&
          github.event.pull_request.head.repo.full_name == github.repository
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');

            const resultsDir = 'evals/results';
            // Hidden marker used to recognize this workflow's own comment
            // so later runs update it instead of adding another one.
            const marker = '<!-- skill-eval-results -->';

            // Render a 0..1 ratio as a whole-number percentage; N/A if absent.
            const formatPercent = (ratio) =>
              ratio != null ? `${(ratio * 100).toFixed(0)}%` : 'N/A';

            // Keep pipes and line breaks from breaking the Markdown table.
            const escapeCell = (text) =>
              String(text).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');

            let summary = `${marker}\n## 📊 Skill Eval Results\n\n`;
            try {
              // Sorted so rows appear in a stable order between runs.
              const files = fs
                .readdirSync(resultsDir)
                .filter((f) => f.endsWith('.json') && f !== 'baseline.json')
                .sort();

              if (files.length === 0) {
                summary += '> ⚠️ No eval results found. The eval run may have failed.\n';
              } else {
                summary += '| Task | Pass Rate | pass@5 | Status |\n';
                summary += '|---|---|---|---|\n';
                for (const file of files) {
                  try {
                    const data = JSON.parse(
                      fs.readFileSync(path.join(resultsDir, file), 'utf8')
                    );
                    // basename() strips only the trailing extension, unlike
                    // replace(), which removes the first '.json' anywhere.
                    const taskName = data.task || path.basename(file, '.json');
                    const passRate = formatPercent(data.passRate);
                    const passAtK = formatPercent(data.passAtK);
                    // A missing passAtK fails both comparisons and is
                    // therefore reported as ❌.
                    const status =
                      data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? '⚠️' : '❌';
                    summary += `| ${escapeCell(taskName)} | ${passRate} | ${passAtK} | ${status} |\n`;
                  } catch (e) {
                    // Unreadable or malformed result file: show an error row
                    // and keep processing the remaining files.
                    summary += `| ${escapeCell(file)} | Error | Error | ❌ |\n`;
                  }
                }
                summary += '\n### Thresholds\n';
                summary += '- ✅ `pass@5 ≥ 80%` — merge gate passed\n';
                summary += '- ⚠️ `pass@5 ≥ 60%` — needs investigation\n';
                summary += '- ❌ `pass@5 < 60%` — blocks merge for affected skill\n';
              }
            } catch (e) {
              // Typically the results directory does not exist.
              summary += `> ⚠️ Could not read results: ${e.message}\n`;
            }

            const { owner, repo } = context.repo;
            const issue_number = context.issue.number;

            // Look for a comment left by an earlier run of this workflow.
            const comments = await github.paginate(
              github.rest.issues.listComments,
              { owner, repo, issue_number, per_page: 100 }
            );
            const previous = comments.find(
              (c) =>
                c.user &&
                c.user.type === 'Bot' &&
                typeof c.body === 'string' &&
                c.body.includes(marker)
            );

            if (previous) {
              await github.rest.issues.updateComment({
                owner,
                repo,
                comment_id: previous.id,
                body: summary,
              });
            } else {
              await github.rest.issues.createComment({
                owner,
                repo,
                issue_number,
                body: summary,
              });
            }