Skip to content

Implement automated eval test suite for Angular Skills #18

Implement automated eval test suite for Angular Skills

Implement automated eval test suite for Angular Skills #18

Workflow file for this run

# Skill Eval: runs for pull requests that change files under skills/ or
# evals/, and when started manually through workflow_dispatch.
name: Skill Eval

on:
  pull_request:
    paths:
      - 'skills/**'
      - 'evals/**'
  workflow_dispatch:

# Workflow-wide token permissions. pull-requests: write is what allows the
# summary job to comment on the pull request.
permissions:
  contents: read
  pull-requests: write
jobs:
  # Job 1: run the graders against the reference solutions
  # (run-eval.sh --all --validate) and publish the result files.
  validate_graders:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      - name: Validate graders against reference solutions
        working-directory: evals
        run: bash run-eval.sh --all --validate

      # always(): upload the results directory even when the validation
      # step above failed, so the summary job can still read it.
      - name: Upload validation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-validation-results
          path: evals/results/
          retention-days: 30
# Job 2: Run evals against the Copilot agent
  agent_eval_copilot:
    # Skip pull requests that come from forks, mirroring the fork check on
    # post_summary: fork-triggered runs get a read-only GITHUB_TOKEN and no
    # summary comment is posted for them. Manual runs (workflow_dispatch)
    # are not pull_request events and always pass this condition.
    if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      - name: Install Copilot CLI
        run: npm install -g @github/copilot

      # NOTE(review): confirm that the Copilot CLI accepts the workflow's
      # automatic GITHUB_TOKEN; it may require a personal access token with
      # Copilot permissions instead.
      - name: Run eval against Copilot
        working-directory: evals
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: bash run-eval.sh --all --agent copilot

      # always(): keep whatever results were written even if the eval failed.
      - name: Upload Copilot eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-agent-copilot-results
          path: evals/results/
          retention-days: 30
# Job 3: Run evals against the Gemini agent
  agent_eval_gemini:
    # Skip pull requests that come from forks: GitHub does not pass
    # repository secrets such as GEMINI_API_KEY to fork-triggered runs, so
    # the eval would run without a key. This mirrors the fork check on
    # post_summary. Manual runs (workflow_dispatch) are not pull_request
    # events and always pass this condition.
    if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      - name: Install Gemini CLI
        run: npm install -g @google/gemini-cli

      - name: Run eval against Gemini
        working-directory: evals
        env:
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
        run: bash run-eval.sh --all --agent gemini

      # always(): keep whatever results were written even if the eval failed.
      - name: Upload Gemini eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-agent-gemini-results
          path: evals/results/
          retention-days: 30
# Job 4: Post combined summary comment on PRs
  post_summary:
    # always(): report even when an upstream job failed or was skipped.
    # Only pull requests from the same repository get a comment.
    if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false
    needs: [validate_graders, agent_eval_copilot, agent_eval_gemini]
    runs-on: ubuntu-latest
    # Bounded like the other jobs in this workflow.
    timeout-minutes: 10
    steps:
      # Each download is best-effort (continue-on-error): an artifact may be
      # missing, and the script below copes with an absent directory.
      - name: Download validation results
        uses: actions/download-artifact@v4
        continue-on-error: true
        with:
          name: skill-eval-validation-results
          path: evals/results/validation

      - name: Download Copilot results
        uses: actions/download-artifact@v4
        continue-on-error: true
        with:
          name: skill-eval-agent-copilot-results
          path: evals/results/copilot

      - name: Download Gemini results
        uses: actions/download-artifact@v4
        continue-on-error: true
        with:
          name: skill-eval-agent-gemini-results
          path: evals/results/gemini

      # Builds one Markdown comment from the downloaded result files and
      # posts it on the pull request.
      - name: Post summary comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');

            // Escape a value for a Markdown table cell: a raw pipe or a
            // newline would otherwise split the row.
            const cell = (value) => String(value).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');

            // Multiply by 100 and render as a whole percentage;
            // null/undefined becomes 'N/A'.
            const pct = (value) => (value != null ? `${(value * 100).toFixed(0)}%` : 'N/A');

            // Parse every *.json file directly under `dir`, except
            // baseline.json. Returns [] when the directory is missing or
            // cannot be listed. A file that cannot be read, parsed, or
            // that does not hold a JSON object yields
            // { task: <file name without .json>, error: true }.
            function readResults(dir) {
              const results = [];
              if (!fs.existsSync(dir)) return results;

              let files;
              try {
                files = fs.readdirSync(dir).filter(f => f.endsWith('.json') && f !== 'baseline.json');
              } catch (e) {
                // Best-effort: surface the problem in the log, keep going.
                core.warning(`Could not list ${dir}: ${e.message}`);
                return results;
              }

              for (const file of files) {
                try {
                  const parsed = JSON.parse(fs.readFileSync(path.join(dir, file), 'utf8'));
                  // A bare `null` or scalar would crash the row rendering below.
                  if (parsed === null || typeof parsed !== 'object') {
                    throw new Error('not a JSON object');
                  }
                  results.push(parsed);
                } catch (e) {
                  core.warning(`Could not read ${path.join(dir, file)}: ${e.message}`);
                  // basename strips only the trailing extension.
                  results.push({ task: path.basename(file, '.json'), error: true });
                }
              }
              return results;
            }

            let summary = '## 📊 Skill Eval Results\n\n';

            // --- Validation results ---
            const validation = readResults('evals/results/validation');
            if (validation.length > 0) {
              summary += '### Grader Validation (reference solutions)\n\n';
              summary += '| Task | Pass Rate | Status |\n';
              summary += '|---|---|---|\n';
              for (const r of validation) {
                const taskName = cell(r.task || 'unknown');
                if (r.error) { summary += `| ${taskName} | Error | ❌ |\n`; continue; }
                // Reference solutions only get a check mark at a full pass rate.
                const status = r.passRate >= 1.0 ? '✅' : '❌';
                summary += `| ${taskName} | ${pct(r.passRate)} | ${status} |\n`;
              }
              summary += '\n';
            }

            // --- Agent results ---
            const copilot = readResults('evals/results/copilot');
            const gemini = readResults('evals/results/gemini');
            if (copilot.length > 0 || gemini.length > 0) {
              summary += '### Agent Evaluation\n\n';
              summary += '| Task | Agent | Pass Rate | pass@k | Status |\n';
              summary += '|---|---|---|---|---|\n';
              for (const r of [...copilot, ...gemini]) {
                const taskName = cell(r.task || 'unknown');
                if (r.error) { summary += `| ${taskName} | — | Error | Error | ❌ |\n`; continue; }
                const agent = cell(r.agent || 'unknown');
                // Same cut-offs as the Thresholds legend below.
                const status = r.passAtK >= 0.8 ? '✅' : r.passAtK >= 0.6 ? '⚠️' : '❌';
                summary += `| ${taskName} | ${agent} | ${pct(r.passRate)} | ${pct(r.passAtK)} | ${status} |\n`;
              }
              summary += '\n';
            }

            if (validation.length === 0 && copilot.length === 0 && gemini.length === 0) {
              summary += '> ⚠️ No eval results found. The eval runs may have failed.\n';
            }

            summary += '### Thresholds\n';
            summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n';
            summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n';
            summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n';

            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: summary,
            });