Implement automated eval test suite for Angular Skills #19
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Skill evaluation pipeline: validates the graders, runs the agent evals,
# and (on pull requests) posts a combined summary comment.
name: Skill Eval

on:
  # Manual runs from the Actions tab.
  workflow_dispatch:
  # Pull requests that touch either the skills or the eval harness.
  pull_request:
    paths: ["skills/**", "evals/**"]

# Default GITHUB_TOKEN scopes for jobs that do not declare their own.
# `pull-requests: write` is what allows the summary job to comment on the PR.
permissions:
  pull-requests: write
  contents: read

jobs:
# Job 1: Validate graders against reference solutions
  validate_graders:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    # Least privilege: this job only reads the repository. Declaring
    # permissions at job level replaces the workflow-level defaults, so the
    # token no longer carries `pull-requests: write` while PR code runs.
    permissions:
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      # NOTE(review): presumably `validate` is an npm script defined in
      # evals/package.json and writes its output to evals/results/ - confirm.
      - name: Validate graders against reference solutions
        working-directory: evals
        run: npm run validate

      # `always()` keeps the results available even when validation fails,
      # so the summary job can still report on them.
      - name: Upload validation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-validation-results
          path: evals/results/
          retention-days: 30
# Job 2: Run evals against the Copilot agent
  agent_eval_copilot:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    # Least privilege: GITHUB_TOKEN is handed to the agent CLI below, so it is
    # limited to read access here instead of inheriting the workflow-level
    # `pull-requests: write`.
    permissions:
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      - name: Install Copilot CLI
        run: npm install -g @github/copilot

      # NOTE(review): presumably `agent:copilot` is an npm script defined in
      # evals/package.json and writes its output to evals/results/ - confirm.
      - name: Run eval against Copilot
        working-directory: evals
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: npm run agent:copilot

      # `always()` keeps partial results available when the eval step fails.
      - name: Upload Copilot eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-agent-copilot-results
          path: evals/results/
          retention-days: 30
# Job 3: Run evals against the Gemini agent
  agent_eval_gemini:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    # Least privilege: this job runs an agent CLI and only needs to read the
    # repository, so it does not inherit the workflow-level
    # `pull-requests: write`.
    permissions:
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      - name: Install Gemini CLI
        run: npm install -g @google/gemini-cli

      # NOTE(review): presumably `agent:gemini` is an npm script defined in
      # evals/package.json and writes its output to evals/results/ - confirm.
      - name: Run eval against Gemini
        working-directory: evals
        env:
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
        run: npm run agent:gemini

      # `always()` keeps partial results available when the eval step fails.
      - name: Upload Gemini eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-agent-gemini-results
          path: evals/results/
          retention-days: 30
# Job 4: Post combined summary comment on PRs
  post_summary:
    # `always()` lets the summary run even when an upstream job failed; the
    # remaining conditions restrict it to pull requests that are not from forks.
    if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false
    needs: [validate_graders, agent_eval_copilot, agent_eval_gemini]
    runs-on: ubuntu-latest
    timeout-minutes: 10
    # The only API call made here is creating a PR comment.
    permissions:
      pull-requests: write
    steps:
      # Each download is best-effort: an upstream job may have failed before
      # uploading, in which case its section is simply omitted from the comment.
      - name: Download validation results
        uses: actions/download-artifact@v4
        with:
          name: skill-eval-validation-results
          path: evals/results/validation
        continue-on-error: true

      - name: Download Copilot results
        uses: actions/download-artifact@v4
        with:
          name: skill-eval-agent-copilot-results
          path: evals/results/copilot
        continue-on-error: true

      - name: Download Gemini results
        uses: actions/download-artifact@v4
        with:
          name: skill-eval-agent-gemini-results
          path: evals/results/gemini
        continue-on-error: true

      - name: Post summary comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');

            /**
             * Load every result file found directly inside `dir`.
             *
             * Only `*.json` files are read and `baseline.json` is skipped.
             * A file that cannot be read, is not valid JSON, or does not hold
             * a JSON object is reported as `{ task, error: true }`, where
             * `task` is the file name without its `.json` extension.
             *
             * @param {string} dir - Directory to scan (not recursive).
             * @returns {object[]} Parsed results, sorted by file name; empty
             *   when the directory is missing or cannot be listed.
             */
            function readResults(dir) {
              // A missing directory is expected when the artifact download failed.
              if (!fs.existsSync(dir)) return [];

              let files;
              try {
                files = fs.readdirSync(dir)
                  .filter((f) => f.endsWith('.json') && f !== 'baseline.json')
                  .sort();
              } catch (e) {
                core.warning(`Could not list ${dir}: ${e.message}`);
                return [];
              }

              return files.map((file) => {
                const fullPath = path.join(dir, file);
                try {
                  const parsed = JSON.parse(fs.readFileSync(fullPath, 'utf8'));
                  // `null`, arrays and primitives are valid JSON but would
                  // break the property reads done when building the tables.
                  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
                    throw new Error('expected a JSON object');
                  }
                  return parsed;
                } catch (e) {
                  core.warning(`Unreadable result file ${fullPath}: ${e.message}`);
                  // basename() strips only the trailing extension, unlike
                  // replace('.json', '') which removes the first occurrence.
                  return { task: path.basename(file, '.json'), error: true };
                }
              });
            }

            /** Format a 0..1 ratio as a whole percentage, or 'N/A' when absent. */
            const percent = (ratio) => (ratio != null ? `${(ratio * 100).toFixed(0)}%` : 'N/A');

            /** Make a value safe for a Markdown table cell (no raw pipes or newlines). */
            const cell = (value) => String(value).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');

            let summary = '## 📊 Skill Eval Results\n\n';

            // --- Validation results ---
            const validation = readResults('evals/results/validation');
            if (validation.length > 0) {
              summary += '### Grader Validation (reference solutions)\n\n';
              summary += '| Task | Pass Rate | Status |\n';
              summary += '|---|---|---|\n';
              for (const r of validation) {
                const taskName = cell(r.task || 'unknown');
                if (r.error) {
                  summary += `| ${taskName} | Error | ❌ |\n`;
                  continue;
                }
                // Reference solutions only count as passing at a 100% pass rate.
                const status = r.passRate >= 1.0 ? '✅' : '❌';
                summary += `| ${taskName} | ${percent(r.passRate)} | ${status} |\n`;
              }
              summary += '\n';
            }

            // --- Agent results ---
            // Results stay paired with the directory they came from so that a
            // row can still be attributed to an agent when the file is
            // unreadable or carries no `agent` field.
            const agentRuns = [
              ['copilot', readResults('evals/results/copilot')],
              ['gemini', readResults('evals/results/gemini')],
            ];
            const hasAgentResults = agentRuns.some(([, results]) => results.length > 0);
            if (hasAgentResults) {
              summary += '### Agent Evaluation\n\n';
              summary += '| Task | Agent | Pass Rate | pass@k | Status |\n';
              summary += '|---|---|---|---|---|\n';
              for (const [source, results] of agentRuns) {
                for (const r of results) {
                  const taskName = cell(r.task || 'unknown');
                  const agent = cell(r.agent || source);
                  if (r.error) {
                    summary += `| ${taskName} | ${agent} | Error | Error | ❌ |\n`;
                    continue;
                  }
                  // Same cut-offs as the "Thresholds" legend appended below.
                  const status = r.passAtK >= 0.8 ? '✅' : r.passAtK >= 0.6 ? '⚠️' : '❌';
                  summary += `| ${taskName} | ${agent} | ${percent(r.passRate)} | ${percent(r.passAtK)} | ${status} |\n`;
                }
              }
              summary += '\n';
            }

            if (validation.length === 0 && !hasAgentResults) {
              summary += '> ⚠️ No eval results found. The eval runs may have failed.\n';
            }

            summary += '### Thresholds\n';
            summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n';
            summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n';
            summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n';

            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: summary,
            });