Implement automated eval test suite for Angular Skills #16
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Skill Eval workflow: triggers, manual-run inputs and token permissions.
name: Skill Eval

on:
  # Manual run: choose the agent under test and how many trials each task gets.
  workflow_dispatch:
    inputs:
      agent:
        description: 'Agent to run evals against (copilot or gemini)'
        type: choice
        options: [copilot, gemini]
        default: 'copilot'
        required: true
      trials:
        description: 'Number of trials per task'
        type: string
        default: '1'
        required: false
  # Automatic run for pull requests that touch skills or their evals.
  pull_request:
    paths: ['skills/**', 'evals/**']

# Workflow-wide token scopes; pull-requests: write is what allows the summary
# comment to be posted on the PR.
permissions:
  pull-requests: write
  contents: read

jobs:
# Job 1: check the graders against the reference solutions.
# The job has no `if:` guard, so it runs for every trigger of this workflow.
  validate_graders:
    timeout-minutes: 10
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with: { node-version: '22' }

      - name: Validate graders against reference solutions
        run: bash run-eval.sh --all --validate
        working-directory: evals

      # Uploaded even when validation fails so the results can still be inspected.
      - name: Upload validation results
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          retention-days: 30
          path: evals/results/
          name: skill-eval-validation-results
# Job 2: Run evals against an AI agent (copilot or gemini).
# Only runs for manual workflow_dispatch triggers; the agent and trial count
# come from the dispatch inputs.
  agent_eval:
    if: github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      # Only the CLI for the selected agent is installed.
      - name: Install Copilot CLI
        if: inputs.agent == 'copilot'
        run: npm install -g @github/copilot

      - name: Install Gemini CLI
        if: inputs.agent == 'gemini'
        run: npm install -g @google/gemini-cli

      - name: Run agent-based eval
        working-directory: evals
        env:
          # NOTE(review): confirm the default Actions token is sufficient for the
          # Copilot CLI; it may require a dedicated token instead.
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          # Dispatch inputs are handed over as environment variables instead of
          # being expanded into the script text, so their content can never be
          # interpreted as shell syntax.
          INPUT_AGENT: ${{ inputs.agent }}
          INPUT_TRIALS: ${{ inputs.trials || '1' }}
        run: |
          # `trials` is a free-form string input: reject anything that is not a
          # positive integer before handing it to the eval runner.
          if ! [[ "$INPUT_TRIALS" =~ ^[1-9][0-9]*$ ]]; then
            echo "::error::'trials' must be a positive integer, got '$INPUT_TRIALS'"
            exit 1
          fi
          bash run-eval.sh --all \
            --agent "$INPUT_AGENT" \
            --trials "$INPUT_TRIALS"

      # Uploaded even when the eval step fails so partial results are preserved.
      - name: Upload agent eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-agent-${{ inputs.agent }}-results
          path: evals/results/
          retention-days: 30
# Job 3: Post a summary comment on pull requests.
# Runs after validate_graders regardless of its outcome (always()), and only for
# pull requests whose head branch is not in a fork.
  post_summary:
    if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false
    needs: [validate_graders]
    runs-on: ubuntu-latest
    steps:
      - name: Download validation results
        # The artifact does not exist when validation produced no result files.
        # Keep going in that case so the script below can still report the
        # problem on the pull request instead of the job dying silently here.
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          name: skill-eval-validation-results
          path: evals/results/

      - name: Post summary comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');

            const resultsDir = 'evals/results';

            // Render a 0..1 ratio as a whole-number percentage; 'N/A' when absent.
            const formatPercent = (ratio) =>
              ratio == null ? 'N/A' : `${(ratio * 100).toFixed(0)}%`;

            // Map pass@k onto the threshold icons. A missing value gets a neutral
            // marker rather than being reported as a failure.
            const statusIcon = (passAtK) => {
              if (passAtK == null) return '❔';
              if (passAtK >= 0.8) return '✅';
              if (passAtK >= 0.6) return '⚠️';
              return '❌';
            };

            // Keep values read from result files from breaking the Markdown table.
            const cell = (value) =>
              String(value).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');

            let summary = '## 📊 Skill Eval Results\n\n';
            try {
              // baseline.json is skipped; sorting keeps the row order stable
              // between runs.
              const files = fs
                .readdirSync(resultsDir)
                .filter((f) => f.endsWith('.json') && f !== 'baseline.json')
                .sort();

              if (files.length === 0) {
                summary += '> ⚠️ No eval results found. The eval run may have failed.\n';
              } else {
                summary += '| Task | Agent | Pass Rate | pass@k | Status |\n';
                summary += '|---|---|---|---|---|\n';
                for (const file of files) {
                  try {
                    const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8'));
                    const taskName = data.task || path.basename(file, '.json');
                    const agent = data.agent || 'reference';
                    summary += `| ${cell(taskName)} | ${cell(agent)} | ${formatPercent(data.passRate)} | ${formatPercent(data.passAtK)} | ${statusIcon(data.passAtK)} |\n`;
                  } catch (e) {
                    // One unreadable result file must not hide the other rows.
                    core.warning(`Could not parse ${file}: ${e.message}`);
                    summary += `| ${cell(file)} | — | Error | Error | ❌ |\n`;
                  }
                }
                summary += '\n### Thresholds\n';
                summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n';
                summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n';
                summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n';
                summary += '- ❔ `pass@k` not reported — no verdict\n';
              }
            } catch (e) {
              // Typically a missing results directory (nothing was downloaded).
              summary += `> ⚠️ Could not read results: ${e.message}\n`;
            }

            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: summary,
            });