Implement automated eval test suite for Angular Skills #18

Workflow file for this run

.github/workflows/skill-eval.yml at 1330989

	name: Skill Eval

	# Triggers: pull requests that touch skills or evals, plus manual dispatch.
	on:
	  pull_request:
	    paths:
	      - 'skills/**'
	      - 'evals/**'
	  workflow_dispatch:

	# Default token scope for every job in this workflow.
	# pull-requests: write is what lets the summary job comment on the PR.
	permissions:
	  contents: read
	  pull-requests: write

	jobs:
	  # Job 1: Validate graders against reference solutions
	  validate_graders:
	    runs-on: ubuntu-latest
	    timeout-minutes: 10

	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      - name: Set up Node.js
	        uses: actions/setup-node@v4
	        with:
	          node-version: '22'

	      # Runs the eval script from the evals/ directory in validation mode.
	      - name: Validate graders against reference solutions
	        working-directory: evals
	        run: bash run-eval.sh --all --validate

	      # always() keeps the upload running even when the validation step fails,
	      # so the results are still available to the summary job.
	      - name: Upload validation results
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: skill-eval-validation-results
	          path: evals/results/
	          retention-days: 30

	# Job 2: Run evals against the Copilot agent
	  agent_eval_copilot:
	    runs-on: ubuntu-latest
	    timeout-minutes: 60

	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      - name: Set up Node.js
	        uses: actions/setup-node@v4
	        with:
	          node-version: '22'

	      - name: Install Copilot CLI
	        run: npm install -g @github/copilot

	      # NOTE(review): this passes the workflow's default GITHUB_TOKEN to the
	      # eval script; confirm the Copilot CLI accepts that token for requests.
	      - name: Run eval against Copilot
	        working-directory: evals
	        env:
	          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        run: bash run-eval.sh --all --agent copilot

	      # always() keeps the upload running even when the eval step fails.
	      - name: Upload Copilot eval results
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: skill-eval-agent-copilot-results
	          path: evals/results/
	          retention-days: 30

	# Job 3: Run evals against the Gemini agent
	  agent_eval_gemini:
	    runs-on: ubuntu-latest
	    timeout-minutes: 60

	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      - name: Set up Node.js
	        uses: actions/setup-node@v4
	        with:
	          node-version: '22'

	      - name: Install Gemini CLI
	        run: npm install -g @google/gemini-cli

	      # The API key is read from the repository secret GEMINI_API_KEY.
	      - name: Run eval against Gemini
	        working-directory: evals
	        env:
	          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
	        run: bash run-eval.sh --all --agent gemini

	      # always() keeps the upload running even when the eval step fails.
	      - name: Upload Gemini eval results
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: skill-eval-agent-gemini-results
	          path: evals/results/
	          retention-days: 30

	# Job 4: Post combined summary comment on PRs
	  post_summary:
	    # always(): run even if an upstream job failed, so failures are reported.
	    # Only for pull_request events whose head repository is not a fork.
	    if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false
	    needs: [validate_graders, agent_eval_copilot, agent_eval_gemini]
	    runs-on: ubuntu-latest

	    steps:
	      # Each download is best-effort (continue-on-error): a missing artifact
	      # must not stop the summary from being posted.
	      - name: Download validation results
	        uses: actions/download-artifact@v4
	        with:
	          name: skill-eval-validation-results
	          path: evals/results/validation
	        continue-on-error: true

	      - name: Download Copilot results
	        uses: actions/download-artifact@v4
	        with:
	          name: skill-eval-agent-copilot-results
	          path: evals/results/copilot
	        continue-on-error: true

	      - name: Download Gemini results
	        uses: actions/download-artifact@v4
	        with:
	          name: skill-eval-agent-gemini-results
	          path: evals/results/gemini
	        continue-on-error: true

	# Step: build a Markdown summary from the downloaded results and post it on the PR.
	      - name: Post summary comment
	        uses: actions/github-script@v7
	        with:
	          script: |
	            const fs = require('fs');
	            const path = require('path');

	            // Hidden marker that identifies this workflow's comment, so later
	            // runs update it instead of adding a new comment on every push.
	            const MARKER = '<!-- skill-eval-summary -->';

	            // Parse every *.json file in `dir` (except baseline.json).
	            // A file that cannot be read or parsed becomes
	            // { task, agent, error: true } so it still shows up in the table.
	            // `agent` is only used to label those error entries.
	            function readResults(dir, agent) {
	              const results = [];
	              try {
	                if (!fs.existsSync(dir)) return results;
	                const files = fs.readdirSync(dir).filter(f => f.endsWith('.json') && f !== 'baseline.json');
	                for (const file of files) {
	                  try {
	                    results.push(JSON.parse(fs.readFileSync(path.join(dir, file), 'utf8')));
	                  } catch (e) {
	                    // basename strips only the trailing extension, unlike
	                    // String.replace, which removes the first '.json' anywhere.
	                    results.push({ task: path.basename(file, '.json'), agent, error: true });
	                  }
	                }
	              } catch (e) { /* directory missing or unreadable: no results */ }
	              return results;
	            }

	            // Format a 0..1 ratio as a whole-number percentage, or 'N/A' when absent.
	            const pct = value => (value != null ? `${(value * 100).toFixed(0)}%` : 'N/A');

	            let summary = '## 📊 Skill Eval Results\n\n';

	            // --- Validation results ---
	            const validation = readResults('evals/results/validation');
	            if (validation.length > 0) {
	              summary += '### Grader Validation (reference solutions)\n\n';
	              summary += '| Task | Pass Rate | Status |\n';
	              summary += '|---|---|---|\n';
	              for (const r of validation) {
	                if (r.error) { summary += `| ${r.task} | Error | ❌ |\n`; continue; }
	                // Reference solutions must pass every grader to count as valid.
	                const status = r.passRate >= 1.0 ? '✅' : '❌';
	                summary += `| ${r.task} | ${pct(r.passRate)} | ${status} |\n`;
	              }
	              summary += '\n';
	            }

	            // --- Agent results ---
	            const copilot = readResults('evals/results/copilot', 'copilot');
	            const gemini = readResults('evals/results/gemini', 'gemini');

	            if (copilot.length > 0 || gemini.length > 0) {
	              summary += '### Agent Evaluation\n\n';
	              summary += '| Task | Agent | Pass Rate | pass@k | Status |\n';
	              summary += '|---|---|---|---|---|\n';

	              for (const r of [...copilot, ...gemini]) {
	                if (r.error) { summary += `| ${r.task} | ${r.agent || '—'} | Error | Error | ❌ |\n`; continue; }
	                const taskName = r.task || 'unknown';
	                const agent = r.agent || 'unknown';
	                // Same cut-offs as the Thresholds legend below.
	                const status = r.passAtK >= 0.8 ? '✅' : r.passAtK >= 0.6 ? '⚠️' : '❌';
	                summary += `| ${taskName} | ${agent} | ${pct(r.passRate)} | ${pct(r.passAtK)} | ${status} |\n`;
	              }
	              summary += '\n';
	            }

	            if (validation.length === 0 && copilot.length === 0 && gemini.length === 0) {
	              summary += '> ⚠️ No eval results found. The eval runs may have failed.\n';
	            }

	            summary += '### Thresholds\n';
	            summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n';
	            summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n';
	            summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n';

	            const target = {
	              owner: context.repo.owner,
	              repo: context.repo.repo,
	            };
	            const body = `${MARKER}\n${summary}`;

	            // Look for a comment left by an earlier run of this workflow.
	            const comments = await github.paginate(github.rest.issues.listComments, {
	              ...target,
	              issue_number: context.issue.number,
	              per_page: 100,
	            });
	            const previous = comments.find(c => c.body && c.body.includes(MARKER));

	            if (previous) {
	              await github.rest.issues.updateComment({
	                ...target,
	                comment_id: previous.id,
	                body,
	              });
	            } else {
	              await github.rest.issues.createComment({
	                ...target,
	                issue_number: context.issue.number,
	                body,
	              });
	            }

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Implement automated eval test suite for Angular Skills #18

Workflow file

Implement automated eval test suite for Angular Skills #18

Uh oh!

Workflow file for this run