-
Notifications
You must be signed in to change notification settings - Fork 160
Implement automated eval test suite for Angular Skills #17007
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 7 commits
5b7cca0
23aecf0
ac1335a
f807aa3
c183089
6e7b838
b2047d8
1691296
2df335e
b22b13f
c684351
94d4bf8
18f3e25
568b04d
5da6711
b181ca0
665264b
b3fa973
1330989
a9da524
566551b
148691b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,89 @@ | ||
| name: Skill Eval | ||
|
|
||
| on: | ||
| pull_request: | ||
| paths: | ||
| - 'skills/**' | ||
| - 'evals/**' | ||
|
|
||
| permissions: | ||
| contents: read | ||
| pull-requests: write | ||
|
|
||
| jobs: | ||
| eval: | ||
| runs-on: ubuntu-latest | ||
| timeout-minutes: 30 | ||
|
|
||
| steps: | ||
| - name: Checkout repository | ||
| uses: actions/checkout@v4 | ||
|
|
||
| - name: Set up Node.js | ||
| uses: actions/setup-node@v4 | ||
| with: | ||
| node-version: '20' | ||
|
|
||
| - name: Install eval dependencies | ||
| working-directory: evals | ||
| run: npm install --ignore-scripts | ||
|
|
||
| - name: Validate graders against reference solutions | ||
| working-directory: evals | ||
| run: bash run-eval.sh --all --validate | ||
|
|
||
| - name: Upload results | ||
| if: always() | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: skill-eval-results | ||
| path: evals/results/ | ||
| retention-days: 30 | ||
|
|
||
| - name: Post summary comment | ||
| if: always() && github.event_name == 'pull_request' | ||
| uses: actions/github-script@v7 | ||
| with: | ||
| script: | | ||
| const fs = require('fs'); | ||
| const path = require('path'); | ||
|
|
||
| const resultsDir = 'evals/results'; | ||
| let summary = '## 📊 Skill Eval Results\n\n'; | ||
|
|
||
| try { | ||
| const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json') && f !== 'baseline.json'); | ||
| if (files.length === 0) { | ||
| summary += '> ⚠️ No eval results found. The eval run may have failed.\n'; | ||
| } else { | ||
| summary += '| Task | Pass Rate | pass@5 | Status |\n'; | ||
| summary += '|---|---|---|---|\n'; | ||
|
|
||
| for (const file of files) { | ||
| try { | ||
| const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8')); | ||
| const taskName = data.task || file.replace('.json', ''); | ||
| const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A'; | ||
| const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A'; | ||
| const status = data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? '⚠️' : '❌'; | ||
| summary += `| ${taskName} | ${passRate} | ${passAtK} | ${status} |\n`; | ||
| } catch (e) { | ||
| summary += `| ${file} | Error | Error | ❌ |\n`; | ||
| } | ||
| } | ||
|
|
||
| summary += '\n### Thresholds\n'; | ||
| summary += '- ✅ `pass@5 ≥ 80%` — merge gate passed\n'; | ||
| summary += '- ⚠️ `pass@5 ≥ 60%` — needs investigation\n'; | ||
| summary += '- ❌ `pass@5 < 60%` — blocks merge for affected skill\n'; | ||
| } | ||
| } catch (e) { | ||
| summary += `> ⚠️ Could not read results: ${e.message}\n`; | ||
| } | ||
|
|
||
| await github.rest.issues.createComment({ | ||
| owner: context.repo.owner, | ||
| repo: context.repo.repo, | ||
| issue_number: context.issue.number, | ||
| body: summary, | ||
| }); | ||
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| # Ignite UI for Angular — Skill Evals | ||
|
|
||
| Automated evaluation suite for the Ignite UI for Angular agent skills. | ||
| Inspired by the [skill-eval](https://github.com/mgechev/skill-eval) reference | ||
| architecture and extended with patterns from | ||
| [Anthropic's agent eval research](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents). | ||
|
|
||
| The infrastructure is **self-contained** — there are no external eval-framework | ||
| dependencies. A lightweight shell runner (`run-eval.sh`) executes each task's | ||
| reference solution and deterministic grader. | ||
|
|
||
| ## Overview | ||
|
|
||
| The suite tests three skills: | ||
|
|
||
| | Skill | Task ID | What it tests | | ||
| |---|---|---| | ||
| | `igniteui-angular-grids` | `grid-basic-setup` | Flat grid with sorting and pagination on flat employee data | | ||
| | `igniteui-angular-components` | `component-combo-reactive-form` | Multi-select combo bound to a reactive form control | | ||
| | `igniteui-angular-theming` | `theming-palette-generation` | Custom branded palette with `palette()` and `theme()` | | ||
|
|
||
| Each task includes: | ||
|
|
||
| - **`instruction.md`** — the prompt given to the agent | ||
| - **`tests/test.sh`** — deterministic grader (file checks, compilation, lint) | ||
| - **`prompts/quality.md`** — LLM rubric grader (intent routing, API usage) | ||
| - **`solution/solve.sh`** — reference solution for baseline validation | ||
| - **`environment/Dockerfile`** — isolated environment for agent execution | ||
| - **`skills/`** — symlinked skill files under test | ||
|
|
||
| ## Prerequisites | ||
|
|
||
| - Bash 4+ | ||
| - `bc` (installed by default on most Linux / macOS systems) | ||
|
|
||
| ## Running Evals Locally | ||
|
|
||
| ### Validate graders against reference solutions | ||
|
|
||
| This applies each task's `solution/solve.sh`, then runs `tests/test.sh` to | ||
| confirm the grader scores 100%. Use this to catch grader regressions. | ||
|
|
||
| ```bash | ||
| cd evals | ||
|
|
||
| # Validate all tasks | ||
| bash run-eval.sh --all --validate | ||
|
|
||
| # Validate a single task | ||
| bash run-eval.sh grid-basic-setup --validate | ||
| ``` | ||
|
|
||
| ### npm scripts (convenience wrappers) | ||
|
|
||
| ```bash | ||
| cd evals | ||
| npm run validate # all tasks | ||
| npm run validate:grid # grid-basic-setup only | ||
| npm run validate:combo # component-combo-reactive-form only | ||
| npm run validate:theming # theming-palette-generation only | ||
| ``` | ||
|
|
||
| ## Adding a New Task | ||
|
|
||
| 1. Create a directory under `evals/tasks/<task-id>/` with the standard structure: | ||
|
|
||
| ``` | ||
| tasks/<task-id>/ | ||
| ├── task.toml # Config: grader metadata, weights, timeouts | ||
| ├── instruction.md # Agent prompt | ||
| ├── environment/Dockerfile # Container setup (for future Docker-based runs) | ||
| ├── tests/test.sh # Deterministic grader | ||
| ├── prompts/quality.md # LLM rubric grader | ||
| ├── solution/solve.sh # Reference solution | ||
| └── skills/ # Skill files under test | ||
| └── <skill-name>/SKILL.md | ||
| ``` | ||
|
|
||
| 2. Write a clear, unambiguous `instruction.md` that tells the agent exactly what | ||
| to build. | ||
|
|
||
| 3. Write `tests/test.sh` to check **outcomes** (files exist, project compiles, | ||
| correct selectors are present) rather than specific steps. The grader must | ||
| write a reward (0.0–1.0) to `logs/verifier/reward.txt`. | ||
|
|
||
| 4. Write `prompts/quality.md` with rubric dimensions that sum to 1.0. | ||
|
|
||
| 5. Write `solution/solve.sh` — a shell script that proves the task is solvable | ||
| and validates that the graders work correctly. | ||
|
|
||
| 6. Validate graders before submitting: | ||
|
|
||
| ```bash | ||
| bash run-eval.sh <task-id> --validate | ||
| ``` | ||
|
|
||
| ## Pass / Fail Thresholds | ||
|
|
||
| Following [Anthropic's recommendations](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents): | ||
|
|
||
| | Metric | Threshold | Effect | | ||
| |---|---|---| | ||
| `pass@5 ≥ 80%` | **Merge gate** | Probability of ≥ 1 success in 5 trials must reach 80% | ||
| `pass^5 ≥ 60%` | **Tracked** | Values below 60% flag a skill as flaky and trigger investigation | ||
| | `pass@5 < 60%` | **Blocks merge** | On PRs touching the relevant skill | | ||
|
|
||
| ## CI Integration | ||
|
|
||
| The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs | ||
| automatically on PRs that modify `skills/**` or `evals/**`. It: | ||
|
|
||
| 1. Checks out the repo | ||
| 2. Validates all graders against their reference solutions | ||
| 3. Uploads results as an artifact | ||
| 4. Posts a summary comment on the PR | ||
|
|
||
| ## Grading Strategy | ||
|
|
||
| **Deterministic grader (60% weight)** — checks: | ||
| - Project builds without errors | ||
| - Correct Ignite UI selector is present in the generated template | ||
| - Required imports exist | ||
| - No use of forbidden alternatives | ||
|
Comment on lines
+207
to
+211
|
||
|
|
||
| **LLM rubric grader (40% weight)** — evaluates: | ||
| - Correct intent routing | ||
| - Idiomatic API usage | ||
| - Absence of hallucinated APIs | ||
| - Following the skill's guidance | ||
|
|
||
| ## Results | ||
|
|
||
| Baseline results are stored in `evals/results/baseline.json` and used for | ||
| regression comparison on PRs. The CI workflow uploads per-run results as | ||
| GitHub Actions artifacts. | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| { | ||
| "name": "igniteui-angular-skill-evals", | ||
| "version": "1.0.0", | ||
| "description": "Evaluation suite for Ignite UI for Angular agent skills", | ||
| "private": true, | ||
| "scripts": { | ||
| "eval": "bash run-eval.sh", | ||
| "eval:grid": "bash run-eval.sh grid-basic-setup", | ||
| "eval:combo": "bash run-eval.sh component-combo-reactive-form", | ||
| "eval:theming": "bash run-eval.sh theming-palette-generation", | ||
| "eval:all": "bash run-eval.sh --all", | ||
| "validate": "bash run-eval.sh --all --validate", | ||
| "validate:grid": "bash run-eval.sh grid-basic-setup --validate", | ||
| "validate:combo": "bash run-eval.sh component-combo-reactive-form --validate", | ||
| "validate:theming": "bash run-eval.sh theming-palette-generation --validate" | ||
| }, | ||
| "engines": { | ||
| "node": ">=20.0.0" | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| { | ||
| "generated_at": "2026-03-08T07:00:00.000Z", | ||
| "framework_version": "1.0.0", | ||
| "description": "Initial baseline results for skill evals. Actual scores will be populated after the first full eval run with an API key.", | ||
| "thresholds": { | ||
| "pass_at_5_merge_gate": 0.8, | ||
| "pass_at_5_block": 0.6, | ||
| "pass_pow_5_tracked": 0.6 | ||
| }, | ||
| "tasks": { | ||
| "grid-basic-setup": { | ||
| "skill": "igniteui-angular-grids", | ||
| "trials": 5, | ||
| "pass_rate": null, | ||
| "pass_at_5": null, | ||
| "pass_pow_5": null, | ||
| "status": "pending_first_run" | ||
| }, | ||
| "component-combo-reactive-form": { | ||
| "skill": "igniteui-angular-components", | ||
| "trials": 5, | ||
| "pass_rate": null, | ||
| "pass_at_5": null, | ||
| "pass_pow_5": null, | ||
| "status": "pending_first_run" | ||
| }, | ||
| "theming-palette-generation": { | ||
| "skill": "igniteui-angular-theming", | ||
| "trials": 5, | ||
| "pass_rate": null, | ||
| "pass_at_5": null, | ||
| "pass_pow_5": null, | ||
| "status": "pending_first_run" | ||
| } | ||
| } | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.