Changes from 7 commits (22 commits in the branch)

- `5b7cca0` Initial plan (Copilot, Mar 8, 2026)
- `23aecf0` feat: scaffold eval test suite with three skill tasks and CI workflow (Copilot, Mar 8, 2026)
- `ac1335a` refactor: improve regex readability in grader scripts per code review (Copilot, Mar 8, 2026)
- `f807aa3` fix: add explicit permissions to skill-eval workflow (CodeQL alert) (Copilot, Mar 8, 2026)
- `c183089` Merge branch 'master' into copilot/implement-automated-eval-test-suite (kdinev, Mar 9, 2026)
- `6e7b838` fix: replace skill-eval package dep with self-contained local runner (Copilot, Mar 9, 2026)
- `b2047d8` fix: emit passRate/passAtK in result JSON so CI summary shows actual … (Copilot, Mar 10, 2026)
- `1691296` Update evals/tasks/component-combo-reactive-form/solution/solve.sh (kdinev, Mar 10, 2026)
- `2df335e` Update .github/workflows/skill-eval.yml (kdinev, Mar 10, 2026)
- `b22b13f` fix: tighten grader checks per review feedback (Copilot, Mar 10, 2026)
- `c684351` Merge branch 'master' into copilot/implement-automated-eval-test-suite (kdinev, Mar 10, 2026)
- `94d4bf8` Update Node.js version in skill-eval workflow (kdinev, Mar 10, 2026)
- `18f3e25` Update skill-eval.yml (kdinev, Mar 10, 2026)
- `568b04d` Remove eval dependencies installation step (kdinev, Mar 10, 2026)
- `5da6711` Merge branch 'master' into copilot/implement-automated-eval-test-suite (kdinev, Mar 10, 2026)
- `b181ca0` feat: add copilot-cli and gemini-cli agent modes to eval runner (Copilot, Mar 10, 2026)
- `665264b` fix: use read -ra for safe array parsing, add TRIALS guard (Copilot, Mar 10, 2026)
- `b3fa973` Update skill-eval.yml (kdinev, Mar 10, 2026)
- `1330989` rework CI workflow to always run against both copilot and gemini agents (Copilot, Mar 10, 2026)
- `a9da524` add agent prompt files, switch CI to npm scripts, clean up README (Copilot, Mar 10, 2026)
- `566551b` Merge branch 'master' into copilot/implement-automated-eval-test-suite (kdinev, Mar 11, 2026)
- `148691b` fix: agent eval CI failures - return 0 in agent mode, fix Gemini --yo… (Copilot, Mar 11, 2026)
89 changes: 89 additions & 0 deletions .github/workflows/skill-eval.yml
@@ -0,0 +1,89 @@
```yaml
name: Skill Eval

on:
  pull_request:
    paths:
      - 'skills/**'
      - 'evals/**'

permissions:
  contents: read
  pull-requests: write

jobs:
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 30

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      - name: Install eval dependencies
        working-directory: evals
        run: npm install --ignore-scripts

      - name: Validate graders against reference solutions
        working-directory: evals
        run: bash run-eval.sh --all --validate

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-results
          path: evals/results/
          retention-days: 30

      - name: Post summary comment
        if: always() && github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');

            const resultsDir = 'evals/results';
            let summary = '## 📊 Skill Eval Results\n\n';

            try {
              const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json') && f !== 'baseline.json');
              if (files.length === 0) {
                summary += '> ⚠️ No eval results found. The eval run may have failed.\n';
              } else {
                summary += '| Task | Pass Rate | pass@5 | Status |\n';
                summary += '|---|---|---|---|\n';

                for (const file of files) {
                  try {
                    const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8'));
                    const taskName = data.task || file.replace('.json', '');
                    const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A';
                    const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A';
                    const status = data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? '⚠️' : '❌';
                    summary += `| ${taskName} | ${passRate} | ${passAtK} | ${status} |\n`;
                  } catch (e) {
                    summary += `| ${file} | Error | Error | ❌ |\n`;
                  }
                }

                summary += '\n### Thresholds\n';
                summary += '- ✅ `pass@5 ≥ 80%` — merge gate passed\n';
                summary += '- ⚠️ `pass@5 ≥ 60%` — needs investigation\n';
                summary += '- ❌ `pass@5 < 60%` — blocks merge for affected skill\n';
              }
            } catch (e) {
              summary += `> ⚠️ Could not read results: ${e.message}\n`;
            }

            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: summary,
            });
```
5 changes: 5 additions & 0 deletions .gitignore
@@ -56,3 +56,8 @@ extras/docs/themes/sassdoc/sassdoc/*

# Localization sources
i18nRepo

# Eval artifacts (keep baseline results)
evals/node_modules
evals/results/*.json
!evals/results/baseline.json
135 changes: 135 additions & 0 deletions evals/README.md
@@ -0,0 +1,135 @@
# Ignite UI for Angular — Skill Evals

Automated evaluation suite for the Ignite UI for Angular agent skills.
Inspired by the [skill-eval](https://github.com/mgechev/skill-eval) reference
architecture and extended with patterns from
[Anthropic's agent eval research](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents).

The infrastructure is **self-contained** — there are no external eval-framework
dependencies. A lightweight shell runner (`run-eval.sh`) executes each task's
reference solution and deterministic grader.

## Overview

The suite tests three skills:

| Skill | Task ID | What it tests |
|---|---|---|
| `igniteui-angular-grids` | `grid-basic-setup` | Grid with sorting and pagination over flat employee data |
| `igniteui-angular-components` | `component-combo-reactive-form` | Multi-select combo bound to a reactive form control |
| `igniteui-angular-theming` | `theming-palette-generation` | Custom branded palette with `palette()` and `theme()` |

Each task includes:

- **`instruction.md`** — the prompt given to the agent
- **`tests/test.sh`** — deterministic grader (file checks, compilation, lint)
- **`prompts/quality.md`** — LLM rubric grader (intent routing, API usage)
- **`solution/solve.sh`** — reference solution for baseline validation
- **`environment/Dockerfile`** — isolated environment for agent execution
- **`skills/`** — symlinked skill files under test

## Prerequisites

- Bash 4+
- `bc` (installed by default on most Linux / macOS systems)

## Running Evals Locally

### Validate graders against reference solutions

This applies each task's `solution/solve.sh`, then runs `tests/test.sh` to
confirm the grader scores 100%. Use this to catch grader regressions.

```bash
cd evals

# Validate all tasks
bash run-eval.sh --all --validate

# Validate a single task
bash run-eval.sh grid-basic-setup --validate
```

### npm scripts (convenience wrappers)

```bash
cd evals
npm run validate # all tasks
npm run validate:grid # grid-basic-setup only
npm run validate:combo # component-combo-reactive-form only
npm run validate:theming # theming-palette-generation only
```

## Adding a New Task

1. Create a directory under `evals/tasks/<task-id>/` with the standard structure:

```
tasks/<task-id>/
├── task.toml               # Config: grader metadata, weights, timeouts
├── instruction.md          # Agent prompt
├── environment/Dockerfile  # Container setup (for future Docker-based runs)
├── tests/test.sh           # Deterministic grader
├── prompts/quality.md      # LLM rubric grader
├── solution/solve.sh       # Reference solution
└── skills/                 # Skill files under test
    └── <skill-name>/SKILL.md
```

2. Write a clear, unambiguous `instruction.md` that tells the agent exactly what
to build.

3. Write `tests/test.sh` to check **outcomes** (files exist, project compiles,
correct selectors are present) rather than specific steps. The grader must
write a reward (0.0–1.0) to `logs/verifier/reward.txt`.

4. Write `prompts/quality.md` with rubric dimensions that sum to 1.0.

5. Write `solution/solve.sh` — a shell script that proves the task is solvable
and validates that the graders work correctly.

6. Validate graders before submitting:

```bash
bash run-eval.sh <task-id> --validate
```

## Pass / Fail Thresholds

Following [Anthropic's recommendations](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents):

| Metric | Threshold | Effect |
|---|---|---|
| `pass@5 ≥ 80%` | **Merge gate** | Probability of at least 1 success in 5 trials must reach 80% |
| `pass^5 ≥ 60%` | **Tracked** | Probability that all 5 trials succeed; below 60% flags a flaky skill |
| `pass@5 < 60%` | **Blocks merge** | Blocks PRs touching the relevant skill |

## CI Integration

The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs
automatically on PRs that modify `skills/**` or `evals/**`. It:

1. Checks out the repo
2. Validates all graders against their reference solutions
3. Uploads results as an artifact
4. Posts a summary comment on the PR

## Grading Strategy

**Deterministic grader (60% weight)** — checks:
- Project builds without errors
- Correct Ignite UI selector is present in the generated template
- Required imports exist
- No use of forbidden alternatives
> **Copilot AI** (Mar 10, 2026), commenting on lines +207 to +211:
>
> The README claims deterministic graders check that the project builds, but the current tests/test.sh graders for the tasks only do file/grep checks and never run an Angular build/lint step. Either update the README to match the current behavior, or update the runner/task graders to actually compile (which would require seeding a full Angular workspace rather than just src/).

**LLM rubric grader (40% weight)** — evaluates:
- Correct intent routing
- Idiomatic API usage
- Absence of hallucinated APIs
- Following the skill's guidance
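A hypothetical shape for such a `prompts/quality.md` rubric; the dimensions mirror the list above, but the weights and wording here are illustrative only:

```markdown
# Quality Rubric

Score each dimension from 0.0 up to its weight; weights sum to 1.0.

| Dimension | Weight | What to look for |
|---|---|---|
| Intent routing | 0.3 | The agent consulted the correct skill for the request |
| Idiomatic API usage | 0.3 | Public Ignite UI APIs are used as the skill documents them |
| No hallucinated APIs | 0.2 | Every referenced symbol exists in the library |
| Skill adherence | 0.2 | Guidance in SKILL.md was followed, not contradicted |
```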

## Results

Baseline results are stored in `evals/results/baseline.json` and used for
regression comparison on PRs. The CI workflow uploads per-run results as
GitHub Actions artifacts.
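A per-run result file could look like the following sketch. The `task`, `passRate`, and `passAtK` keys are the ones the workflow's summary-comment script reads; the values and extra fields here are made up:

```json
{
  "task": "grid-basic-setup",
  "trials": 5,
  "passRate": 0.8,
  "passAtK": 1.0
}
```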
20 changes: 20 additions & 0 deletions evals/package.json
@@ -0,0 +1,20 @@
```json
{
  "name": "igniteui-angular-skill-evals",
  "version": "1.0.0",
  "description": "Evaluation suite for Ignite UI for Angular agent skills",
  "private": true,
  "scripts": {
    "eval": "bash run-eval.sh",
    "eval:grid": "bash run-eval.sh grid-basic-setup",
    "eval:combo": "bash run-eval.sh component-combo-reactive-form",
    "eval:theming": "bash run-eval.sh theming-palette-generation",
    "eval:all": "bash run-eval.sh --all",
    "validate": "bash run-eval.sh --all --validate",
    "validate:grid": "bash run-eval.sh grid-basic-setup --validate",
    "validate:combo": "bash run-eval.sh component-combo-reactive-form --validate",
    "validate:theming": "bash run-eval.sh theming-palette-generation --validate"
  },
  "engines": {
    "node": ">=20.0.0"
  }
}
```
36 changes: 36 additions & 0 deletions evals/results/baseline.json
@@ -0,0 +1,36 @@
```json
{
  "generated_at": "2026-03-08T07:00:00.000Z",
  "framework_version": "1.0.0",
  "description": "Initial baseline results for skill evals. Actual scores will be populated after the first full eval run with an API key.",
  "thresholds": {
    "pass_at_5_merge_gate": 0.8,
    "pass_at_5_block": 0.6,
    "pass_pow_5_tracked": 0.6
  },
  "tasks": {
    "grid-basic-setup": {
      "skill": "igniteui-angular-grids",
      "trials": 5,
      "pass_rate": null,
      "pass_at_5": null,
      "pass_pow_5": null,
      "status": "pending_first_run"
    },
    "component-combo-reactive-form": {
      "skill": "igniteui-angular-components",
      "trials": 5,
      "pass_rate": null,
      "pass_at_5": null,
      "pass_pow_5": null,
      "status": "pending_first_run"
    },
    "theming-palette-generation": {
      "skill": "igniteui-angular-theming",
      "trials": 5,
      "pass_rate": null,
      "pass_at_5": null,
      "pass_pow_5": null,
      "status": "pending_first_run"
    }
  }
}
```