# Source: igniteui-angular/.github/workflows/skill-eval.yml at 665264b892ec37e00b09439753fdeaf4f2dabf80 (IgniteUI/igniteui-angular)

# Triggers: pull requests that touch skills/ or evals/, and manual dispatch
# with an agent selection and a trial count.
name: Skill Eval

on:
  # Automatic trigger, limited to PRs changing skill or eval files.
  pull_request:
    paths:
      - 'skills/**'
      - 'evals/**'
  # Manual trigger; the inputs below are read by the agent_eval job.
  workflow_dispatch:
    inputs:
      agent:
        type: choice
        description: 'Agent to run evals against (copilot or gemini)'
        options:
          - copilot
          - gemini
        default: 'copilot'
        required: true
      trials:
        # Free-form string input; the default is the string '1'.
        type: string
        description: 'Number of trials per task'
        default: '1'
        required: false

# Workflow-level GITHUB_TOKEN permissions; they apply to every job that does
# not declare its own `permissions` block.
permissions:
  contents: read
  pull-requests: write

jobs:
  # Job 1: Always validate graders against reference solutions.
  # Has no `if:` condition, so it runs on every trigger of this workflow.
  validate_graders:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    # Least privilege: this job only checks out the repository, runs the eval
    # script and uploads an artifact, so it does not need the workflow-level
    # `pull-requests: write` scope.
    permissions:
      contents: read

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      # Runs the eval script from the evals/ directory in validation mode.
      - name: Validate graders against reference solutions
        working-directory: evals
        run: bash run-eval.sh --all --validate

      # `always()` keeps the upload running even when validation fails; the
      # post_summary job downloads this artifact by name.
      - name: Upload validation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-validation-results
          path: evals/results/
          retention-days: 30

  # Job 2: Run evals against an AI agent (copilot or gemini)
  # Runs only for manual workflow_dispatch triggers (see the `if:` below).
  agent_eval:
    if: github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    timeout-minutes: 60

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      # Only the CLI matching the selected agent is installed.
      - name: Install Copilot CLI
        if: inputs.agent == 'copilot'
        run: npm install -g @github/copilot

      - name: Install Gemini CLI
        if: inputs.agent == 'gemini'
        run: npm install -g @google/gemini-cli

      - name: Run agent-based eval
        working-directory: evals
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          # Dispatch inputs are passed through the environment instead of
          # being interpolated into the script text, so a crafted value
          # (notably the free-form `trials` string) cannot inject shell
          # commands or split into extra arguments.
          EVAL_AGENT: ${{ inputs.agent }}
          EVAL_TRIALS: ${{ inputs.trials || '1' }}
        run: |
          bash run-eval.sh --all \
            --agent "$EVAL_AGENT" \
            --trials "$EVAL_TRIALS"

      # `always()` keeps the upload running even when the eval step fails.
      - name: Upload agent eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: skill-eval-agent-${{ inputs.agent }}-results
          path: evals/results/
          retention-days: 30

  # Job 3: Post summary comment on PRs
  # Runs after validate_graders regardless of its outcome (`always()`), but
  # only for pull_request events whose head repository is not a fork.
  post_summary:
    if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false
    needs: [validate_graders]
    runs-on: ubuntu-latest

    steps:
      # The artifact may be missing (e.g. validation failed before writing any
      # results). Do not fail the job in that case: the script below already
      # reports unreadable results in the comment body.
      - name: Download validation results
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          name: skill-eval-validation-results
          path: evals/results/

      - name: Post summary comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');

            const resultsDir = 'evals/results';
            let summary = '## 📊 Skill Eval Results\n\n';

            try {
              // Every *.json file except baseline.json is treated as one task result.
              const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json') && f !== 'baseline.json');
              if (files.length === 0) {
                summary += '> ⚠️ No eval results found. The eval run may have failed.\n';
              } else {
                summary += '| Task | Agent | Pass Rate | pass@k | Status |\n';
                summary += '|---|---|---|---|---|\n';

                for (const file of files) {
                  try {
                    const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf8'));
                    const taskName = data.task || file.replace('.json', '');
                    const agent = data.agent || 'reference';
                    const passRate = data.passRate != null ? `${(data.passRate * 100).toFixed(0)}%` : 'N/A';
                    const passAtK = data.passAtK != null ? `${(data.passAtK * 100).toFixed(0)}%` : 'N/A';
                    // A missing pass@k must not be rendered as a failure:
                    // comparing undefined/null against the thresholds is
                    // always false and would otherwise yield ❌ next to "N/A".
                    const status = data.passAtK == null
                      ? '❔'
                      : data.passAtK >= 0.8 ? '✅' : data.passAtK >= 0.6 ? '⚠️' : '❌';
                    summary += `| ${taskName} | ${agent} | ${passRate} | ${passAtK} | ${status} |\n`;
                  } catch (e) {
                    // Unreadable or malformed result file: emit an error row and continue.
                    summary += `| ${file} | — | Error | Error | ❌ |\n`;
                  }
                }

                summary += '\n### Thresholds\n';
                summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n';
                summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n';
                summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n';
                summary += '- ❔ `pass@k` not reported in the result file\n';
              }
            } catch (e) {
              // Reached when the results directory is missing or unreadable.
              summary += `> ⚠️ Could not read results: ${e.message}\n`;
            }

            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: summary,
            });