Skip to content

Commit 5051269

Browse files
committed
Add gevals action (#2)
* feat: add initial eval tasks Signed-off-by: Calum Murray <[email protected]> * feat: add initial gevals workflow Signed-off-by: Calum Murray <[email protected]> --------- Signed-off-by: Calum Murray <[email protected]>
1 parent ac053ff commit 5051269

File tree

119 files changed

+2603
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

119 files changed

+2603
-0
lines changed

.github/workflows/gevals.yaml

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
# Gevals MCP Evaluation workflow.
#
# Triggers: a weekly schedule, a PR comment, or a manual dispatch. The jobs
# below decide whether a given trigger is allowed to start an evaluation.
name: Gevals MCP Evaluation

on:
  # Weekly schedule - runs every Monday at 9 AM UTC
  schedule:
    - cron: '0 9 * * 1'

  # Manual trigger via PR comments
  issue_comment:
    types: [created]

  # Allow manual workflow dispatch for testing
  workflow_dispatch:
    inputs:
      task-filter:
        description: 'Regular expression to filter tasks (optional)'
        required: false
        default: ''
      verbose:
        description: 'Enable verbose output'
        required: false
        type: boolean
        default: false

concurrency:
  # Only one evaluation per PR (or per ref for schedule/dispatch); a newer run
  # cancels the previous one.
  #
  # For issue_comment events github.ref is always the default branch, so the
  # group is keyed on the PR number instead. Comments that do not contain the
  # trigger phrase get a unique group (the run id): workflow-level concurrency
  # is evaluated before any job-level `if`, so without this an unrelated
  # comment would cancel an evaluation that is already in progress.
  group: ${{ github.workflow }}-${{ (github.event_name == 'issue_comment' && !contains(github.event.comment.body, '/run-gevals') && github.run_id) || github.event.issue.number || github.ref }}
  cancel-in-progress: true

env:
  # Quoted so YAML keeps it a string (an unquoted 1.30 would be read as 1.3).
  GO_VERSION: '1.25'
  KIND_CLUSTER_NAME: mcp-eval-cluster

defaults:
  run:
    shell: bash
37+
38+
jobs:
39+
# Check if workflow should run based on trigger
40+
check-trigger:
41+
name: Check if evaluation should run
42+
runs-on: ubuntu-latest
43+
if: |
44+
github.event_name == 'schedule' ||
45+
github.event_name == 'workflow_dispatch' ||
46+
(github.event_name == 'issue_comment' &&
47+
github.event.issue.pull_request &&
48+
contains(github.event.comment.body, '/run-gevals'))
49+
outputs:
50+
should-run: ${{ steps.check.outputs.should-run }}
51+
pr-number: ${{ steps.check.outputs.pr-number }}
52+
pr-ref: ${{ steps.check.outputs.pr-ref }}
53+
steps:
54+
- name: Check trigger conditions
55+
id: check
56+
run: |
57+
if [[ "${{ github.event_name }}" == "issue_comment" ]]; then
58+
# Check if commenter is a maintainer (has write access)
59+
PERMISSION=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
60+
"https://api.github.com/repos/${{ github.repository }}/collaborators/${{ github.event.comment.user.login }}/permission" \
61+
| jq -r '.permission')
62+
63+
if [[ "$PERMISSION" == "admin" || "$PERMISSION" == "write" ]]; then
64+
echo "should-run=true" >> $GITHUB_OUTPUT
65+
echo "pr-number=${{ github.event.issue.number }}" >> $GITHUB_OUTPUT
66+
echo "pr-ref=refs/pull/${{ github.event.issue.number }}/head" >> $GITHUB_OUTPUT
67+
else
68+
echo "should-run=false" >> $GITHUB_OUTPUT
69+
echo "User ${{ github.event.comment.user.login }} does not have permission to trigger evaluations"
70+
fi
71+
else
72+
echo "should-run=true" >> $GITHUB_OUTPUT
73+
echo "pr-ref=${{ github.ref }}" >> $GITHUB_OUTPUT
74+
fi
75+
76+
# Run gevals evaluation with Kind cluster
77+
run-evaluation:
78+
name: Run MCP Evaluation
79+
needs: check-trigger
80+
if: needs.check-trigger.outputs.should-run == 'true'
81+
runs-on: ubuntu-latest
82+
steps:
83+
- name: Checkout
84+
uses: actions/checkout@v4
85+
with:
86+
ref: ${{ needs.check-trigger.outputs.pr-ref }}
87+
88+
- name: Setup Go
89+
uses: actions/setup-go@v5
90+
with:
91+
go-version: ${{ env.GO_VERSION }}
92+
93+
- name: Setup Kind cluster
94+
run: |
95+
# Install Kind if not already available
96+
if ! command -v kind &> /dev/null; then
97+
curl -Lo ./kind https://kind.sigs.k8s.io/dl/latest/kind-linux-amd64
98+
chmod +x ./kind
99+
sudo mv ./kind /usr/local/bin/kind
100+
fi
101+
102+
# Create Kind cluster (automatically updates ~/.kube/config)
103+
kind create cluster --name ${{ env.KIND_CLUSTER_NAME }} --wait 5m
104+
105+
# Verify cluster is ready
106+
kubectl cluster-info
107+
kubectl get nodes
108+
109+
- name: Build MCP server
110+
run: make build
111+
112+
- name: Start MCP server
113+
run: |
114+
# Start MCP server in background (uses default kubeconfig at ~/.kube/config)
115+
./kubernetes-mcp-server --port 8080 &
116+
MCP_PID=$!
117+
echo "MCP_PID=$MCP_PID" >> $GITHUB_ENV
118+
119+
# Wait for server to be ready
120+
echo "Waiting for MCP server to start..."
121+
for i in {1..30}; do
122+
if curl -s http://localhost:8080/health > /dev/null 2>&1; then
123+
echo "MCP server is ready"
124+
exit 0
125+
fi
126+
echo " Attempt $i/30..."
127+
sleep 2
128+
done
129+
130+
echo "ERROR: MCP server failed to start within 60 seconds"
131+
exit 1
132+
133+
- name: Run gevals evaluation
134+
uses: genmcp/gevals/.github/actions/gevals-action@main
135+
with:
136+
eval-config: 'evals/openai-agent/eval.yaml'
137+
gevals-version: 'latest'
138+
task-filter: ${{ github.event.inputs.task-filter || '' }}
139+
output-format: 'json'
140+
verbose: ${{ github.event.inputs.verbose || 'false' }}
141+
upload-artifacts: 'true'
142+
artifact-name: 'gevals-results'
143+
fail-on-error: 'false'
144+
task-pass-threshold: '0.8'
145+
assertion-pass-threshold: '0.8'
146+
working-directory: '.'
147+
env:
148+
# OpenAI Agent configuration
149+
MODEL_BASE_URL: ${{ secrets.MODEL_BASE_URL }}
150+
MODEL_KEY: ${{ secrets.MODEL_KEY }}
151+
MODEL_NAME: ${{ secrets.MODEL_NAME }}
152+
# LLM Judge configuration
153+
JUDGE_BASE_URL: ${{ secrets.JUDGE_BASE_URL }}
154+
JUDGE_API_KEY: ${{ secrets.JUDGE_API_KEY }}
155+
JUDGE_MODEL_NAME: ${{ secrets.JUDGE_MODEL_NAME }}
156+
157+
- name: Cleanup
158+
if: always()
159+
run: |
160+
# Stop MCP server
161+
if [ -n "$MCP_PID" ]; then
162+
echo "Stopping MCP server (PID: $MCP_PID)"
163+
kill $MCP_PID 2>/dev/null || true
164+
fi
165+
166+
# Delete Kind cluster
167+
echo "Deleting Kind cluster"
168+
kind delete cluster --name ${{ env.KIND_CLUSTER_NAME }} 2>/dev/null || true
169+
170+
- name: Post results comment on PR
171+
if: github.event_name == 'issue_comment' && always()
172+
uses: actions/github-script@v7
173+
with:
174+
script: |
175+
const fs = require('fs');
176+
177+
// Find the results file
178+
const resultsPattern = /gevals-.*-out\.json/;
179+
const files = fs.readdirSync('.');
180+
const resultsFile = files.find(f => resultsPattern.test(f));
181+
182+
if (!resultsFile) {
183+
await github.rest.issues.createComment({
184+
owner: context.repo.owner,
185+
repo: context.repo.repo,
186+
issue_number: ${{ needs.check-trigger.outputs.pr-number }},
187+
body: '❌ Gevals evaluation completed but no results file was found.'
188+
});
189+
return;
190+
}
191+
192+
// Read and parse results
193+
const results = JSON.parse(fs.readFileSync(resultsFile, 'utf8'));
194+
195+
// Calculate summary stats
196+
const totalTasks = results.length;
197+
const passedTasks = results.filter(r => r.taskPassed && r.allAssertionsPassed).length;
198+
const failedTasks = totalTasks - passedTasks;
199+
const passRate = totalTasks > 0 ? ((passedTasks / totalTasks) * 100).toFixed(1) : 0;
200+
201+
// Build comment body
202+
let comment = '## 🤖 Gevals MCP Evaluation Results\n\n';
203+
comment += `**Summary:** ${passedTasks}/${totalTasks} tasks passed (${passRate}%)\n\n`;
204+
205+
if (failedTasks > 0) {
206+
comment += '### ❌ Failed Tasks\n\n';
207+
results.filter(r => !r.taskPassed || !r.allAssertionsPassed).forEach(task => {
208+
comment += `- **${task.taskName}**\n`;
209+
comment += ` - Task Passed: ${task.taskPassed ? '✅' : '❌'}\n`;
210+
comment += ` - Assertions Passed: ${task.allAssertionsPassed ? '✅' : '❌'}\n`;
211+
});
212+
comment += '\n';
213+
}
214+
215+
if (passedTasks > 0) {
216+
comment += '### ✅ Passed Tasks\n\n';
217+
results.filter(r => r.taskPassed && r.allAssertionsPassed).forEach(task => {
218+
comment += `- ${task.taskName}\n`;
219+
});
220+
comment += '\n';
221+
}
222+
223+
comment += `[View full results](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})`;
224+
225+
await github.rest.issues.createComment({
226+
owner: context.repo.owner,
227+
repo: context.repo.repo,
228+
issue_number: ${{ needs.check-trigger.outputs.pr-number }},
229+
body: comment
230+
});

evals/README.md

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Kubernetes MCP Server Test Examples
2+
3+
This directory contains examples for testing the **same Kubernetes MCP server** using different AI agents.
4+
5+
## Structure
6+
7+
```
8+
evals/
9+
├── README.md # This file
10+
├── mcp-config.yaml # Shared MCP server configuration
11+
├── tasks/ # Shared test tasks
12+
│ ├── create-pod.yaml
13+
│ ├── setup.sh
14+
│ ├── verify.sh
15+
│ └── cleanup.sh
16+
├── claude-code/ # Claude Code agent configuration
17+
│ ├── agent.yaml
18+
│ ├── eval.yaml
19+
│ └── eval-inline.yaml
20+
└── openai-agent/ # OpenAI-compatible agent configuration
21+
├── agent.yaml
22+
├── eval.yaml
23+
└── eval-inline.yaml
24+
```
25+
26+
## What This Tests
27+
28+
Both examples test the **same Kubernetes MCP server** using **shared task definitions**:
29+
- Creates an nginx pod named `web-server` in the `create-pod-test` namespace
30+
- Verifies the pod is running
31+
- Validates that the agent called appropriate Kubernetes tools
32+
- Cleans up resources
33+
34+
The tasks and MCP configuration are shared - only the agent configuration differs.
35+
36+
## Prerequisites
37+
38+
- Kubernetes cluster (kind, minikube, or any cluster)
39+
- kubectl configured
40+
- Kubernetes MCP server running at `http://localhost:8080/mcp`
41+
- Built binaries: `gevals` and `agent`
42+
43+
## Running Examples
44+
45+
### Option 1: Claude Code
46+
47+
```bash
48+
./gevals eval evals/claude-code/eval.yaml
49+
```
50+
51+
**Requirements:**
52+
- Claude Code installed and in PATH
53+
54+
**Tool Usage:**
55+
- Claude typically uses pod-specific tools like `pods_run`, `pods_create`
56+
57+
---
58+
59+
### Option 2: OpenAI-Compatible Agent (Built-in)
60+
61+
```bash
62+
# Set your model credentials
63+
export MODEL_BASE_URL='https://your-api-endpoint.com/v1'
64+
export MODEL_KEY='your-api-key'
65+
export MODEL_NAME='your-model-name'
66+
67+
# Run the test
68+
./gevals eval evals/openai-agent/eval.yaml
69+
```
70+
71+
**Note:** Different AI models may choose different tools from the MCP server (`pods_*` or `resources_*`) to accomplish the same task. Both approaches work correctly.
72+
73+
## Assertions
74+
75+
Both examples use flexible assertions that accept either tool approach:
76+
77+
```yaml
78+
toolPattern: "(pods_.*|resources_.*)" # Accepts both pod-specific and generic resource tools
79+
```
80+
81+
This makes the tests robust across different AI models that may prefer different tools.
82+
83+
## Key Difference: Agent Configuration
84+
85+
### Claude Code (claude-code/agent.yaml)
86+
```yaml
87+
commands:
88+
argTemplateMcpServer: "--mcp-config {{ .File }}"
89+
argTemplateAllowedTools: "mcp__{{ .ServerName }}__{{ .ToolName }}"
90+
runPrompt: |-
91+
claude {{ .McpServerFileArgs }} --print "{{ .Prompt }}"
92+
```
93+
94+
### OpenAI Agent (openai-agent/agent.yaml)
95+
```yaml
96+
builtin:
97+
type: "openai-agent"
98+
model: "gpt-4"
99+
```
100+
101+
Uses the built-in OpenAI agent with model configuration.
102+
103+
## Expected Results
104+
105+
Both examples should produce:
106+
- ✅ Task passed - pod created successfully
107+
- ✅ Assertions passed - appropriate tools were called
108+
- ✅ Verification passed - pod exists and is running
109+
110+
Results saved to: `gevals-<eval-name>-out.json`

evals/claude-code/agent.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
# Agent definition selecting the built-in Claude Code agent.
# NOTE(review): nesting reconstructed from a flattened diff view - confirm
# against the repository copy.
kind: Agent
metadata:
  name: 'claude-code'
builtin:
  type: 'claude-code'

evals/claude-code/eval-inline.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
# Eval definition with the agent declared inline.
# NOTE(review): nesting reconstructed from a flattened diff view - confirm
# against the repository copy (in particular where `assertions` attaches).
kind: Eval
metadata:
  name: 'kubernetes-basic-operations'
config:
  # Inline agent configuration - no separate agent.yaml file needed
  agent:
    type: 'builtin.claude-code'
  mcpConfigFile: '../mcp-config.yaml'
  llmJudge:
    # Each value is the NAME of an environment variable, not the setting itself.
    env:
      baseUrlKey: JUDGE_BASE_URL
      apiKeyKey: JUDGE_API_KEY
      modelNameKey: JUDGE_MODEL_NAME
  taskSets:
    - glob: '../tasks/*/*.yaml'
      assertions:
        toolsUsed:
          - server: kubernetes
            toolPattern: '.*'
        minToolCalls: 1
        maxToolCalls: 20

evals/claude-code/eval.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
# Eval definition that loads the agent from a separate file.
# NOTE(review): nesting reconstructed from a flattened diff view - confirm
# against the repository copy (in particular where `assertions` attaches).
kind: Eval
metadata:
  name: 'kubernetes-basic-operations'
config:
  # Agent definition is read from the given file.
  agent:
    type: 'file'
    path: agent.yaml
  mcpConfigFile: '../mcp-config.yaml'
  llmJudge:
    # Each value is the NAME of an environment variable, not the setting itself.
    env:
      baseUrlKey: JUDGE_BASE_URL
      apiKeyKey: JUDGE_API_KEY
      modelNameKey: JUDGE_MODEL_NAME
  taskSets:
    - glob: '../tasks/*/*.yaml'
      assertions:
        toolsUsed:
          - server: kubernetes
            toolPattern: '.*'
        minToolCalls: 1
        maxToolCalls: 20

evals/mcp-config.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
# MCP server configuration: one HTTP server registered under the name
# `kubernetes`.
# NOTE(review): nesting reconstructed from a flattened diff view - confirm
# against the repository copy.
mcpServers:
  kubernetes:
    type: http
    url: 'http://localhost:8080/mcp'
    enableAllTools: true

0 commit comments

Comments
 (0)