Commit 130e42c

chore(ci): add gevals action to github workflows (containers#505)

Signed-off-by: Calum Murray <[email protected]>

1 parent 10918df

File tree: 120 files changed (+2563 / -0 lines changed)

Large commits have some content hidden by default, so only a subset of the changed files is shown below.

.github/workflows/gevals.yaml

Lines changed: 155 additions & 0 deletions

```yaml
name: Gevals MCP Evaluation

on:
  # Weekly schedule - runs every Monday at 9 AM UTC
  schedule:
    - cron: '0 9 * * 1'

  # Manual trigger via PR comments
  issue_comment:
    types: [created]

  # Allow manual workflow dispatch for testing
  workflow_dispatch:
    inputs:
      task-filter:
        description: 'Regular expression to filter tasks (optional)'
        required: false
        default: ''
      verbose:
        description: 'Enable verbose output'
        required: false
        type: boolean
        default: false

permissions:
  contents: read
  pull-requests: write
  issues: write

concurrency:
  # Only run once for latest commit per ref and cancel other (previous) runs.
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  GO_VERSION: 1.25
  KIND_CLUSTER_NAME: mcp-eval-cluster

defaults:
  run:
    shell: bash

jobs:
  # Check if workflow should run based on trigger
  check-trigger:
    name: Check if evaluation should run
    runs-on: ubuntu-latest
    if: |
      github.event_name == 'schedule' ||
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'issue_comment' &&
      github.event.issue.pull_request &&
      contains(github.event.comment.body, '/run-gevals'))
    outputs:
      should-run: ${{ steps.check.outputs.should-run }}
      pr-number: ${{ steps.check.outputs.pr-number }}
      pr-ref: ${{ steps.check.outputs.pr-ref }}
    steps:
      - name: Check trigger conditions
        id: check
        run: |
          if [[ "${{ github.event_name }}" == "issue_comment" ]]; then
            # Check if commenter is a maintainer (has write access)
            PERMISSION=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
              "https://api.github.com/repos/${{ github.repository }}/collaborators/${{ github.event.comment.user.login }}/permission" \
              | jq -r '.permission')

            if [[ "$PERMISSION" == "admin" || "$PERMISSION" == "write" ]]; then
              echo "should-run=true" >> $GITHUB_OUTPUT
              echo "pr-number=${{ github.event.issue.number }}" >> $GITHUB_OUTPUT
              echo "pr-ref=refs/pull/${{ github.event.issue.number }}/head" >> $GITHUB_OUTPUT
            else
              echo "should-run=false" >> $GITHUB_OUTPUT
              echo "User ${{ github.event.comment.user.login }} does not have permission to trigger evaluations"
            fi
          else
            echo "should-run=true" >> $GITHUB_OUTPUT
            echo "pr-ref=${{ github.ref }}" >> $GITHUB_OUTPUT
          fi

  # Run gevals evaluation with Kind cluster
  run-evaluation:
    name: Run MCP Evaluation
    needs: check-trigger
    if: needs.check-trigger.outputs.should-run == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          ref: ${{ needs.check-trigger.outputs.pr-ref }}

      - name: Setup Go
        uses: actions/setup-go@v5
        with:
          go-version: ${{ env.GO_VERSION }}

      - name: Setup Kind cluster
        run: make kind-create-cluster KIND_CLUSTER_NAME=${{ env.KIND_CLUSTER_NAME }}

      - name: Start MCP server
        run: make run-server

      - name: Run gevals evaluation
        id: gevals
        uses: genmcp/gevals/.github/actions/gevals-action@main
        with:
          eval-config: 'evals/openai-agent/eval.yaml'
          gevals-version: 'latest'
          task-filter: ${{ github.event.inputs.task-filter || '' }}
          output-format: 'json'
          verbose: ${{ github.event.inputs.verbose || 'false' }}
          upload-artifacts: 'true'
          artifact-name: 'gevals-results'
          fail-on-error: 'false'
          task-pass-threshold: '0.8'
          assertion-pass-threshold: '0.8'
          working-directory: '.'
        env:
          # OpenAI Agent configuration
          MODEL_BASE_URL: ${{ secrets.MODEL_BASE_URL }}
          MODEL_KEY: ${{ secrets.MODEL_KEY }}
          MODEL_NAME: ${{ secrets.MODEL_NAME }}
          # LLM Judge configuration
          JUDGE_BASE_URL: ${{ secrets.JUDGE_BASE_URL }}
          JUDGE_API_KEY: ${{ secrets.JUDGE_API_KEY }}
          JUDGE_MODEL_NAME: ${{ secrets.JUDGE_MODEL_NAME }}

      - name: Cleanup
        if: always()
        run: |
          make stop-server || true
          make kind-delete-cluster KIND_CLUSTER_NAME=${{ env.KIND_CLUSTER_NAME }} || true

      - name: Post results comment on PR
        if: github.event_name == 'issue_comment' && always()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          PASS_RATE=$(awk "BEGIN {printf \"%.1f\", ${{ steps.gevals.outputs.task-pass-rate }} * 100}")

          gh pr comment ${{ needs.check-trigger.outputs.pr-number }} --body "$(cat <<EOF
          ## Gevals MCP Evaluation Results

          **Summary:** ${{ steps.gevals.outputs.tasks-passed }}/${{ steps.gevals.outputs.tasks-total }} tasks passed (${PASS_RATE}%)

          | Metric | Result |
          |--------|--------|
          | Tasks Passed | ${{ steps.gevals.outputs.tasks-passed }}/${{ steps.gevals.outputs.tasks-total }} |
          | Assertions Passed | ${{ steps.gevals.outputs.assertions-passed }}/${{ steps.gevals.outputs.assertions-total }} |
          | Overall | ${{ steps.gevals.outputs.passed == 'true' && 'Passed' || 'Failed' }} |

          [View full results](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})
          EOF
          )"
```

build/gevals.mk

Lines changed: 35 additions & 0 deletions

```makefile
# Gevals evaluation support

MCP_PORT ?= 8080
MCP_HEALTH_TIMEOUT ?= 60
MCP_HEALTH_INTERVAL ?= 2

.PHONY: run-server
run-server: build ## Start MCP server in background and wait for health check
	@echo "Starting MCP server on port $(MCP_PORT)..."
	@./$(BINARY_NAME) --port $(MCP_PORT) & echo $$! > .mcp-server.pid
	@echo "MCP server started with PID $$(cat .mcp-server.pid)"
	@echo "Waiting for MCP server to be ready..."
	@elapsed=0; \
	while [ $$elapsed -lt $(MCP_HEALTH_TIMEOUT) ]; do \
		if curl -s http://localhost:$(MCP_PORT)/health > /dev/null 2>&1; then \
			echo "MCP server is ready"; \
			exit 0; \
		fi; \
		echo "  Waiting... ($$elapsed/$(MCP_HEALTH_TIMEOUT)s)"; \
		sleep $(MCP_HEALTH_INTERVAL); \
		elapsed=$$((elapsed + $(MCP_HEALTH_INTERVAL))); \
	done; \
	echo "ERROR: MCP server failed to start within $(MCP_HEALTH_TIMEOUT) seconds"; \
	exit 1

.PHONY: stop-server
stop-server: ## Stop the MCP server started by run-server
	@if [ -f .mcp-server.pid ]; then \
		PID=$$(cat .mcp-server.pid); \
		echo "Stopping MCP server (PID: $$PID)"; \
		kill $$PID 2>/dev/null || true; \
		rm -f .mcp-server.pid; \
	else \
		echo "No .mcp-server.pid file found"; \
	fi
```
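
As a rough local usage sketch, assuming the repository's top-level Makefile includes build/gevals.mk and provides the `build` target and `BINARY_NAME` variable it relies on:

```bash
# Build and start the MCP server in the background, then wait for /health (up to 60s)
make run-server MCP_PORT=8080

# The server PID is tracked in .mcp-server.pid; a manual health probe looks like this
curl -s http://localhost:8080/health

# Stop the background server and remove the PID file
make stop-server
```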

evals/README.md

Lines changed: 110 additions & 0 deletions

# Kubernetes MCP Server Test Examples

This directory contains examples for testing the **same Kubernetes MCP server** using different AI agents.

## Structure

```
evals/
├── README.md              # This file
├── mcp-config.yaml        # Shared MCP server configuration
├── tasks/                 # Shared test tasks
│   ├── create-pod.yaml
│   ├── setup.sh
│   ├── verify.sh
│   └── cleanup.sh
├── claude-code/           # Claude Code agent configuration
│   ├── agent.yaml
│   ├── eval.yaml
│   └── eval-inline.yaml
└── openai-agent/          # OpenAI-compatible agent configuration
    ├── agent.yaml
    ├── eval.yaml
    └── eval-inline.yaml
```

## What This Tests

Both examples test the **same Kubernetes MCP server** using **shared task definitions**:

- Creates an nginx pod named `web-server` in the `create-pod-test` namespace
- Verifies the pod is running
- Validates that the agent called appropriate Kubernetes tools
- Cleans up resources

The tasks and MCP configuration are shared; only the agent configuration differs.

## Prerequisites

- Kubernetes cluster (kind, minikube, or any other cluster)
- kubectl configured
- Kubernetes MCP server running at `http://localhost:8080/mcp`
- Built binaries: `gevals` and `agent`

## Running Examples

### Option 1: Claude Code

```bash
./gevals eval evals/claude-code/eval.yaml
```

**Requirements:**
- Claude Code installed and in PATH

**Tool Usage:**
- Claude typically uses pod-specific tools such as `pods_run` and `pods_create`

---

### Option 2: OpenAI-Compatible Agent (Built-in)

```bash
# Set your model credentials
export MODEL_BASE_URL='https://your-api-endpoint.com/v1'
export MODEL_KEY='your-api-key'
export MODEL_NAME='your-model-name'

# Run the test
./gevals eval evals/openai-agent/eval.yaml
```

**Note:** Different AI models may choose different tools from the MCP server (`pods_*` or `resources_*`) to accomplish the same task. Both approaches work correctly.

## Assertions

Both examples use flexible assertions that accept either tool approach:

```yaml
toolPattern: "(pods_.*|resources_.*)"  # Accepts both pod-specific and generic resource tools
```

This makes the tests robust across different AI models that may prefer different tools.

## Key Difference: Agent Configuration

### Claude Code (claude-code/agent.yaml)

```yaml
commands:
  argTemplateMcpServer: "--mcp-config {{ .File }}"
  argTemplateAllowedTools: "mcp__{{ .ServerName }}__{{ .ToolName }}"
  runPrompt: |-
    claude {{ .McpServerFileArgs }} --print "{{ .Prompt }}"
```

### OpenAI Agent (openai-agent/agent.yaml)

```yaml
builtin:
  type: "openai-agent"
  model: "gpt-4"
```

Uses the built-in OpenAI agent with model configuration.

## Expected Results

Both examples should produce:

- ✅ Task passed - pod created successfully
- ✅ Assertions passed - appropriate tools were called
- ✅ Verification passed - pod exists and is running

Results are saved to: `gevals-<eval-name>-out.json`
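
The shared task files under `tasks/` are not included in this excerpt. As a hedged illustration only, a verification step consistent with the behaviour described above (pod `web-server` running in the `create-pod-test` namespace) might look roughly like this; the script name and exact checks are assumptions, not the committed content:

```bash
#!/usr/bin/env bash
# Hypothetical verify step for the create-pod task
set -euo pipefail

# Wait for the nginx pod created by the agent to become Ready
kubectl wait --for=condition=Ready pod/web-server -n create-pod-test --timeout=60s

# Print the reported pod phase as a final sanity check
kubectl get pod web-server -n create-pod-test -o jsonpath='{.status.phase}'
```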

evals/claude-code/agent.yaml

Lines changed: 5 additions & 0 deletions

```yaml
kind: Agent
metadata:
  name: "claude-code"
builtin:
  type: "claude-code"
```

evals/claude-code/eval-inline.yaml

Lines changed: 21 additions & 0 deletions

```yaml
kind: Eval
metadata:
  name: "kubernetes-basic-operations"
config:
  # Inline agent configuration - no separate agent.yaml file needed
  agent:
    type: "builtin.claude-code"
  mcpConfigFile: ../mcp-config.yaml
  llmJudge:
    env:
      baseUrlKey: JUDGE_BASE_URL
      apiKeyKey: JUDGE_API_KEY
      modelNameKey: JUDGE_MODEL_NAME
  taskSets:
    - glob: ../tasks/*/*.yaml
  assertions:
    toolsUsed:
      - server: kubernetes
        toolPattern: ".*"
        minToolCalls: 1
        maxToolCalls: 20
```

evals/claude-code/eval.yaml

Lines changed: 21 additions & 0 deletions

```yaml
kind: Eval
metadata:
  name: "kubernetes-basic-operations"
config:
  agent:
    type: "file"
    path: agent.yaml
  mcpConfigFile: ../mcp-config.yaml
  llmJudge:
    env:
      baseUrlKey: JUDGE_BASE_URL
      apiKeyKey: JUDGE_API_KEY
      modelNameKey: JUDGE_MODEL_NAME
  taskSets:
    - glob: ../tasks/*/*.yaml
  assertions:
    toolsUsed:
      - server: kubernetes
        toolPattern: ".*"
        minToolCalls: 1
        maxToolCalls: 20
```
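
To run this file-based variant locally, the judge credentials referenced by the `llmJudge.env` keys above need to be exported first; a minimal sketch with placeholder values:

```bash
# LLM judge credentials, read via the env keys declared in eval.yaml
export JUDGE_BASE_URL='https://your-judge-endpoint.com/v1'
export JUDGE_API_KEY='your-judge-api-key'
export JUDGE_MODEL_NAME='your-judge-model'

# Run the evaluation against the file-based agent definition
./gevals eval evals/claude-code/eval.yaml
```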

evals/mcp-config.yaml

Lines changed: 5 additions & 0 deletions

```yaml
mcpServers:
  kubernetes:
    type: http
    url: http://localhost:8080/mcp
    enableAllTools: true
```
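
Before an evaluation run it can be worth confirming that the server this config points at is reachable. A minimal check against the /health endpoint that build/gevals.mk already polls (the /mcp path is the MCP endpoint itself, so it is not probed directly here):

```bash
# Expect an HTTP 200 once the Kubernetes MCP server is up
curl -s -o /dev/null -w '%{http_code}\n' http://localhost:8080/health
```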

evals/openai-agent/agent.yaml

Lines changed: 9 additions & 0 deletions

```yaml
kind: Agent
metadata:
  name: "openai-agent"
builtin:
  type: "openai-agent"
  model: "gpt-4"  # Change to your model
  # Before running, set environment variables:
  # export MODEL_BASE_URL="https://api.openai.com/v1"
  # export MODEL_KEY="sk-..."
```
evals/openai-agent/eval-inline.yaml

Lines changed: 25 additions & 0 deletions

```yaml
kind: Eval
metadata:
  name: "openai-agent-kubernetes-test"
config:
  # Inline agent configuration - no separate agent.yaml file needed
  agent:
    type: "builtin.openai-agent"
    model: "gpt-4"  # Change to your model
    # Before running, set environment variables:
    # export MODEL_BASE_URL="https://api.openai.com/v1"
    # export MODEL_KEY="sk-..."
  mcpConfigFile: ../mcp-config.yaml
  llmJudge:
    env:
      baseUrlKey: JUDGE_BASE_URL
      apiKeyKey: JUDGE_API_KEY
      modelNameKey: JUDGE_MODEL_NAME
  taskSets:
    - glob: ../tasks/*/*.yaml
  assertions:
    toolsUsed:
      - server: kubernetes
        toolPattern: ".*"
        minToolCalls: 1
        maxToolCalls: 20
```
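
A rough end-to-end invocation of this inline variant, mirroring the environment wiring used by the CI workflow; every endpoint, key, and model value below is a placeholder, and the eval file path follows the layout listed in the README:

```bash
# Model served behind an OpenAI-compatible API
export MODEL_BASE_URL='https://api.openai.com/v1'
export MODEL_KEY='sk-...'
export MODEL_NAME='gpt-4'

# LLM judge credentials, read via the env keys declared above
export JUDGE_BASE_URL='https://your-judge-endpoint.com/v1'
export JUDGE_API_KEY='your-judge-api-key'
export JUDGE_MODEL_NAME='your-judge-model'

# Run the inline-agent evaluation
./gevals eval evals/openai-agent/eval-inline.yaml
```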
