Commit 130e42c

chore(ci): add gevals action to github workflows (containers#505)

Signed-off-by: Calum Murray <[email protected]>

1 parent 10918df

File tree: 120 files changed (+2563 / -0 lines changed)

Large commits have some content hidden by default, so only a subset of the changed files is shown below.

.github/workflows/gevals.yaml

Lines changed: 155 additions & 0 deletions

```yaml
name: Gevals MCP Evaluation

on:
  # Weekly schedule - runs every Monday at 9 AM UTC
  schedule:
    - cron: '0 9 * * 1'

  # Manual trigger via PR comments
  issue_comment:
    types: [created]

  # Allow manual workflow dispatch for testing
  workflow_dispatch:
    inputs:
      task-filter:
        description: 'Regular expression to filter tasks (optional)'
        required: false
        default: ''
      verbose:
        description: 'Enable verbose output'
        required: false
        type: boolean
        default: false

permissions:
  contents: read
  pull-requests: write
  issues: write

concurrency:
  # Only run once for latest commit per ref and cancel other (previous) runs.
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  GO_VERSION: 1.25
  KIND_CLUSTER_NAME: mcp-eval-cluster

defaults:
  run:
    shell: bash

jobs:
  # Check if workflow should run based on trigger
  check-trigger:
    name: Check if evaluation should run
    runs-on: ubuntu-latest
    if: |
      github.event_name == 'schedule' ||
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'issue_comment' &&
      github.event.issue.pull_request &&
      contains(github.event.comment.body, '/run-gevals'))
    outputs:
      should-run: ${{ steps.check.outputs.should-run }}
      pr-number: ${{ steps.check.outputs.pr-number }}
      pr-ref: ${{ steps.check.outputs.pr-ref }}
    steps:
      - name: Check trigger conditions
        id: check
        run: |
          if [[ "${{ github.event_name }}" == "issue_comment" ]]; then
            # Check if commenter is a maintainer (has write access)
            PERMISSION=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
              "https://api.github.com/repos/${{ github.repository }}/collaborators/${{ github.event.comment.user.login }}/permission" \
              | jq -r '.permission')

            if [[ "$PERMISSION" == "admin" || "$PERMISSION" == "write" ]]; then
              echo "should-run=true" >> $GITHUB_OUTPUT
              echo "pr-number=${{ github.event.issue.number }}" >> $GITHUB_OUTPUT
              echo "pr-ref=refs/pull/${{ github.event.issue.number }}/head" >> $GITHUB_OUTPUT
            else
              echo "should-run=false" >> $GITHUB_OUTPUT
              echo "User ${{ github.event.comment.user.login }} does not have permission to trigger evaluations"
            fi
          else
            echo "should-run=true" >> $GITHUB_OUTPUT
            echo "pr-ref=${{ github.ref }}" >> $GITHUB_OUTPUT
          fi

  # Run gevals evaluation with Kind cluster
  run-evaluation:
    name: Run MCP Evaluation
    needs: check-trigger
    if: needs.check-trigger.outputs.should-run == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          ref: ${{ needs.check-trigger.outputs.pr-ref }}

      - name: Setup Go
        uses: actions/setup-go@v5
        with:
          go-version: ${{ env.GO_VERSION }}

      - name: Setup Kind cluster
        run: make kind-create-cluster KIND_CLUSTER_NAME=${{ env.KIND_CLUSTER_NAME }}

      - name: Start MCP server
        run: make run-server

      - name: Run gevals evaluation
        id: gevals
        uses: genmcp/gevals/.github/actions/gevals-action@main
        with:
          eval-config: 'evals/openai-agent/eval.yaml'
          gevals-version: 'latest'
          task-filter: ${{ github.event.inputs.task-filter || '' }}
          output-format: 'json'
          verbose: ${{ github.event.inputs.verbose || 'false' }}
          upload-artifacts: 'true'
          artifact-name: 'gevals-results'
          fail-on-error: 'false'
          task-pass-threshold: '0.8'
          assertion-pass-threshold: '0.8'
          working-directory: '.'
        env:
          # OpenAI Agent configuration
          MODEL_BASE_URL: ${{ secrets.MODEL_BASE_URL }}
          MODEL_KEY: ${{ secrets.MODEL_KEY }}
          MODEL_NAME: ${{ secrets.MODEL_NAME }}
          # LLM Judge configuration
          JUDGE_BASE_URL: ${{ secrets.JUDGE_BASE_URL }}
          JUDGE_API_KEY: ${{ secrets.JUDGE_API_KEY }}
          JUDGE_MODEL_NAME: ${{ secrets.JUDGE_MODEL_NAME }}

      - name: Cleanup
        if: always()
        run: |
          make stop-server || true
          make kind-delete-cluster KIND_CLUSTER_NAME=${{ env.KIND_CLUSTER_NAME }} || true

      - name: Post results comment on PR
        if: github.event_name == 'issue_comment' && always()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          PASS_RATE=$(awk "BEGIN {printf \"%.1f\", ${{ steps.gevals.outputs.task-pass-rate }} * 100}")

          gh pr comment ${{ needs.check-trigger.outputs.pr-number }} --body "$(cat <<EOF
          ## Gevals MCP Evaluation Results

          **Summary:** ${{ steps.gevals.outputs.tasks-passed }}/${{ steps.gevals.outputs.tasks-total }} tasks passed (${PASS_RATE}%)

          | Metric | Result |
          |--------|--------|
          | Tasks Passed | ${{ steps.gevals.outputs.tasks-passed }}/${{ steps.gevals.outputs.tasks-total }} |
          | Assertions Passed | ${{ steps.gevals.outputs.assertions-passed }}/${{ steps.gevals.outputs.assertions-total }} |
          | Overall | ${{ steps.gevals.outputs.passed == 'true' && 'Passed' || 'Failed' }} |

          [View full results](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})
          EOF
          )"
```

build/gevals.mk

Lines changed: 35 additions & 0 deletions

```makefile
# Gevals evaluation support

MCP_PORT ?= 8080
MCP_HEALTH_TIMEOUT ?= 60
MCP_HEALTH_INTERVAL ?= 2

.PHONY: run-server
run-server: build ## Start MCP server in background and wait for health check
	@echo "Starting MCP server on port $(MCP_PORT)..."
	@./$(BINARY_NAME) --port $(MCP_PORT) & echo $$! > .mcp-server.pid
	@echo "MCP server started with PID $$(cat .mcp-server.pid)"
	@echo "Waiting for MCP server to be ready..."
	@elapsed=0; \
	while [ $$elapsed -lt $(MCP_HEALTH_TIMEOUT) ]; do \
		if curl -s http://localhost:$(MCP_PORT)/health > /dev/null 2>&1; then \
			echo "MCP server is ready"; \
			exit 0; \
		fi; \
		echo "  Waiting... ($$elapsed/$(MCP_HEALTH_TIMEOUT)s)"; \
		sleep $(MCP_HEALTH_INTERVAL); \
		elapsed=$$((elapsed + $(MCP_HEALTH_INTERVAL))); \
	done; \
	echo "ERROR: MCP server failed to start within $(MCP_HEALTH_TIMEOUT) seconds"; \
	exit 1

.PHONY: stop-server
stop-server: ## Stop the MCP server started by run-server
	@if [ -f .mcp-server.pid ]; then \
		PID=$$(cat .mcp-server.pid); \
		echo "Stopping MCP server (PID: $$PID)"; \
		kill $$PID 2>/dev/null || true; \
		rm -f .mcp-server.pid; \
	else \
		echo "No .mcp-server.pid file found"; \
	fi
```
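
As a rough local usage sketch, assuming the repository's top-level Makefile includes build/gevals.mk and provides the `build` target and `BINARY_NAME` variable it relies on:

```bash
# Build and start the MCP server in the background, then wait for /health (up to 60s)
make run-server MCP_PORT=8080

# The server PID is tracked in .mcp-server.pid; a manual health probe looks like this
curl -s http://localhost:8080/health

# Stop the background server and remove the PID file
make stop-server
```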

evals/README.md

Lines changed: 110 additions & 0 deletions

# Kubernetes MCP Server Test Examples

This directory contains examples for testing the **same Kubernetes MCP server** using different AI agents.

## Structure

```
evals/
├── README.md              # This file
├── mcp-config.yaml        # Shared MCP server configuration
├── tasks/                 # Shared test tasks
│   ├── create-pod.yaml
│   ├── setup.sh
│   ├── verify.sh
│   └── cleanup.sh
├── claude-code/           # Claude Code agent configuration
│   ├── agent.yaml
│   ├── eval.yaml
│   └── eval-inline.yaml
└── openai-agent/          # OpenAI-compatible agent configuration
    ├── agent.yaml
    ├── eval.yaml
    └── eval-inline.yaml
```

## What This Tests

Both examples test the **same Kubernetes MCP server** using **shared task definitions**:

- Creates an nginx pod named `web-server` in the `create-pod-test` namespace
- Verifies the pod is running
- Validates that the agent called appropriate Kubernetes tools
- Cleans up resources

The tasks and MCP configuration are shared; only the agent configuration differs.

## Prerequisites

- Kubernetes cluster (kind, minikube, or any other cluster)
- kubectl configured
- Kubernetes MCP server running at `http://localhost:8080/mcp`
- Built binaries: `gevals` and `agent`

## Running Examples

### Option 1: Claude Code

```bash
./gevals eval evals/claude-code/eval.yaml
```

**Requirements:**
- Claude Code installed and in PATH

**Tool Usage:**
- Claude typically uses pod-specific tools such as `pods_run` and `pods_create`

---

### Option 2: OpenAI-Compatible Agent (Built-in)

```bash
# Set your model credentials
export MODEL_BASE_URL='https://your-api-endpoint.com/v1'
export MODEL_KEY='your-api-key'
export MODEL_NAME='your-model-name'

# Run the test
./gevals eval evals/openai-agent/eval.yaml
```

**Note:** Different AI models may choose different tools from the MCP server (`pods_*` or `resources_*`) to accomplish the same task. Both approaches work correctly.

## Assertions

Both examples use flexible assertions that accept either tool approach:

```yaml
toolPattern: "(pods_.*|resources_.*)"  # Accepts both pod-specific and generic resource tools
```

This makes the tests robust across different AI models that may prefer different tools.

## Key Difference: Agent Configuration

### Claude Code (claude-code/agent.yaml)

```yaml
commands:
  argTemplateMcpServer: "--mcp-config {{ .File }}"
  argTemplateAllowedTools: "mcp__{{ .ServerName }}__{{ .ToolName }}"
  runPrompt: |-
    claude {{ .McpServerFileArgs }} --print "{{ .Prompt }}"
```

### OpenAI Agent (openai-agent/agent.yaml)

```yaml
builtin:
  type: "openai-agent"
  model: "gpt-4"
```

Uses the built-in OpenAI agent with model configuration.

## Expected Results

Both examples should produce:

- ✅ Task passed - pod created successfully
- ✅ Assertions passed - appropriate tools were called
- ✅ Verification passed - pod exists and is running

Results are saved to: `gevals-<eval-name>-out.json`
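
The shared task files under `tasks/` are not included in this excerpt. As a hedged illustration only, a verification step consistent with the behaviour described above (pod `web-server` running in the `create-pod-test` namespace) might look roughly like this; the script name and exact checks are assumptions, not the committed content:

```bash
#!/usr/bin/env bash
# Hypothetical verify step for the create-pod task
set -euo pipefail

# Wait for the nginx pod created by the agent to become Ready
kubectl wait --for=condition=Ready pod/web-server -n create-pod-test --timeout=60s

# Print the reported pod phase as a final sanity check
kubectl get pod web-server -n create-pod-test -o jsonpath='{.status.phase}'
```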

evals/claude-code/agent.yaml

Lines changed: 5 additions & 0 deletions

```yaml
kind: Agent
metadata:
  name: "claude-code"
builtin:
  type: "claude-code"
```

evals/claude-code/eval-inline.yaml

Lines changed: 21 additions & 0 deletions

```yaml
kind: Eval
metadata:
  name: "kubernetes-basic-operations"
config:
  # Inline agent configuration - no separate agent.yaml file needed
  agent:
    type: "builtin.claude-code"
  mcpConfigFile: ../mcp-config.yaml
  llmJudge:
    env:
      baseUrlKey: JUDGE_BASE_URL
      apiKeyKey: JUDGE_API_KEY
      modelNameKey: JUDGE_MODEL_NAME
  taskSets:
    - glob: ../tasks/*/*.yaml
  assertions:
    toolsUsed:
      - server: kubernetes
        toolPattern: ".*"
        minToolCalls: 1
        maxToolCalls: 20
```

evals/claude-code/eval.yaml

Lines changed: 21 additions & 0 deletions

```yaml
kind: Eval
metadata:
  name: "kubernetes-basic-operations"
config:
  agent:
    type: "file"
    path: agent.yaml
  mcpConfigFile: ../mcp-config.yaml
  llmJudge:
    env:
      baseUrlKey: JUDGE_BASE_URL
      apiKeyKey: JUDGE_API_KEY
      modelNameKey: JUDGE_MODEL_NAME
  taskSets:
    - glob: ../tasks/*/*.yaml
  assertions:
    toolsUsed:
      - server: kubernetes
        toolPattern: ".*"
        minToolCalls: 1
        maxToolCalls: 20
```
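
To run this file-based variant locally, the judge credentials referenced by the `llmJudge.env` keys above need to be exported first; a minimal sketch with placeholder values:

```bash
# LLM judge credentials, read via the env keys declared in eval.yaml
export JUDGE_BASE_URL='https://your-judge-endpoint.com/v1'
export JUDGE_API_KEY='your-judge-api-key'
export JUDGE_MODEL_NAME='your-judge-model'

# Run the evaluation against the file-based agent definition
./gevals eval evals/claude-code/eval.yaml
```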

evals/mcp-config.yaml

Lines changed: 5 additions & 0 deletions

```yaml
mcpServers:
  kubernetes:
    type: http
    url: http://localhost:8080/mcp
    enableAllTools: true
```
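
Before an evaluation run it can be worth confirming that the server this config points at is reachable. A minimal check against the /health endpoint that build/gevals.mk already polls (the /mcp path is the MCP endpoint itself, so it is not probed directly here):

```bash
# Expect an HTTP 200 once the Kubernetes MCP server is up
curl -s -o /dev/null -w '%{http_code}\n' http://localhost:8080/health
```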

evals/openai-agent/agent.yaml

Lines changed: 9 additions & 0 deletions

```yaml
kind: Agent
metadata:
  name: "openai-agent"
builtin:
  type: "openai-agent"
  model: "gpt-4"  # Change to your model
  # Before running, set environment variables:
  # export MODEL_BASE_URL="https://api.openai.com/v1"
  # export MODEL_KEY="sk-..."
```
evals/openai-agent/eval-inline.yaml

Lines changed: 25 additions & 0 deletions

```yaml
kind: Eval
metadata:
  name: "openai-agent-kubernetes-test"
config:
  # Inline agent configuration - no separate agent.yaml file needed
  agent:
    type: "builtin.openai-agent"
    model: "gpt-4"  # Change to your model
    # Before running, set environment variables:
    # export MODEL_BASE_URL="https://api.openai.com/v1"
    # export MODEL_KEY="sk-..."
  mcpConfigFile: ../mcp-config.yaml
  llmJudge:
    env:
      baseUrlKey: JUDGE_BASE_URL
      apiKeyKey: JUDGE_API_KEY
      modelNameKey: JUDGE_MODEL_NAME
  taskSets:
    - glob: ../tasks/*/*.yaml
  assertions:
    toolsUsed:
      - server: kubernetes
        toolPattern: ".*"
        minToolCalls: 1
        maxToolCalls: 20
```
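
A rough end-to-end invocation of this inline variant, mirroring the environment wiring used by the CI workflow; every endpoint, key, and model value below is a placeholder, and the eval file path follows the layout listed in the README:

```bash
# Model served behind an OpenAI-compatible API
export MODEL_BASE_URL='https://api.openai.com/v1'
export MODEL_KEY='sk-...'
export MODEL_NAME='gpt-4'

# LLM judge credentials, read via the env keys declared above
export JUDGE_BASE_URL='https://your-judge-endpoint.com/v1'
export JUDGE_API_KEY='your-judge-api-key'
export JUDGE_MODEL_NAME='your-judge-model'

# Run the inline-agent evaluation
./gevals eval evals/openai-agent/eval-inline.yaml
```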
