Cali0707
diff --git a/‎.github/workflows/gevals.yaml‎
Lines changed: 230 additions & 0 deletions b/‎.github/workflows/gevals.yaml‎
Lines changed: 230 additions & 0 deletions
diff --git a/‎evals/README.md‎
Lines changed: 110 additions & 0 deletions b/‎evals/README.md‎
Lines changed: 110 additions & 0 deletions
diff --git a/‎evals/claude-code/agent.yaml‎
Lines changed: 5 additions & 0 deletions b/‎evals/claude-code/agent.yaml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎evals/claude-code/eval-inline.yaml‎
Lines changed: 21 additions & 0 deletions b/‎evals/claude-code/eval-inline.yaml‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎evals/claude-code/eval.yaml‎
Lines changed: 21 additions & 0 deletions b/‎evals/claude-code/eval.yaml‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎evals/mcp-config.yaml‎
Lines changed: 5 additions & 0 deletions b/‎evals/mcp-config.yaml‎
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,230 @@
+name: Gevals MCP Evaluation
+
+# Three ways to start a run: a weekly cron, a comment on a pull request, or a
+# manual dispatch.
+on:
+  # Cron fields: minute 0, hour 9, any day of month, any month, weekday 1
+  # (Monday). GitHub evaluates schedules in UTC.
+  schedule:
+    - cron: "0 9 * * 1"
+
+  # Fires for every newly created issue/PR comment; the check-trigger job
+  # decides whether a given comment actually requests an evaluation.
+  issue_comment:
+    types:
+      - created
+
+  # Manual runs for testing; both inputs are optional.
+  workflow_dispatch:
+    inputs:
+      task-filter:
+        description: "Regular expression to filter tasks (optional)"
+        required: false
+        default: ""
+      verbose:
+        description: "Enable verbose output"
+        type: boolean
+        required: false
+        default: false
+
+concurrency:
+  # Serialize runs per pull request (comment-triggered) or per ref (schedule /
+  # manual dispatch). `github.ref` alone is not enough: every issue_comment
+  # event reports the default branch as its ref, so a comment on any issue or
+  # PR would share a group with an evaluation that is still running.
+  group: ${{ github.workflow }}-${{ github.event.issue.number || github.ref }}
+  # Comment-triggered runs never cancel a run in progress: the workflow starts
+  # for every comment, including ones without `/run-gevals`, and such a no-op
+  # run must not abort an evaluation. It waits in the queue instead.
+  cancel-in-progress: ${{ github.event_name != 'issue_comment' }}
+
+# Explicit token scopes: read access for checkout, write access to issues and
+# pull requests so the results comment can be posted even when the
+# repository's default GITHUB_TOKEN permissions are read-only.
+permissions:
+  contents: read
+  issues: write
+  pull-requests: write
+
+env:
+  # Quoted so YAML keeps the version as a string; an unquoted value is parsed
+  # as a float, which would silently turn a version such as 1.20 into 1.2.
+  GO_VERSION: '1.25'
+  KIND_CLUSTER_NAME: mcp-eval-cluster
+
+# Every `run:` step in this workflow uses bash.
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  # Gate job: decides whether the evaluation should run for this event and
+  # which ref the evaluation job should check out.
+  check-trigger:
+    name: Check if evaluation should run
+    runs-on: ubuntu-latest
+    # Scheduled and manual runs always pass this filter; comment events pass
+    # only when the comment is on a pull request and contains `/run-gevals`.
+    if: |
+      github.event_name == 'schedule' ||
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'issue_comment' &&
+       github.event.issue.pull_request &&
+       contains(github.event.comment.body, '/run-gevals'))
+    outputs:
+      should-run: ${{ steps.check.outputs.should-run }}
+      pr-number: ${{ steps.check.outputs.pr-number }}
+      pr-ref: ${{ steps.check.outputs.pr-ref }}
+    steps:
+      - name: Check trigger conditions
+        id: check
+        # Event data and the token reach the script as environment variables
+        # instead of `${{ }}` interpolation, so nothing from the event payload
+        # is spliced into the shell source. GITHUB_EVENT_NAME,
+        # GITHUB_REPOSITORY and GITHUB_REF are provided by the runner.
+        env:
+          API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          COMMENTER: ${{ github.event.comment.user.login }}
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+        run: |
+          if [[ "$GITHUB_EVENT_NAME" == "issue_comment" ]]; then
+            # Check if commenter is a maintainer (has write access).
+            # Any failure of the lookup (network error, non-2xx response,
+            # missing field) degrades to "none", i.e. the run is refused.
+            PERMISSION=$(curl -sf -H "Authorization: token $API_TOKEN" \
+              "https://api.github.com/repos/$GITHUB_REPOSITORY/collaborators/$COMMENTER/permission" \
+              | jq -r '.permission // "none"') || PERMISSION="none"
+
+            if [[ "$PERMISSION" == "admin" || "$PERMISSION" == "write" ]]; then
+              {
+                echo "should-run=true"
+                echo "pr-number=$ISSUE_NUMBER"
+                echo "pr-ref=refs/pull/$ISSUE_NUMBER/head"
+              } >> "$GITHUB_OUTPUT"
+            else
+              echo "should-run=false" >> "$GITHUB_OUTPUT"
+              echo "User $COMMENTER does not have permission to trigger evaluations"
+            fi
+          else
+            # Scheduled and manual runs evaluate the ref that triggered them.
+            {
+              echo "should-run=true"
+              echo "pr-ref=$GITHUB_REF"
+            } >> "$GITHUB_OUTPUT"
+          fi
+
+  # Evaluation job: creates a Kind cluster, builds and starts the MCP server,
+  # runs gevals against it, and posts the results on the pull request.
+  run-evaluation:
+    name: Run MCP Evaluation
+    needs: check-trigger
+    # Runs only when the gate job approved this event.
+    if: needs.check-trigger.outputs.should-run == 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # PR head for comment-triggered runs, otherwise the triggering ref.
+          ref: ${{ needs.check-trigger.outputs.pr-ref }}
+          # Later steps build and execute the checked-out code, which for
+          # comment-triggered runs comes from a pull request. Do not leave
+          # the job token in the local git config where that code can read it.
+          persist-credentials: false
+
+      - name: Setup Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ env.GO_VERSION }}
+
+      - name: Setup Kind cluster
+        run: |
+          # Install Kind if not already available
+          if ! command -v kind &> /dev/null; then
+            # -f makes curl fail on an HTTP error instead of saving the error
+            # page as the "binary"; -sS stays quiet but still prints errors.
+            curl -fsSLo ./kind https://kind.sigs.k8s.io/dl/latest/kind-linux-amd64
+            chmod +x ./kind
+            sudo mv ./kind /usr/local/bin/kind
+          fi
+
+          # Create Kind cluster (automatically updates ~/.kube/config).
+          # KIND_CLUSTER_NAME comes from the workflow-level env block.
+          kind create cluster --name "$KIND_CLUSTER_NAME" --wait 5m
+
+          # Verify cluster is ready
+          kubectl cluster-info
+          kubectl get nodes
+
+      - name: Build MCP server
+        run: make build
+
+      - name: Start MCP server
+        run: |
+          # Start MCP server in background (uses default kubeconfig at ~/.kube/config)
+          ./kubernetes-mcp-server --port 8080 &
+          MCP_PID=$!
+          # Exported to later steps so the Cleanup step can stop the server
+          echo "MCP_PID=$MCP_PID" >> "$GITHUB_ENV"
+
+          # Poll until the server answers HTTP: up to 30 attempts, 2 s apart
+          echo "Waiting for MCP server to start..."
+          for i in {1..30}; do
+            # NOTE(review): without curl's -f flag any HTTP response counts
+            # as ready, including an error status. Confirm the server's
+            # health endpoint before tightening this check.
+            if curl -s http://localhost:8080/health > /dev/null 2>&1; then
+              echo "MCP server is ready"
+              exit 0
+            fi
+
+            # Fail fast instead of waiting out the full minute when the
+            # server process has already exited.
+            if ! kill -0 "$MCP_PID" 2>/dev/null; then
+              echo "ERROR: MCP server process exited before becoming ready"
+              exit 1
+            fi
+
+            echo "  Attempt $i/30..."
+            sleep 2
+          done
+
+          echo "ERROR: MCP server failed to start within 60 seconds"
+          exit 1
+
+      # Runs the evaluation through the gevals composite action.
+      # NOTE(review): `@main` follows the tip of the action's main branch, so
+      # the action code can change between runs; consider pinning a tag or SHA.
+      - name: Run gevals evaluation
+        uses: genmcp/gevals/.github/actions/gevals-action@main
+        with:
+          # NOTE(review): this path is not among the files visible in this
+          # change (only evals/claude-code/* and evals/mcp-config.yaml are) -
+          # confirm that evals/openai-agent/eval.yaml exists in the repository.
+          eval-config: 'evals/openai-agent/eval.yaml'
+          gevals-version: 'latest'
+          # Only set for workflow_dispatch runs; empty for other triggers.
+          task-filter: ${{ github.event.inputs.task-filter || '' }}
+          output-format: 'json'
+          # Only set for workflow_dispatch runs; 'false' for other triggers.
+          verbose: ${{ github.event.inputs.verbose || 'false' }}
+          upload-artifacts: 'true'
+          artifact-name: 'gevals-results'
+          # NOTE(review): presumably keeps this step green on failed tasks so
+          # the results comment step still reports them - confirm against the
+          # action's documentation.
+          fail-on-error: 'false'
+          task-pass-threshold: '0.8'
+          assertion-pass-threshold: '0.8'
+          working-directory: '.'
+        env:
+          # OpenAI Agent configuration
+          MODEL_BASE_URL: ${{ secrets.MODEL_BASE_URL }}
+          MODEL_KEY: ${{ secrets.MODEL_KEY }}
+          MODEL_NAME: ${{ secrets.MODEL_NAME }}
+          # LLM Judge configuration (the variable names match the llmJudge.env
+          # keys used by the eval configs)
+          JUDGE_BASE_URL: ${{ secrets.JUDGE_BASE_URL }}
+          JUDGE_API_KEY: ${{ secrets.JUDGE_API_KEY }}
+          JUDGE_MODEL_NAME: ${{ secrets.JUDGE_MODEL_NAME }}
+
+      - name: Cleanup
+        if: always()
+        run: |
+          # Runs on success and failure alike. Each teardown command is
+          # best-effort (`|| true`) so cleanup itself never fails the job.
+
+          # MCP_PID is only present when the "Start MCP server" step got far
+          # enough to export it.
+          if [[ -n "${MCP_PID:-}" ]]; then
+            echo "Stopping MCP server (PID: $MCP_PID)"
+            kill "$MCP_PID" 2>/dev/null || true
+          fi
+
+          # Remove the Kind cluster created for this run
+          echo "Deleting Kind cluster"
+          kind delete cluster --name ${{ env.KIND_CLUSTER_NAME }} 2>/dev/null || true
+
+      - name: Post results comment on PR
+        # Comment-triggered runs only; always() sends the report even when an
+        # earlier step failed.
+        if: github.event_name == 'issue_comment' && always()
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+
+            // Posts a comment on the PR whose comment triggered this run.
+            // The PR number is read from the event payload (context.issue)
+            // rather than from a workflow expression spliced into the script.
+            const postComment = (body) =>
+              github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body
+              });
+
+            // Find the results file in the working directory
+            const resultsPattern = /gevals-.*-out\.json/;
+            const resultsFile = fs.readdirSync('.').find(f => resultsPattern.test(f));
+
+            if (!resultsFile) {
+              await postComment('❌ Gevals evaluation completed but no results file was found.');
+              return;
+            }
+
+            // Read and parse results. A truncated or malformed file is
+            // reported on the PR instead of crashing the step without a comment.
+            let results;
+            try {
+              results = JSON.parse(fs.readFileSync(resultsFile, 'utf8'));
+            } catch (err) {
+              await postComment('❌ Gevals evaluation completed but the results file ' + resultsFile + ' could not be parsed: ' + err.message);
+              return;
+            }
+            if (!Array.isArray(results)) {
+              await postComment('❌ Gevals evaluation completed but the results file ' + resultsFile + ' does not contain a list of task results.');
+              return;
+            }
+
+            // A task counts as passed only when both the task itself and all
+            // of its assertions passed.
+            const isPassed = r => r.taskPassed && r.allAssertionsPassed;
+            const passed = results.filter(isPassed);
+            const failed = results.filter(r => !isPassed(r));
+
+            // Calculate summary stats
+            const totalTasks = results.length;
+            const passedTasks = passed.length;
+            const passRate = totalTasks > 0 ? ((passedTasks / totalTasks) * 100).toFixed(1) : 0;
+
+            // Build comment body
+            let comment = '## 🤖 Gevals MCP Evaluation Results\n\n';
+            comment += `**Summary:** ${passedTasks}/${totalTasks} tasks passed (${passRate}%)\n\n`;
+
+            if (failed.length > 0) {
+              comment += '### ❌ Failed Tasks\n\n';
+              failed.forEach(task => {
+                comment += `- **${task.taskName}**\n`;
+                comment += `  - Task Passed: ${task.taskPassed ? '✅' : '❌'}\n`;
+                comment += `  - Assertions Passed: ${task.allAssertionsPassed ? '✅' : '❌'}\n`;
+              });
+              comment += '\n';
+            }
+
+            if (passed.length > 0) {
+              comment += '### ✅ Passed Tasks\n\n';
+              passed.forEach(task => {
+                comment += `- ${task.taskName}\n`;
+              });
+              comment += '\n';
+            }
+
+            // Link back to this workflow run for the full logs and artifacts
+            comment += `[View full results](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId})`;
+
+            await postComment(comment);
@@ -0,0 +1,110 @@
+# Kubernetes MCP Server Test Examples
+
+This directory contains examples for testing the **same Kubernetes MCP server** using different AI agents.
+
+## Structure
+
+```
+evals/
+├── README.md                    # This file
+├── mcp-config.yaml              # Shared MCP server configuration
+├── tasks/                       # Shared test tasks
+│   ├── create-pod.yaml
+│   ├── setup.sh
+│   ├── verify.sh
+│   └── cleanup.sh
+├── claude-code/                 # Claude Code agent configuration
+│   ├── agent.yaml
+│   ├── eval.yaml
+│   └── eval-inline.yaml
+└── openai-agent/                # OpenAI-compatible agent configuration
+    ├── agent.yaml
+    ├── eval.yaml
+    └── eval-inline.yaml
+```
+
+## What This Tests
+
+Both examples test the **same Kubernetes MCP server** using **shared task definitions**:
+- Creates an nginx pod named `web-server` in the `create-pod-test` namespace
+- Verifies the pod is running
+- Validates that the agent called appropriate Kubernetes tools
+- Cleans up resources
+
+The tasks and MCP configuration are shared - only the agent configuration differs.
+
+## Prerequisites
+
+- Kubernetes cluster (kind, minikube, or any cluster)
+- kubectl configured
+- Kubernetes MCP server running at `http://localhost:8080/mcp`
+- Built binaries: `gevals` and `agent`
+
+## Running Examples
+
+### Option 1: Claude Code
+
+```bash
+./gevals eval evals/claude-code/eval.yaml
+```
+
+**Requirements:**
+- Claude Code installed and in PATH
+
+**Tool Usage:**
+- Claude typically uses pod-specific tools like `pods_run`, `pods_create`
+
+---
+
+### Option 2: OpenAI-Compatible Agent (Built-in)
+
+```bash
+# Set your model credentials
+export MODEL_BASE_URL='https://your-api-endpoint.com/v1'
+export MODEL_KEY='your-api-key'
+export MODEL_NAME='your-model-name'
+
+# Run the test
+./gevals eval evals/openai-agent/eval.yaml
+```
+
+**Note:** Different AI models may choose different tools from the MCP server (`pods_*` or `resources_*`) to accomplish the same task. Both approaches work correctly.
+
+## Assertions
+
+Both examples use flexible assertions that accept either tool approach:
+
+```yaml
+toolPattern: "(pods_.*|resources_.*)"  # Accepts both pod-specific and generic resource tools
+```
+
+This makes the tests robust across different AI models that may prefer different tools.
+
+## Key Difference: Agent Configuration
+
+### Claude Code (claude-code/agent.yaml)
+```yaml
+commands:
+  argTemplateMcpServer: "--mcp-config {{ .File }}"
+  argTemplateAllowedTools: "mcp__{{ .ServerName }}__{{ .ToolName }}"
+  runPrompt: |-
+    claude {{ .McpServerFileArgs }} --print "{{ .Prompt }}"
+```
+
+### OpenAI Agent (openai-agent/agent.yaml)
+```yaml
+builtin:
+  type: "openai-agent"
+  model: "gpt-4"
+```
+
+Uses the built-in OpenAI agent with model configuration.
+
+## Expected Results
+
+Both examples should produce:
+- ✅ Task passed - pod created successfully
+- ✅ Assertions passed - appropriate tools were called
+- ✅ Verification passed - pod exists and is running
+
+Results saved to: `gevals-<eval-name>-out.json`
@@ -0,0 +1,5 @@
+# Agent definition named "claude-code" that uses the builtin agent type of
+# the same name.
+kind: Agent
+metadata:
+  name: 'claude-code'
+builtin:
+  type: 'claude-code'
@@ -0,0 +1,21 @@
+kind: Eval
+metadata:
+  name: 'kubernetes-basic-operations'
+config:
+  # The agent is declared inline, so this eval does not reference a separate
+  # agent.yaml file.
+  agent:
+    type: 'builtin.claude-code'
+  mcpConfigFile: '../mcp-config.yaml'
+  # Environment variable names for the judge's base URL, API key and model
+  # name.
+  llmJudge:
+    env:
+      baseUrlKey: 'JUDGE_BASE_URL'
+      apiKeyKey: 'JUDGE_API_KEY'
+      modelNameKey: 'JUDGE_MODEL_NAME'
+  taskSets:
+    # Task files one directory level below ../tasks
+    - glob: '../tasks/*/*.yaml'
+      assertions:
+        # Any tool of the "kubernetes" server matches this pattern.
+        toolsUsed:
+          - server: 'kubernetes'
+            toolPattern: '.*'
+        minToolCalls: 1
+        maxToolCalls: 20
@@ -0,0 +1,21 @@
+kind: Eval
+metadata:
+  name: 'kubernetes-basic-operations'
+config:
+  # The agent definition is loaded from the agent.yaml file given by `path`.
+  agent:
+    type: 'file'
+    path: 'agent.yaml'
+  mcpConfigFile: '../mcp-config.yaml'
+  # Environment variable names for the judge's base URL, API key and model
+  # name.
+  llmJudge:
+    env:
+      baseUrlKey: 'JUDGE_BASE_URL'
+      apiKeyKey: 'JUDGE_API_KEY'
+      modelNameKey: 'JUDGE_MODEL_NAME'
+  taskSets:
+    # Task files one directory level below ../tasks
+    - glob: '../tasks/*/*.yaml'
+      assertions:
+        # Any tool of the "kubernetes" server matches this pattern.
+        toolsUsed:
+          - server: 'kubernetes'
+            toolPattern: '.*'
+        minToolCalls: 1
+        maxToolCalls: 20
@@ -0,0 +1,5 @@
+# MCP server definitions. The single entry, "kubernetes", is an HTTP server
+# at the local address below, with all of its tools enabled.
+mcpServers:
+  kubernetes:
+    type: 'http'
+    url: 'http://localhost:8080/mcp'
+    enableAllTools: true