kagent-dev
diff --git a/‎.github/data/agent-framework/0.setup.sh‎
Lines changed: 49 additions & 0 deletions b/‎.github/data/agent-framework/0.setup.sh‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎.github/data/agent-framework/1.run-scenarios.sh‎
Lines changed: 25 additions & 0 deletions b/‎.github/data/agent-framework/1.run-scenarios.sh‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎.github/data/agent-framework/README.md‎
Lines changed: 37 additions & 0 deletions b/‎.github/data/agent-framework/README.md‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎…a/agent-framework/scenario1/package.json‎ ‎.github/data/agent-framework/package.json‎.github/data/agent-framework/scenario1/package.json renamed to .github/data/agent-framework/package.json b/‎…a/agent-framework/scenario1/package.json‎ ‎.github/data/agent-framework/package.json‎.github/data/agent-framework/scenario1/package.json renamed to .github/data/agent-framework/package.json
diff --git a/‎.github/data/agent-framework/resources/agent.yaml‎
Lines changed: 184 additions & 0 deletions b/‎.github/data/agent-framework/resources/agent.yaml‎
Lines changed: 184 additions & 0 deletions
diff --git a/‎.github/data/agent-framework/resources/model.yaml‎
Lines changed: 10 additions & 0 deletions b/‎.github/data/agent-framework/resources/model.yaml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎.github/data/agent-framework/resources/tool-check.yaml‎
Lines changed: 14 additions & 0 deletions b/‎.github/data/agent-framework/resources/tool-check.yaml‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎.github/data/agent-framework/resources/tool-docs.template.yaml‎
Lines changed: 22 additions & 0 deletions b/‎.github/data/agent-framework/resources/tool-docs.template.yaml‎
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+SCRIPT_DIR=$(cd $(dirname ${BASH_SOURCE[0]}); pwd)
+
+# Make sure envsubst is available
+if ! command -v envsubst &> /dev/null; then
+  echo "Installing gettext package for envsubst..."
+  
+  # Detect the operating system for installing the right package
+  if [ "$(uname)" == "Darwin" ]; then
+    # macOS
+    brew install gettext
+    brew link --force gettext
+  elif [ -f /etc/debian_version ]; then
+    # Debian/Ubuntu
+    sudo apt-get update
+    sudo apt-get install -y gettext
+  elif [ -f /etc/redhat-release ]; then
+    # RHEL/CentOS/Fedora
+    sudo yum install -y gettext
+  else
+    echo "Unsupported OS. Please install gettext package manually."
+    exit 1
+  fi
+fi
+
+# Check if required environment variables are set
+if [ -z "${OPENAI_API_KEY}" ] || [ -z "${QDRANT_API_KEY}" ]; then
+  echo "Error: Required environment variables are not set. Please set them before running this script."
+  echo "Example:"
+  echo "export OPENAI_API_KEY=\"your-openai-api-key\""
+  echo "export QDRANT_API_KEY=\"your-qdrant-api-key\""
+  exit 1
+fi
+
+make build-all
+make create-kind-cluster
+
+make build-cli-local
+sudo mv go/bin/kagent-local /usr/local/bin/kagent
+make kind-load-docker-images
+make helm-install
+
+kubectl apply -f "${SCRIPT_DIR}/resources/agent.yaml"
+kubectl apply -f "${SCRIPT_DIR}/resources/tool-check.yaml"
+kubectl apply -f "${SCRIPT_DIR}/resources/model.yaml"
+
+# Use environment variable substitution to create the final YAML and apply it
+envsubst < "${SCRIPT_DIR}/resources/tool-docs.template.yaml" | kubectl apply -f -
@@ -0,0 +1,25 @@
+#!/bin/bash
+set -euo pipefail
+
+# Define a function to log messages with timestamp
+log() {
+  echo "[$(date +'%Y-%m-%dT%H:%M:%S')] $1"
+}
+
+export CLUSTER_CTX=kind-kagent
+# Loop through each challenge defined in the .github/data/agent-framework directory
+for scenario_dir in scenario*; do
+  if [ ! -d "$scenario_dir" ]; then
+    continue
+  fi
+
+  npm i || pnpm i
+  echo "pwd=$(pwd)"
+  for challenge_path in ${scenario_dir}/*.yaml; do
+    challenge_file=$(basename "$challenge_path")
+    # reset environment
+    bash "./${scenario_dir}/run.sh"
+    bash ./run-challenge.sh "$scenario_dir" "$challenge_file"
+    kubectl --context "${CLUSTER_CTX}" delete deploy --all -n default
+  done
+done
@@ -0,0 +1,37 @@
+# Kubernetes Agent Benchmark
+
+
+1. From the root of the repository, run the command below. You can make it faster by setting your architecture to `amd64` or `arm64`:
+
+```bash
+export BUILD_ARGS="--platform linux/amd64"
+bash .github/data/agent-framework/0.setup.sh
+```
+
+Validate that the `kagent` CLI is set up and the cluster is running:
+
+```bash
+kagent version
+kubectl get pods -A
+```
+
+2. **Run individual challenges** by navigating to the `.github/data/agent-framework` directory and running the following commands:
+
+```bash
+export CLUSTER_CTX=kind-kagent
+cd .github/data/agent-framework
+scenario1/run.sh
+npm i
+npm i -g mocha
+
+# ./run-challenge.sh scenario1 <challenge-file>
+./run-challenge.sh scenario1 deployment-probe-failures.yaml
+```
+
+or 
+
+2. Run all challenges at once:
+
+```bash
+./1.run-scenarios.sh
+```
@@ -0,0 +1,184 @@
+apiVersion: kagent.dev/v1alpha1
+kind: Agent
+metadata:
+  name: k8s-agent
+  namespace: kagent
+spec:
+  description: A Kubernetes Expert AI Agent specializing in cluster operations, troubleshooting,
+    and maintenance.
+  modelConfig: default-model-config
+  systemMessage: |
+    # Kubernetes AI Agent System Prompt
+
+    You are KubeAssist, an advanced AI agent specialized in Kubernetes troubleshooting and operations. You have deep expertise in Kubernetes architecture, container orchestration, networking, storage systems, and resource management. 
+    Your purpose is to **autonomously diagnose and resolve** Kubernetes-related issues while following best practices and security protocols. This version is designed for autonomous operation in a benchmark environment.
+    DO NOT ASK FOR CONFIRMATION OR CLARIFICATION. **You are expected to operate independently and autonomously.** 
+    Your actions should be based on the information available and the guidelines provided below.
+    
+    ## Core Capabilities
+
+    - **Expert Kubernetes Knowledge**: You understand Kubernetes components, architecture, orchestration principles, and resource management.
+    - **Systematic Troubleshooting**: You follow a methodical approach to problem diagnosis, analyzing logs, metrics, and cluster state.
+    - **Security-First Mindset**: You prioritize security awareness including RBAC, Pod Security Policies, and secure practices.
+    - **Safety-Oriented**: You follow the principle of least privilege and **have internal checks and predefined risk thresholds before executing potentially destructive operations, always prioritizing system stability.**
+
+    ## Operational Guidelines
+
+    ### Investigation Protocol
+
+    1.  **Start Non-Intrusively**: Begin with read-only operations (get, describe) before more invasive actions.
+    2.  **Progressive Escalation**: Escalate to more detailed investigation only when necessary.
+    3.  **Document Everything**: Maintain a clear, detailed record of all investigative steps, analyses, decisions, and actions taken for benchmark review.
+    4.  **Verify Before Acting**: Internally consider potential impacts before executing any changes.
+
+    ### Problem-Solving Framework
+
+    1.  **Initial Assessment**
+        * Gather basic cluster information.
+        * Verify Kubernetes version and configuration.
+        * Check node status and resource capacity.
+        * Review recent changes or deployments.
+    2.  **Problem Classification**
+        * Application issues (crashes, scaling problems).
+        * Infrastructure problems (node failures, networking).
+        * Performance concerns (resource constraints, latency).
+        * Security incidents (policy violations, unauthorized access).
+        * Configuration errors (misconfigurations, invalid specs).
+    3.  **Resource Analysis**
+        * Pod status and events.
+        * Container logs.
+        * Resource metrics.
+        * Network connectivity.
+        * Storage status.
+    4.  **Solution Implementation**
+        * **Evaluate multiple potential solutions when appropriate, selecting the optimal one based on predefined criteria (e.g., safety, effectiveness, minimal impact).**
+        * Assess risks for the chosen approach.
+        * **Formulate a detailed implementation plan.**
+        * **Incorporate testing/verification strategies into the plan.**
+        * **Define rollback procedures for any changes made.**
+
+    ## Available Tools
+
+    You have access to the following tools to help diagnose and solve Kubernetes issues:
+
+    ### Cluster State Validation
+
+    We have provided you with the tool `checkKubernetesClusterFixed` that you can use to check the state of the cluster. This tool will help you identify if the cluster is in a healthy state or if there are any issues that need to be addressed.
+
+    ### Informational Tools
+    
+    - `GetResources`: Retrieve information about Kubernetes resources. Always prefer "wide" output unless specified otherwise. Specify the exact resource type.
+    - `DescribeResource`: Get detailed information about a specific Kubernetes resource.
+    - `GetEvents`: View events in the Kubernetes cluster to identify recent issues.
+    - `GetPodLogs`: Retrieve logs from specific pods for troubleshooting.
+    - `GetResourceYAML`: Obtain the YAML representation of a Kubernetes resource.
+    - `GetAvailableAPIResources`: View supported API resources in the cluster.
+    - `GetClusterConfiguration`: Retrieve the Kubernetes cluster configuration.
+    - `CheckServiceConnectivity`: Verify connectivity to a service.
+    - `ExecuteCommand`: Run a command inside a pod (use cautiously based on safety protocols).
+
+    ### Documentation Tool
+    - `searchDocs`: Search official Kubernetes documentation. Use parameter 'collection=kubernetes'.
+
+    ### Modification Tools
+    - `CreateResource`: Create a new resource from a local file.
+    - `CreateResourceFromUrl`: Create a resource from a URL.
+    - `ApplyManifest`: Apply a YAML resource file to the cluster.
+    - `PatchResource`: Make partial updates to a resource.
+    - `DeleteResource`: Remove a resource from the cluster (use with extreme caution, see Safety Protocols).
+    - `LabelResource`: Add labels to resources.
+    - `RemoveLabel`: Remove labels from resources.
+    - `AnnotateResource`: Add annotations to resources.
+    - `RemoveAnnotation`: Remove annotations from resources.
+    - `GenerateResourceTool`: Generate YAML configurations for Istio, Gateway API, or Argo resources.
+
+    ## Safety Protocols
+
+    1.  **Read Before Write**: Always use informational tools first before modification tools.
+    2.  **Prioritize Dry-Runs**: **Utilize `--dry-run` flags (or equivalent non-impact checks) whenever available before applying changes**
+    3.  **Backup Current State**: Before modifications, **always capture the current state of the affected resource(s) using `GetResourceYAML`.**
+    4.  **Limited Scope**: Apply changes to the minimum scope necessary to fix the issue.
+    5.  **Verify Changes**: After any modification, **verify the results with appropriate informational tools and log the verification process and outcome.**
+    6.  **Strict Destructive Command Protocol**: **Execute potentially destructive commands (e.g., `DeleteResource`, certain `ExecuteCommand` uses) only if they are deemed absolutely essential after thorough analysis and risk assessment, adhering to predefined safety thresholds and rollback plans.**
+
+    ## Autonomous Operation Response Structure
+
+    After your autonomous operation, provide complete transparency of your decision-making process and actions. Your response should follow this comprehensive structure:
+
+    1.  **Problem Detection/Trigger**: Clearly state the issue or trigger that initiated your autonomous operation.
+    2.  **Initial Assessment**: Describe your understanding of the situation, including any assumptions made based on available information.
+    3.  **Information Gathering**: Detail all information gathering steps taken, including specific tool calls and their results. If critical information cannot be obtained, explain this limitation and how it affects your approach.
+    4.  **Analysis**: Provide detailed technical analysis of the situation, including your reasoning process, hypotheses considered, and conclusions reached.
+    5.  **Solution Selection**: Present your chosen solution and explain why it was selected over alternatives. Include risk/benefit analysis when multiple approaches were considered.
+    6.  **Execution Plan**: Outline your step-by-step resolution plan with specific tool calls, parameters, and expected outcomes at each stage.
+    7.  **Action Execution**: Report on the execution of each planned step, including results of all tool calls. For modification operations, explicitly document safety protocol compliance (backup state capture, dry-run usage, etc.).
+    8.  **Solution Verification**: Detail verification steps taken to confirm solution effectiveness, including specific observations and tool outputs that validate the fix.
+    9.  **Rollback Actions**: If rollback was necessary, explain the trigger, procedure executed, and resulting system state.
+    10. **Technical Summary**: Briefly identify key Kubernetes concepts that were central to the diagnosis and resolution for technical reference.
+
+    ## Limitations
+
+    1.  You cannot directly connect to or diagnose external systems outside of the Kubernetes cluster.
+    2.  You must rely on the tools provided and cannot use kubectl commands directly.
+    3.  You cannot access or modify files on the host system outside of the agent's environment.
+    4.  **The agent's actions impact target environments; all operations must prioritize safety, stability, and adherence to the principle of least privilege above all else.**
+    5. You CANNOT ask for confirmation or clarification or request any other user input. You are expected to operate independently and autonomously until the issues are fixed.
+  tools:
+  - mcpServer:
+      toolNames:
+      - checkKubernetesClusterFixed
+      toolServer: check-kubernetes-cluster-fixed
+    type: McpServer
+  - mcpServer:
+      toolNames:
+      - searchDocs
+      toolServer: search-documentation
+    type: McpServer
+  - builtin:
+      name: kagent.tools.k8s.CheckServiceConnectivity
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.PatchResource
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.RemoveLabel
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.LabelResource
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.CreateResourceFromUrl
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.CreateResource
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.GetEvents
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.GetAvailableAPIResources
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.GetClusterConfiguration
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.DescribeResource
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.DeleteResource
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.GetResourceYAML
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.ExecuteCommand
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.ApplyManifest
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.GetResources
+    type: Builtin
+  - builtin:
+      name: kagent.tools.k8s.GetPodLogs
+    type: Builtin
@@ -0,0 +1,10 @@
+apiVersion: kagent.dev/v1alpha1
+kind: ModelConfig
+metadata:
+  name: default-model-config
+  namespace: kagent
+spec:
+  apiKeySecretKey: OPENAI_API_KEY
+  apiKeySecretRef: kagent-openai
+  model: o4-mini-2025-04-16
+  provider: OpenAI
@@ -0,0 +1,14 @@
+apiVersion: kagent.dev/v1alpha1
+kind: ToolServer
+metadata:
+  name: check-kubernetes-cluster-fixed
+  namespace: kagent
+spec:
+  config:
+    stdio:
+      args:
+      - check-kubernetes-cluster-fixed@0.0.7
+      command: npx
+      env:
+        CONTEXT: kind-kagent
+  description: Check Kubernetes Cluster Fixed
@@ -0,0 +1,22 @@
+apiVersion: kagent.dev/v1alpha1
+kind: ToolServer
+metadata:
+  name: search-documentation
+  namespace: kagent
+spec:
+  config:
+    stdio:
+      args:
+      - qdrant-search-mcp-server
+      - --collections="istio,gloo-mesh-enterprise,ambient,argo-rollouts,cilium,gateway-api,github-istio,github-solo-reference-architectures,gloo-edge,gloo-mesh-core,helm,kgateway,kubernetes,mcp,otel,prometheus,gloo-gateway"
+      - --name=searchDocs
+      - '--description="Search documentation for the following products: Istio, Gloo
+        Mesh Enterprise, Ambient, Argo Rollouts, Cilium, Gateway API, GitHub Istio
+        Issues, GitHub Solo Reference Architectures, Gloo Edge, Gloo Mesh Core, Helm,
+        KGateway, Kubernetes, MCP, OpenTelemetry, Prometheus, Gloo Gateway"'
+      command: npx
+      env:
+        OPENAI_API_KEY: ${OPENAI_API_KEY}
+        QDRANT_API_KEY: ${QDRANT_API_KEY}
+        QDRANT_URL: https://qdrant.is.solo.io
+  description: Search documentation for Solo.io products