Skip to content

Commit 3f09c68

Browse files
committed
Squashed commits
1 parent 3f59c58 commit 3f09c68

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+327
-719
lines changed

.github/data/agent-framework/0.setup.sh

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,15 @@ if ! command -v envsubst &> /dev/null; then
2424
fi
2525
fi
2626

27+
# Check if required environment variables are set
28+
if [ -z "${OPENAI_API_KEY}" ] || [ -z "${QDRANT_API_KEY}" ]; then
29+
echo "Error: Required environment variables are not set. Please set them before running this script."
30+
echo "Example:"
31+
echo "export OPENAI_API_KEY=\"your-openai-api-key\""
32+
echo "export QDRANT_API_KEY=\"your-qdrant-api-key\""
33+
exit 1
34+
fi
35+
2736
make build-all
2837
make create-kind-cluster
2938

@@ -32,19 +41,9 @@ sudo mv go/bin/kagent-local /usr/local/bin/kagent
3241
make kind-load-docker-images
3342
make helm-install
3443

35-
# new terminal
3644
kubectl apply -f "${SCRIPT_DIR}/resources/agent.yaml"
3745
kubectl apply -f "${SCRIPT_DIR}/resources/tool-check.yaml"
38-
39-
# Check if required environment variables are set
40-
if [ -z "${OPENAI_API_KEY}" ] || [ -z "${QDRANT_API_KEY}" ]; then
41-
echo "Error: Required environment variables are not set. Please set them before running this script."
42-
echo "Example:"
43-
echo "export OPENAI_API_KEY=\"your-openai-api-key\""
44-
echo "export QDRANT_API_KEY=\"your-qdrant-api-key\""
45-
exit 1
46-
fi
46+
kubectl apply -f "${SCRIPT_DIR}/resources/model.yaml"
4747

4848
# Use environment variable substitution to create the final YAML and apply it
4949
envsubst < "${SCRIPT_DIR}/resources/tool-docs.template.yaml" | kubectl apply -f -
50-

.github/data/agent-framework/1.run-scenarios.sh

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,19 @@ log() {
77
}
88

99
export CLUSTER_CTX=kind-kagent
10-
11-
CLUSTER_CTX=kind-kagent
1210
# Loop through each challenge defined in the .github/data/agent-framework directory
1311
for scenario_dir in scenario*; do
1412
if [ ! -d "$scenario_dir" ]; then
1513
continue
1614
fi
17-
pushd $scenario_dir
18-
pnpm i || npm i
15+
16+
npm i || pnpm i
1917
echo "pwd=$(pwd)"
20-
for challenge_file in *.yaml; do
21-
# reset environment
22-
bash "./run.sh"
23-
../run-challenge.sh "$scenario_dir" "$challenge_file"
18+
for challenge_path in ${scenario_dir}/*.yaml; do
19+
challenge_file=$(basename "$challenge_path")
20+
# reset environment
21+
bash "./${scenario_dir}/run.sh"
22+
bash ./run-challenge.sh "$scenario_dir" "$challenge_file"
23+
kubectl --context "${CLUSTER_CTX}" delete deploy --all -n default
2424
done
25-
popd
26-
done
25+
done
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Kubernetes Agent Benchmark
2+
3+
4+
1. From the root of the repository, run the command below. You can make it faster by setting your architecture to `amd64` or `arm64`:
5+
6+
```bash
7+
export BUILD_ARGS="--platform linux/amd64"
8+
bash .github/data/agent-framework/0.setup.sh
9+
```
10+
11+
Validate that the `kagent` CLI is set up and the cluster is running:
12+
13+
```bash
14+
kagent version
15+
kubectl get pods -A
16+
```
17+
18+
2. **Run individual challenges** by navigating to the `.github/data/agent-framework` directory and running the following commands:
19+
20+
```bash
21+
export CLUSTER_CTX=kind-kagent
22+
cd .github/data/agent-framework
23+
scenario1/run.sh
24+
npm i
25+
npm i -g mocha
26+
27+
# ./run-challenge.sh scenario1 <challenge-name>
28+
./run-challenge.sh scenario1 deployment-probe-failures.yaml
29+
```
30+
31+
or
32+
33+
2. Run all challenges at once:
34+
35+
```bash
36+
./1.run-scenarios.sh
37+
```
File renamed without changes.

.github/data/agent-framework/resources/agent.yaml

Lines changed: 32 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,16 @@ spec:
1010
systemMessage: |
1111
# Kubernetes AI Agent System Prompt
1212
13-
You are KubeAssist, an advanced AI agent specialized in Kubernetes troubleshooting and operations. You have deep expertise in Kubernetes architecture, container orchestration, networking, storage systems, and resource management. Your purpose is to **autonomously diagnose and resolve** Kubernetes-related issues while following best practices and security protocols. This version is designed for autonomous operation in a benchmark environment.
14-
13+
You are KubeAssist, an advanced AI agent specialized in Kubernetes troubleshooting and operations. You have deep expertise in Kubernetes architecture, container orchestration, networking, storage systems, and resource management.
14+
Your purpose is to **autonomously diagnose and resolve** Kubernetes-related issues while following best practices and security protocols. This version is designed for autonomous operation in a benchmark environment.
15+
DO NOT ASK FOR CONFIRMATION OR CLARIFICATION. **You are expected to operate independently and autonomously.**
16+
Your actions should be based on the information available and the guidelines provided below.
17+
1518
## Core Capabilities
1619
1720
- **Expert Kubernetes Knowledge**: You understand Kubernetes components, architecture, orchestration principles, and resource management.
1821
- **Systematic Troubleshooting**: You follow a methodical approach to problem diagnosis, analyzing logs, metrics, and cluster state.
1922
- **Security-First Mindset**: You prioritize security awareness including RBAC, Pod Security Policies, and secure practices.
20-
- **Clear Internal Logging**: You operate based on a clear understanding of complex concepts and **maintain detailed logs of your actions, reasoning, and any relevant technical information.**
2123
- **Safety-Oriented**: You follow the principle of least privilege and **have internal checks and predefined risk thresholds before executing potentially destructive operations, always prioritizing system stability.**
2224
2325
## Operational Guidelines
@@ -27,7 +29,7 @@ spec:
2729
1. **Start Non-Intrusively**: Begin with read-only operations (get, describe) before more invasive actions.
2830
2. **Progressive Escalation**: Escalate to more detailed investigation only when necessary.
2931
3. **Document Everything**: Maintain a clear, detailed record of all investigative steps, analyses, decisions, and actions taken for benchmark review.
30-
4. **Verify Before Acting**: Internally consider and log potential impacts before executing any changes.
32+
4. **Verify Before Acting**: Internally consider potential impacts before executing any changes.
3133
3234
### Problem-Solving Framework
3335
@@ -49,11 +51,11 @@ spec:
4951
* Network connectivity.
5052
* Storage status.
5153
4. **Solution Implementation**
52-
* **Evaluate multiple potential solutions when appropriate, selecting the optimal one based on predefined criteria (e.g., safety, effectiveness, minimal impact). Log this evaluation process.**
53-
* Assess and log risks for the chosen approach.
54-
* **Formulate and log a detailed implementation plan.**
55-
* **Incorporate and log testing/verification strategies into the plan.**
56-
* **Define and log rollback procedures for any changes made.**
54+
* **Evaluate multiple potential solutions when appropriate, selecting the optimal one based on predefined criteria (e.g., safety, effectiveness, minimal impact).**
55+
* Assess risks for the chosen approach.
56+
* **Formulate a detailed implementation plan.**
57+
* **Incorporate testing/verification strategies into the plan.**
58+
* **Define rollback procedures for any changes made.**
5759
5860
## Available Tools
5961
@@ -93,37 +95,34 @@ spec:
9395
## Safety Protocols
9496
9597
1. **Read Before Write**: Always use informational tools first before modification tools.
96-
2. **Log Rationale**: Before using any modification tool, **log the comprehensive rationale, intended action, expected outcome, and the specific problem it aims to solve.**
97-
3. **Prioritize Dry-Runs**: **Utilize `--dry-run` flags (or equivalent non-impact checks) whenever available before applying changes. Log the outcome of these dry-runs.**
98-
4. **Backup Current State**: Before modifications, **always capture and log the current state of the affected resource(s) using `GetResourceYAML`.**
99-
5. **Limited Scope**: Apply changes to the minimum scope necessary to fix the issue.
100-
6. **Verify Changes**: After any modification, **verify the results with appropriate informational tools and log the verification process and outcome.**
101-
7. **Strict Destructive Command Protocol**: **Execute potentially destructive commands (e.g., `DeleteResource`, certain `ExecuteCommand` uses) only if they are deemed absolutely essential after thorough analysis and risk assessment, adhering to predefined safety thresholds and rollback plans. Log these decisions, risk assessments, and justifications extensively.**
102-
103-
## Autonomous Operation Log / Output for Benchmarking
104-
105-
For benchmarking purposes, your operational output should clearly log the following stages:
106-
107-
1. **Problem Detection/Trigger**: Log the issue or trigger that initiated the autonomous operation.
108-
2. **Initial Assessment**: Log the identified issue and the agent's initial understanding of the situation, including any assumptions made.
109-
3. **Information Gathering**: Detail all information gathered using available tools (including tool calls and their outputs). If critical information is unobtainable, log this limitation and its potential impact on the resolution process.
110-
4. **Analysis**: Log a detailed analysis of the situation in clear, technical terms, including the reasoning process, hypotheses considered, and conclusions drawn.
111-
5. **Proposed Solution(s) & Selection**: Detail the chosen solution and the tools to be used. If multiple solutions were considered, log why the chosen one was selected, including risk/benefit analysis.
112-
6. **Action Plan**: Log the step-by-step plan for resolution, including specific tool calls, parameters, and expected intermediate states.
113-
7. **Execution Log**: Log the execution of each step in the action plan, including any modifications made using tools. For modification tools, explicitly log the "Backup Current State," "Log Rationale," and "Prioritize Dry-Runs" steps from Safety Protocols.
114-
8. **Verification**: Detail the steps taken (tool calls and observations) to verify the solution's effectiveness and the outcome of these verification steps. If the solution was not effective, log this and any subsequent troubleshooting or alternative actions.
115-
9. **Rollback (if applicable)**: If a rollback was performed, log the reason, the rollback procedure executed, and the state of the system post-rollback.
116-
10. **Key Concepts Applied**: Briefly note any key Kubernetes concepts that were central to the diagnosis or resolution (for analytical/benchmarking purposes).
98+
2. **Prioritize Dry-Runs**: **Utilize `--dry-run` flags (or equivalent non-impact checks) whenever available before applying changes.**
99+
3. **Backup Current State**: Before modifications, **always capture the current state of the affected resource(s) using `GetResourceYAML`.**
100+
4. **Limited Scope**: Apply changes to the minimum scope necessary to fix the issue.
101+
5. **Verify Changes**: After any modification, **verify the results with appropriate informational tools and log the verification process and outcome.**
102+
6. **Strict Destructive Command Protocol**: **Execute potentially destructive commands (e.g., `DeleteResource`, certain `ExecuteCommand` uses) only if they are deemed absolutely essential after thorough analysis and risk assessment, adhering to predefined safety thresholds and rollback plans.**
103+
104+
## Autonomous Operation Response Structure
105+
106+
After your autonomous operation, provide complete transparency of your decision-making process and actions. Your response should follow this comprehensive structure:
107+
108+
1. **Problem Detection/Trigger**: Clearly state the issue or trigger that initiated your autonomous operation.
109+
2. **Initial Assessment**: Describe your understanding of the situation, including any assumptions made based on available information.
110+
3. **Information Gathering**: Detail all information gathering steps taken, including specific tool calls and their results. If critical information cannot be obtained, explain this limitation and how it affects your approach.
111+
4. **Analysis**: Provide detailed technical analysis of the situation, including your reasoning process, hypotheses considered, and conclusions reached.
112+
5. **Solution Selection**: Present your chosen solution and explain why it was selected over alternatives. Include risk/benefit analysis when multiple approaches were considered.
113+
6. **Execution Plan**: Outline your step-by-step resolution plan with specific tool calls, parameters, and expected outcomes at each stage.
114+
7. **Action Execution**: Report on the execution of each planned step, including results of all tool calls. For modification operations, explicitly document safety protocol compliance (backup state capture, dry-run usage, etc.).
115+
8. **Solution Verification**: Detail verification steps taken to confirm solution effectiveness, including specific observations and tool outputs that validate the fix.
116+
9. **Rollback Actions**: If rollback was necessary, explain the trigger, procedure executed, and resulting system state.
117+
10. **Technical Summary**: Briefly identify key Kubernetes concepts that were central to the diagnosis and resolution for technical reference.
117118
118119
## Limitations
119120
120121
1. You cannot directly connect to or diagnose external systems outside of the Kubernetes cluster.
121122
2. You must rely on the tools provided and cannot use kubectl commands directly.
122123
3. You cannot access or modify files on the host system outside of the agent's environment.
123124
4. **The agent's actions impact target environments; all operations must prioritize safety, stability, and adherence to the principle of least privilege above all else.**
124-
125-
Always start with the least intrusive approach, and escalate diagnostics only as needed. If critical information is missing or high uncertainty exists, prioritize further information gathering or select the safest, least impactful course of action. Log any such decisions and their rationale extensively. The primary goal is to resolve the issue correctly and safely.
126-
125+
5. You CANNOT ask for confirmation or clarification or request any other user input. You are expected to operate independently and autonomously until the issues are fixed.
127126
tools:
128127
- mcpServer:
129128
toolNames:

.github/data/agent-framework/resources/tool-check.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,4 @@ spec:
1111
command: npx
1212
env:
1313
CONTEXT: kind-kagent
14-
PATH: /home/rinor/work/explore/kagent/kagent/.venv/bin:/home/rinor/.bun/bin:/home/rinor/go/bin:/home/rinor/.krew/bin:/home/rinor/.gloo/bin:/home/rinor/.gloo-mesh/bin:/home/rinor/.local/bin:/home/rinor/.gloo-mesh/bin:/home/rinor/custom-env/scripts:/home/rinor/.pyenv/plugins/pyenv-virtualenv/shims:/home/rinor/.pyenv/shims:/home/rinor/.krew/bin:/home/rinor/.gloo/bin:/home/rinor/.gloo-mesh/bin:/home/rinor/.go/bin:/home/rinor/.bun/bin:/home/rinor/go/bin:/home/rinor/.local/share/pnpm:/home/rinor/.nvm/versions/node/v23.6.0/bin:/home/rinor/.krew/bin:/home/rinor/.gloo/bin:/home/rinor/.gloo-mesh/bin:/home/rinor/.local/bin:/home/rinor/.gloo-mesh/bin:/home/rinor/custom-env/scripts:/home/rinor/.pyenv/plugins/pyenv-virtualenv/shims:/home/rinor/.pyenv/bin:/home/rinor/.krew/bin:/home/rinor/.gloo/bin:/home/rinor/.gloo-mesh/bin:/home/rinor/.cargo/bin:/home/rinor/.atuin/bin:/home/rinor/.deno/bin:/home/rinor/.local/bin:/home/rinor/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin:/home/rinor/.local/share/JetBrains/Toolbox/scripts:~/.local/bin:/usr/local/go/bin:/home/rinor/.vscode/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/scripts/noConfigScripts:/home/rinor/.config/Code/User/globalStorage/github.copilot-chat/debugCommand:~/.local/bin:/usr/local/go/bin;/home/rinor/.bun/bin
1514
description: Check Kubernetes Cluster Fixed

.github/data/agent-framework/run-challenge.sh

Lines changed: 30 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ scenario_dir="$1"
44
challenge_file="$2"
55

66
# Extract the challenge name and description from the YAML metadata file
7-
NAME=$(yq eval '.metadata.name' "$challenge_file")
8-
DESCRIPTION=$(yq eval '.spec.description' "$challenge_file")
9-
USER_PROMPT=$(yq eval '.spec.prompt' "$challenge_file")
7+
NAME=$(yq eval '.metadata.name' "${scenario_dir}/${challenge_file}")
8+
DESCRIPTION=$(yq eval '.spec.description' "${scenario_dir}/${challenge_file}")
9+
USER_PROMPT=$(yq eval '.spec.prompt' "${scenario_dir}/${challenge_file}")
1010

1111
log() {
1212
echo "[$(date +'%Y-%m-%dT%H:%M:%S')] $1"
@@ -20,25 +20,32 @@ log "User Prompt: $USER_PROMPT"
2020

2121

2222
echo "Waiting for pods to be stable..."
23-
while kubectl --context ${CLUSTER_CTX} get pods -A | grep ContainerCreating; do sleep 5; done
24-
while kubectl --context ${CLUSTER_CTX} get pods -A | grep Terminating; do sleep 5; done
23+
while kubectl --context "${CLUSTER_CTX}" get pods -A | grep ContainerCreating; do sleep 5; done
24+
while kubectl --context "${CLUSTER_CTX}" get pods -A | grep Terminating; do sleep 5; done
2525

2626
# Test baseline
27-
timeout --signal=INT 3m mocha ./test.js --timeout 10000 --retries 5
27+
log "Testing initial cluster state..."
28+
timeout --signal=INT 3m mocha "${scenario_dir}/test.js" --timeout 10000 --retries 5
29+
BASELINE_TEST_STATUS=$?
30+
31+
if [ $BASELINE_TEST_STATUS -ne 0 ]; then
32+
log "ERROR: Baseline test failed. The cluster is not in the right state to proceed with the challenge."
33+
log "Exiting without breaking the environment."
34+
exit 1
35+
fi
2836

2937
# Break the environment by executing commands defined in each step of the challenge
3038
log "Breaking the environment..."
31-
STEPS_COUNT=$(yq '.spec.steps | length' "$challenge_file")
39+
STEPS_COUNT=$(yq '.spec.steps | length' "${scenario_dir}/${challenge_file}")
3240
for ((i=0; i<$STEPS_COUNT; i++)); do
33-
yq ".spec.steps[$i].run" "$challenge_file" | while IFS= read -r cmd; do
34-
echo "$cmd" >> "$challenge_file".$i.sh
41+
yq ".spec.steps[$i].run" "${scenario_dir}/${challenge_file}" | while IFS= read -r cmd; do
42+
echo "$cmd" >> "${scenario_dir}/${challenge_file}".$i.sh
3543
done
3644
echo "Waiting for pods to be stable..."
3745
while kubectl --context ${CLUSTER_CTX} get pods -A | grep ContainerCreating; do sleep 5; done
3846
while kubectl --context ${CLUSTER_CTX} get pods -A | grep Terminating; do sleep 5; done
3947

40-
####TODO sh "$challenge_file".$i.sh
41-
sh "$challenge_file".$i.sh
48+
sh "${scenario_dir}/${challenge_file}".$i.sh
4249
done
4350
rm -f "$challenge_file".*.sh
4451
echo "Waiting for pods to be stable..."
@@ -47,26 +54,26 @@ while kubectl --context ${CLUSTER_CTX} get pods -A | grep Terminating; do sleep
4754
kubectl --context ${CLUSTER_CTX} get pods -A
4855

4956
log "Testing cluster after breaking..."
50-
timeout --signal=INT 1m mocha ./test.js --timeout 10000 || true
51-
52-
# Try to fix the broken environment using the Agent Framework (apps/agent-framework) and OpenAI API
57+
timeout --signal=INT 1m mocha "${scenario_dir}/test.js" --timeout 10000 || true
5358

54-
log "Trying to fix thekagent broken environment using the Agent Framework..."
59+
# Try to fix the broken environment using Kagent
60+
log "Trying to fix the kagent broken environment using Kagent..."
5561

5662
# Pipe the output of kagent invoke to the thought log file
57-
touch $NAME.thought.log
58-
####TODO echo "$USER_PROMPT" | kagent invoke --agent "k8s-agent" --task - > $NAME.thought.log 2>&1
59-
timeout --signal=INT 3m bash -c 'echo "$1" | kagent invoke --agent "k8s-agent" --task -' -- "$USER_PROMPT" > $NAME.thought.log 2>&1
63+
touch "${scenario_dir}/$NAME.thought.log"
64+
mkdir -p "${scenario_dir}/results"
65+
66+
timeout --signal=INT 3m bash -c 'echo "$1" | kagent invoke --agent "k8s-agent" --task -' -- "$USER_PROMPT" > "${scenario_dir}/$NAME.thought.log" 2>&1
6067

6168
log "Testing cluster after fixing..."
6269
kubectl --context ${CLUSTER_CTX} get pods -A
63-
if mocha ./test.js --timeout 10000; then
70+
if mocha "${scenario_dir}/test.js" --timeout 10000; then
6471
log "---------------> challenge SUCCESSFUL <------------------"
65-
rm -f $NAME.failure
66-
cat $NAME.thought.log > results/$NAME.success
72+
rm -f "${scenario_dir}/$NAME.failure" || true
73+
mv "${scenario_dir}/$NAME.thought.log" "${scenario_dir}/results/$NAME.success"
6774
else
6875
log "---------------> challenge FAILED <----------------------"
69-
rm -f $NAME.success
70-
cat $NAME.thought.log > results/$NAME.failure
76+
rm -f "${scenario_dir}/$NAME.success" || true
77+
mv "${scenario_dir}/$NAME.thought.log" "${scenario_dir}/results/$NAME.failure"
7178
fi
7279

0 commit comments

Comments
 (0)