Skip to content

Commit 3f9db25

Browse files
committed
containerized aks-agent
1 parent ef668ae commit 3f9db25

File tree

414 files changed

+2884
-102049
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

414 files changed

+2884
-102049
lines changed

src/aks-agent/HISTORY.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ To release a new version, please select a new version number (usually plus 1 to
1212
Pending
1313
+++++++
1414

15+
1.0.0b12
16+
++++++++
17+
18+
1519
1.0.0b11
1620
++++++++
1721
* Fix(agent-init): replace max_tokens with max_completion_tokens for connection check of Azure OpenAI service.

src/aks-agent/azext_aks_agent/__init__.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,8 @@
44
# --------------------------------------------------------------------------------------------
55

66

7-
import os
8-
97
# pylint: disable=unused-import
10-
import azext_aks_agent._help
11-
from azext_aks_agent._consts import (
12-
CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY,
13-
CONST_AGENT_NAME,
14-
CONST_AGENT_NAME_ENV_KEY,
15-
CONST_DISABLE_PROMETHEUS_TOOLSET_ENV_KEY,
16-
CONST_PRIVACY_NOTICE_BANNER,
17-
CONST_PRIVACY_NOTICE_BANNER_ENV_KEY,
18-
)
198
from azure.cli.core import AzCommandsLoader
20-
from azure.cli.core.api import get_config_dir
219

2210

2311
class ContainerServiceCommandsLoader(AzCommandsLoader):
@@ -44,14 +32,3 @@ def load_arguments(self, command):
4432

4533

4634
COMMAND_LOADER_CLS = ContainerServiceCommandsLoader
47-
48-
49-
# NOTE(mainred): holmesgpt leverages the environment variables to customize its behavior.
50-
def customize_holmesgpt():
51-
os.environ[CONST_DISABLE_PROMETHEUS_TOOLSET_ENV_KEY] = "true"
52-
os.environ[CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY] = get_config_dir()
53-
os.environ[CONST_AGENT_NAME_ENV_KEY] = CONST_AGENT_NAME
54-
os.environ[CONST_PRIVACY_NOTICE_BANNER_ENV_KEY] = CONST_PRIVACY_NOTICE_BANNER
55-
56-
57-
customize_holmesgpt()

src/aks-agent/azext_aks_agent/_consts.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,16 @@
3030
CONST_MCP_GITHUB_REPO = "Azure/aks-mcp"
3131
CONST_MCP_BINARY_DIR = "bin"
3232

33-
# Color constants for terminal output
34-
HELP_COLOR = "cyan" # same as AI_COLOR for now
35-
ERROR_COLOR = "red"
33+
# Kubernetes WebSocket exec protocol constants
34+
RESIZE_CHANNEL = 4 # WebSocket channel for terminal resize messages
35+
# WebSocket heartbeat configuration (matching kubectl client-go)
36+
# Based on kubernetes/client-go/tools/remotecommand/websocket.go#L59-L65
37+
# pingPeriod = 5 * time.Second
38+
# pingReadDeadline = (pingPeriod * 12) + (1 * time.Second)
39+
# The read deadline is calculated to allow up to 12 missed pings plus 1 second buffer
40+
# This provides tolerance for network delays while detecting actual connection failures
41+
HEARTBEAT_INTERVAL = 5.0 # pingPeriod: 5 seconds between pings
42+
HEARTBEAT_TIMEOUT = (HEARTBEAT_INTERVAL * 12) + 1 # pingReadDeadline: 61 seconds total timeout
43+
44+
AGENT_NAMESPACE = "kube-system"
45+
AGENT_LABEL_SELECTOR = "app.kubernetes.io/name=aks-agent"

src/aks-agent/azext_aks_agent/_help.py

Lines changed: 33 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
# --------------------------------------------------------------------------------------------
66

77
# pylint: disable=too-many-lines
8-
98
from knack.help_files import helps
109

1110
helps[
@@ -15,14 +14,19 @@
1514
short-summary: Run AI assistant to analyze and troubleshoot Kubernetes clusters.
1615
long-summary: |-
1716
This command allows you to ask questions about your Azure Kubernetes cluster and get answers using AI models.
18-
No need to manually set environment variables! All model and credential information can be configured interactively using `az aks agent-init` or via a config file.
1917
parameters:
2018
- name: --name -n
2119
type: string
2220
short-summary: Name of the managed cluster.
2321
- name: --resource-group -g
2422
type: string
2523
short-summary: Name of the resource group.
24+
- name: --init
25+
type: bool
26+
short-summary: Initialize and deploy the AKS agent to the cluster.
27+
long-summary: |-
28+
Run the interactive initialization wizard to configure LLM settings, cluster role permissions,
29+
and deploy the AKS agent Helm chart to your cluster. Required when first setting up the agent.
2630
- name: --model
2731
type: string
2832
short-summary: Specify the LLM provider and model or deployment to use for the AI assistant.
@@ -33,12 +37,6 @@
3337
Each provider may require different environment variables and model naming conventions.
3438
For a full list of supported providers, model patterns, and required environment variables, see https://docs.litellm.ai/docs/providers.
3539
Note: For Azure OpenAI, it is recommended to set the deployment name as the model name until https://github.com/BerriAI/litellm/issues/13950 is resolved.
36-
- name: --api-key
37-
type: string
38-
short-summary: API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY). (Deprecated)
39-
- name: --config-file
40-
type: string
41-
short-summary: Path to configuration file.
4240
- name: --max-steps
4341
type: int
4442
short-summary: Maximum number of steps the LLM can take to investigate the issue.
@@ -56,93 +54,43 @@
5654
short-summary: Refresh the toolsets status.
5755
- name: --status
5856
type: bool
59-
short-summary: Show AKS agent configuration and status information.
60-
- name: --aks-mcp
57+
short-summary: Show AKS agent deployment status including helm release, deployments, and pod information.
58+
- name: --cleanup
6159
type: bool
62-
short-summary: Enable AKS MCP integration for enhanced capabilities. Traditional mode is the default.
60+
short-summary: Uninstall the AKS agent and delete all associated resources from the cluster.
6361
6462
examples:
65-
- name: Ask about pod issues in the cluster with last configured model
63+
- name: Initialize and deploy AKS agent to a cluster
6664
text: |-
67-
az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup
68-
- name: Ask about pod issues in the cluster with Azure OpenAI
65+
az aks agent --init --resource-group myResourceGroup --name myAKSCluster
66+
- name: Check AKS agent deployment status
6967
text: |-
70-
az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model azure/gpt-4.1
68+
az aks agent --status
7169
- name: Ask about pod issues in the cluster with OpenAI
7270
text: |-
73-
az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model gpt-4o
74-
- name: Run agent with config file
75-
text: |
76-
az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml --name MyManagedCluster --resource-group MyResourceGroup
77-
Here is an example of config file:
78-
```json
79-
llms:
80-
- provider: azure
81-
MODEL_NAME: gpt-4.1
82-
AZURE_API_KEY: *******
83-
AZURE_API_BASE: https://{azure-openai-service-name}.openai.azure.com/
84-
AZURE_API_VERSION: 2025-04-01-preview
85-
# define a list of mcp servers, mcp server can be defined
86-
mcp_servers:
87-
aks_mcp:
88-
description: "The AKS-MCP is a Model Context Protocol (MCP) server that enables AI assistants to interact with Azure Kubernetes Service (AKS) clusters"
89-
url: "http://localhost:8003/sse"
90-
91-
# try adding your own tools or toggle the built-in toolsets here
92-
# e.g. query company-specific data, fetch logs from your existing observability tools, etc
93-
# To check how to add a customized toolset, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/custom_toolsets.html#custom-toolsets
94-
# To find all built-in toolsets, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/builtin_toolsets.html
95-
toolsets:
96-
# add a new json processor toolset
97-
json_processor:
98-
description: "A toolset for processing JSON data using jq"
99-
prerequisites:
100-
- command: "jq --version" # Ensure jq is installed
101-
tools:
102-
- name: "process_json"
103-
description: "A tool that uses jq to process JSON input"
104-
command: "echo '{{ json_input }}' | jq '.'" # Example jq command to format JSON
105-
# disable a built-in toolsets
106-
aks/core:
107-
enabled: false
108-
```
109-
- name: Run in interactive mode without a question
110-
text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/gpt-4.1 --api-key "sk-xxx"
111-
- name: Run in non-interactive batch mode
112-
text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/gpt-4.1
113-
- name: Show detailed tool output during analysis
114-
text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/gpt-4.1
115-
- name: Use custom configuration file
116-
text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml --model azure/gpt-4.1
117-
- name: Run agent with no echo of the original question
118-
text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/gpt-4.1
119-
- name: Refresh toolsets to get the latest available tools
120-
text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/gpt-4.1
121-
- name: Show agent status (MCP readiness)
122-
text: az aks agent --status
71+
az aks agent "Why are my pods not starting?" --model gpt-4o
72+
- name: Ask about pod issues in the cluster with last configured model
73+
text: |-
74+
az aks agent "Why are my pods not starting?"
75+
- name: Ask about pod issues in the cluster with Azure OpenAI
76+
text: |-
77+
az aks agent "Why are my pods not starting?" --model azure/gpt-4.1
12378
- name: Run in interactive mode without a question
124-
text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment --api-key "sk-xxx"
79+
text: |-
80+
az aks agent "Check the pod status in my cluster" --model azure/gpt-4.1
12581
- name: Run in non-interactive batch mode
126-
text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/my-gpt4.1-deployment
82+
text: |-
83+
az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/gpt-4.1
12784
- name: Show detailed tool output during analysis
128-
text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/my-gpt4.1-deployment
129-
- name: Use custom configuration file
130-
text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml --model azure/my-gpt4.1-deployment
85+
text: |-
86+
az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/gpt-4.1
13187
- name: Run agent with no echo of the original question
132-
text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/my-gpt4.1-deployment
88+
text: |-
89+
az aks agent "What is the status of my cluster?" --no-echo-request --model azure/gpt-4.1
13390
- name: Refresh toolsets to get the latest available tools
134-
text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/my-gpt4.1-deployment
135-
"""
136-
137-
helps[
138-
"aks agent-init"
139-
] = """
140-
type: command
141-
short-summary: Initialize and validate LLM provider/model configuration for AKS agent.
142-
long-summary: |-
143-
This command interactively guides you to select an LLM provider and model, validates the connection, and saves the configuration for later use.
144-
You can run this command multiple times to add or update different model configurations.
145-
examples:
146-
- name: Initialize configuration for Azure OpenAI, OpenAI or other llms
147-
text: az aks agent-init
91+
text: |-
92+
az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/gpt-4.1
93+
- name: Cleanup and uninstall AKS agent from the cluster
94+
text: |-
95+
az aks agent --cleanup
14896
"""

src/aks-agent/azext_aks_agent/_params.py

Lines changed: 7 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,6 @@
44
# --------------------------------------------------------------------------------------------
55

66
# pylint: disable=too-many-statements,too-many-lines
7-
import os.path
8-
9-
from azext_aks_agent._consts import CONST_AGENT_CONFIG_FILE_NAME
10-
from azext_aks_agent._validators import validate_agent_config_file
11-
from azure.cli.core.api import get_config_dir
12-
from azure.cli.core.commands.parameters import get_three_state_flag
13-
14-
157
def load_arguments(self, _):
168
with self.argument_context("aks agent") as c:
179
c.positional(
@@ -40,24 +32,17 @@ def load_arguments(self, _):
4032
help="Maximum number of steps the LLM can take to investigate the issue.",
4133
)
4234
c.argument(
43-
"config_file",
44-
default=os.path.join(get_config_dir(), CONST_AGENT_CONFIG_FILE_NAME),
45-
validator=validate_agent_config_file,
35+
"init",
4636
required=False,
47-
help="Path to the config file.",
37+
help="Initialize llm configurations and aks-agent environment on the AKS cluster.",
38+
action="store_true",
4839
)
4940
c.argument(
5041
"model",
5142
help=" Specify the LLM provider and model or deployment to use for the AI assistant.",
5243
required=False,
5344
type=str,
5445
)
55-
c.argument(
56-
"api_key",
57-
help="API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY)",
58-
required=False,
59-
type=str,
60-
)
6146
c.argument(
6247
"no_interactive",
6348
help="Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.",
@@ -85,13 +70,8 @@ def load_arguments(self, _):
8570
help="Show AKS agent configuration and status information.",
8671
)
8772
c.argument(
88-
"use_aks_mcp",
89-
options_list=["--aks-mcp"],
90-
default=False,
91-
arg_type=get_three_state_flag(),
92-
help=(
93-
"Enable AKS MCP integration for enhanced capabilities. "
94-
"Traditional mode is the default. Use --aks-mcp to enable MCP mode, or "
95-
"--no-aks-mcp to explicitly disable it."
96-
),
73+
"cleanup",
74+
options_list=["--cleanup"],
75+
action="store_true",
76+
help="Remove aks-agent resources on the AKS cluster.",
9777
)

src/aks-agent/azext_aks_agent/_validators.py

Lines changed: 0 additions & 53 deletions
This file was deleted.

0 commit comments

Comments
 (0)