Skip to content

Commit fee5683

Browse files
committed
containerized aks agent
1 parent ab95bc2 commit fee5683

33 files changed

+2251
-3831
lines changed

src/aks-agent/azext_aks_agent/__init__.py

Lines changed: 11 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,27 +3,26 @@
33
# Licensed under the MIT License. See License.txt in the project root for license information.
44
# --------------------------------------------------------------------------------------------
55

6-
7-
import os
6+
from azext_aks_agent._client_factory import CUSTOM_MGMT_AKS
87

98
# pylint: disable=unused-import
10-
import azext_aks_agent._help
11-
from azext_aks_agent._consts import (
12-
CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY,
13-
CONST_AGENT_NAME,
14-
CONST_AGENT_NAME_ENV_KEY,
15-
CONST_DISABLE_PROMETHEUS_TOOLSET_ENV_KEY,
16-
CONST_PRIVACY_NOTICE_BANNER,
17-
CONST_PRIVACY_NOTICE_BANNER_ENV_KEY,
18-
)
199
from azure.cli.core import AzCommandsLoader
20-
from azure.cli.core.api import get_config_dir
10+
from azure.cli.core.profiles import register_resource_type
11+
12+
13+
def register_aks_agent_resource_type():
    """Register the extension's custom container service resource type.

    Adds ``CUSTOM_MGMT_AKS`` to the ``latest`` profile with no API version
    given (``None``).
    """
    profile_name = "latest"
    api_version = None
    register_resource_type(profile_name, CUSTOM_MGMT_AKS, api_version)
2119

2220

2321
class ContainerServiceCommandsLoader(AzCommandsLoader):
2422

2523
def __init__(self, cli_ctx=None):
2624
from azure.cli.core.commands import CliCommandType
25+
register_aks_agent_resource_type()
2726

2827
aks_agent_custom = CliCommandType(operations_tmpl='azext_aks_agent.custom#{}')
2928
super().__init__(
@@ -44,14 +43,3 @@ def load_arguments(self, command):
4443

4544

4645
COMMAND_LOADER_CLS = ContainerServiceCommandsLoader
47-
48-
49-
# NOTE(mainred): holmesgpt leverages the environment variables to customize its behavior.
50-
def customize_holmesgpt():
51-
os.environ[CONST_DISABLE_PROMETHEUS_TOOLSET_ENV_KEY] = "true"
52-
os.environ[CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY] = get_config_dir()
53-
os.environ[CONST_AGENT_NAME_ENV_KEY] = CONST_AGENT_NAME
54-
os.environ[CONST_PRIVACY_NOTICE_BANNER_ENV_KEY] = CONST_PRIVACY_NOTICE_BANNER
55-
56-
57-
customize_holmesgpt()
src/aks-agent/azext_aks_agent/_client_factory.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# --------------------------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License. See License.txt in the project root for license information.
4+
# --------------------------------------------------------------------------------------------
5+
6+
from azure.cli.core.commands.client_factory import get_mgmt_service_client
7+
from azure.cli.core.commands.parameters import get_resources_in_subscription
8+
from azure.cli.core.profiles import CustomResourceType, ResourceType
9+
from azure.mgmt.msi import ManagedServiceIdentityClient
10+
from knack.util import CLIError
11+
12+
CUSTOM_MGMT_AKS = CustomResourceType('azext_aks_agent.vendored_sdks.azure_mgmt_containerservice.2025_10_01',
13+
'ContainerServiceClient')
14+
15+
# Note: cf_xxx, used as the client_factory option value of a command group at command declaration, should ignore
16+
# parameters other than cli_ctx; get_xxx_client is used as the client of other services in the command implementation,
17+
# and usually accepts subscription_id as a parameter to reconfigure the subscription when sending the request
18+
19+
20+
# container service clients
21+
def get_container_service_client(cli_ctx, subscription_id=None):
    """Build a management client for the ``CUSTOM_MGMT_AKS`` resource type.

    :param cli_ctx: CLI context forwarded to ``get_mgmt_service_client``.
    :param subscription_id: Optional subscription ID forwarded to
        ``get_mgmt_service_client``; defaults to ``None``.
    :return: The client returned by ``get_mgmt_service_client``.
    """
    client = get_mgmt_service_client(
        cli_ctx,
        CUSTOM_MGMT_AKS,
        subscription_id=subscription_id,
    )
    return client
23+
24+
25+
def cf_managed_clusters(cli_ctx, *_):
    """Client factory returning the ``managed_clusters`` operations group.

    Any positional arguments after ``cli_ctx`` are accepted and ignored.

    :param cli_ctx: CLI context used to build the container service client.
    :return: The ``managed_clusters`` attribute of the container service client.
    """
    container_service = get_container_service_client(cli_ctx)
    return container_service.managed_clusters

src/aks-agent/azext_aks_agent/_consts.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,17 @@
3030
CONST_MCP_GITHUB_REPO = "Azure/aks-mcp"
3131
CONST_MCP_BINARY_DIR = "bin"
3232

33-
# Color constants for terminal output
34-
HELP_COLOR = "cyan" # same as AI_COLOR for now
35-
ERROR_COLOR = "red"
33+
# Kubernetes WebSocket exec protocol constants
34+
RESIZE_CHANNEL = 4 # WebSocket channel for terminal resize messages
35+
# WebSocket heartbeat configuration (matching kubectl client-go)
36+
# Based on kubernetes/client-go/tools/remotecommand/websocket.go#L59-L65
37+
# pingPeriod = 5 * time.Second
38+
# pingReadDeadline = (pingPeriod * 12) + (1 * time.Second)
39+
# The read deadline is calculated to allow up to 12 missed pings plus a 1-second buffer
40+
# This provides tolerance for network delays while detecting actual connection failures
41+
HEARTBEAT_INTERVAL = 5.0 # pingPeriod: 5 seconds between pings
42+
HEARTBEAT_TIMEOUT = (HEARTBEAT_INTERVAL * 12) + 1 # pingReadDeadline: 61 seconds total timeout
43+
44+
AGENT_NAMESPACE = "kube-system"
45+
AGENT_LABEL_SELECTOR = "app.kubernetes.io/name=aks-agent"
46+
AKS_MCP_LABEL_SELECTOR = "app.kubernetes.io/name=aks-mcp"

src/aks-agent/azext_aks_agent/_help.py

Lines changed: 49 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,15 @@
55
# --------------------------------------------------------------------------------------------
66

77
# pylint: disable=too-many-lines
8-
98
from knack.help_files import helps
109

1110
helps[
1211
"aks agent"
1312
] = """
1413
type: command
15-
short-summary: Run AI assistant to analyze and troubleshoot Kubernetes clusters.
14+
short-summary: Run AI assistant to analyze and troubleshoot Azure Kubernetes Service (AKS) clusters.
1615
long-summary: |-
1716
This command allows you to ask questions about your Azure Kubernetes cluster and get answers using AI models.
18-
No need to manually set environment variables! All model and credential information can be configured interactively using `az aks agent-init` or via a config file.
1917
parameters:
2018
- name: --name -n
2119
type: string
@@ -33,12 +31,6 @@
3331
Each provider may require different environment variables and model naming conventions.
3432
For a full list of supported providers, model patterns, and required environment variables, see https://docs.litellm.ai/docs/providers.
3533
Note: For Azure OpenAI, it is recommended to set the deployment name as the model name until https://github.com/BerriAI/litellm/issues/13950 is resolved.
36-
- name: --api-key
37-
type: string
38-
short-summary: API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY). (Deprecated)
39-
- name: --config-file
40-
type: string
41-
short-summary: Path to configuration file.
4234
- name: --max-steps
4335
type: int
4436
short-summary: Maximum number of steps the LLM can take to investigate the issue.
@@ -56,82 +48,35 @@
5648
short-summary: Refresh the toolsets status.
5749
- name: --status
5850
type: bool
59-
short-summary: Show AKS agent configuration and status information.
60-
- name: --aks-mcp
61-
type: bool
62-
short-summary: Enable AKS MCP integration for enhanced capabilities. Traditional mode is the default.
63-
51+
short-summary: Show AKS agent deployment status including helm release, deployments, and pod information.
6452
examples:
53+
- name: Ask about pod issues in the cluster with OpenAI
54+
text: |-
55+
az aks agent "Why are my pods not starting?" --model gpt-4o --resource-group myResourceGroup --name myAKSCluster
6556
- name: Ask about pod issues in the cluster with last configured model
6657
text: |-
67-
az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup
68-
- name: Ask about pod issues in the cluster with Azure OpenAI
58+
az aks agent "Why are my pods not starting?" --resource-group myResourceGroup --name myAKSCluster
59+
- name: Check AKS agent deployment status
6960
text: |-
70-
az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model azure/gpt-4.1
71-
- name: Ask about pod issues in the cluster with OpenAI
61+
az aks agent --status --resource-group myResourceGroup --name myAKSCluster
62+
- name: Ask about pod issues in the cluster with Azure OpenAI
7263
text: |-
73-
az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model gpt-4o
74-
- name: Run agent with config file
75-
text: |
76-
az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml --name MyManagedCluster --resource-group MyResourceGroup
77-
Here is an example of config file:
78-
```json
79-
llms:
80-
- provider: azure
81-
MODEL_NAME: gpt-4.1
82-
AZURE_API_KEY: *******
83-
AZURE_API_BASE: https://{azure-openai-service-name}.openai.azure.com/
84-
AZURE_API_VERSION: 2025-04-01-preview
85-
# define a list of mcp servers, mcp server can be defined
86-
mcp_servers:
87-
aks_mcp:
88-
description: "The AKS-MCP is a Model Context Protocol (MCP) server that enables AI assistants to interact with Azure Kubernetes Service (AKS) clusters"
89-
url: "http://localhost:8003/sse"
90-
91-
# try adding your own tools or toggle the built-in toolsets here
92-
# e.g. query company-specific data, fetch logs from your existing observability tools, etc
93-
# To check how to add a customized toolset, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/custom_toolsets.html#custom-toolsets
94-
# To find all built-in toolsets, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/builtin_toolsets.html
95-
toolsets:
96-
# add a new json processor toolset
97-
json_processor:
98-
description: "A toolset for processing JSON data using jq"
99-
prerequisites:
100-
- command: "jq --version" # Ensure jq is installed
101-
tools:
102-
- name: "process_json"
103-
description: "A tool that uses jq to process JSON input"
104-
command: "echo '{{ json_input }}' | jq '.'" # Example jq command to format JSON
105-
# disable a built-in toolsets
106-
aks/core:
107-
enabled: false
108-
```
64+
az aks agent "Why are my pods not starting?" --model azure/gpt-4.1 --resource-group myResourceGroup --name myAKSCluster
10965
- name: Run in interactive mode without a question
110-
text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/gpt-4.1 --api-key "sk-xxx"
111-
- name: Run in non-interactive batch mode
112-
text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/gpt-4.1
113-
- name: Show detailed tool output during analysis
114-
text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/gpt-4.1
115-
- name: Use custom configuration file
116-
text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml --model azure/gpt-4.1
117-
- name: Run agent with no echo of the original question
118-
text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/gpt-4.1
119-
- name: Refresh toolsets to get the latest available tools
120-
text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/gpt-4.1
121-
- name: Show agent status (MCP readiness)
122-
text: az aks agent --status
123-
- name: Run in interactive mode without a question
124-
text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment --api-key "sk-xxx"
66+
text: |-
67+
az aks agent "Check the pod status in my cluster" --model azure/gpt-4.1 --resource-group myResourceGroup --name myAKSCluster
12568
- name: Run in non-interactive batch mode
126-
text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/my-gpt4.1-deployment
69+
text: |-
70+
az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/gpt-4.1 --resource-group myResourceGroup --name myAKSCluster
12771
- name: Show detailed tool output during analysis
128-
text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/my-gpt4.1-deployment
129-
- name: Use custom configuration file
130-
text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml --model azure/my-gpt4.1-deployment
72+
text: |-
73+
az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/gpt-4.1 --resource-group myResourceGroup --name myAKSCluster
13174
- name: Run agent with no echo of the original question
132-
text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/my-gpt4.1-deployment
75+
text: |-
76+
az aks agent "What is the status of my cluster?" --no-echo-request --model azure/gpt-4.1 --resource-group myResourceGroup --name myAKSCluster
13377
- name: Refresh toolsets to get the latest available tools
134-
text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/my-gpt4.1-deployment
78+
text: |-
79+
az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/gpt-4.1 --resource-group myResourceGroup --name myAKSCluster
13580
"""
13681

13782
helps[
@@ -142,7 +87,34 @@
14287
long-summary: |-
14388
This command interactively guides you to select an LLM provider and model, validates the connection, and saves the configuration for later use.
14489
You can run this command multiple times to add or update different model configurations.
90+
parameters:
91+
- name: --name -n
92+
type: string
93+
short-summary: Name of the managed cluster.
94+
- name: --resource-group -g
95+
type: string
96+
short-summary: Name of the resource group.
97+
examples:
98+
- name: Initialize and deploy AKS agent to a cluster
99+
text: |-
100+
az aks agent-init --resource-group myResourceGroup --name myAKSCluster
101+
"""
102+
103+
helps[
104+
"aks agent-cleanup"
105+
] = """
106+
type: command
107+
short-summary: Cleanup and uninstall AKS agent from the cluster.
108+
long-summary: |-
109+
This command removes the AKS agent and deletes all associated resources from the cluster.
110+
parameters:
111+
- name: --name -n
112+
type: string
113+
short-summary: Name of the managed cluster.
114+
- name: --resource-group -g
115+
type: string
116+
short-summary: Name of the resource group.
145117
examples:
146-
- name: Initialize configuration for Azure OpenAI, OpenAI or other llms
147-
text: az aks agent-init
118+
- name: Cleanup and uninstall AKS agent from the cluster
119+
text: az aks agent-cleanup --resource-group myResourceGroup --name myAKSCluster
148120
"""

src/aks-agent/azext_aks_agent/_params.py

Lines changed: 22 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,6 @@
44
# --------------------------------------------------------------------------------------------
55

66
# pylint: disable=too-many-statements,too-many-lines
7-
import os.path
8-
9-
from azext_aks_agent._consts import CONST_AGENT_CONFIG_FILE_NAME
10-
from azext_aks_agent._validators import validate_agent_config_file
11-
from azure.cli.core.api import get_config_dir
12-
from azure.cli.core.commands.parameters import get_three_state_flag
13-
14-
157
def load_arguments(self, _):
168
with self.argument_context("aks agent") as c:
179
c.positional(
@@ -24,13 +16,11 @@ def load_arguments(self, _):
2416
"resource_group_name",
2517
options_list=["--resource-group", "-g"],
2618
help="Name of resource group.",
27-
required=False,
2819
)
2920
c.argument(
30-
"name",
21+
"cluster_name",
3122
options_list=["--name", "-n"],
3223
help="Name of the managed cluster.",
33-
required=False,
3424
)
3525
c.argument(
3626
"max_steps",
@@ -39,25 +29,12 @@ def load_arguments(self, _):
3929
required=False,
4030
help="Maximum number of steps the LLM can take to investigate the issue.",
4131
)
42-
c.argument(
43-
"config_file",
44-
default=os.path.join(get_config_dir(), CONST_AGENT_CONFIG_FILE_NAME),
45-
validator=validate_agent_config_file,
46-
required=False,
47-
help="Path to the config file.",
48-
)
4932
c.argument(
5033
"model",
5134
help=" Specify the LLM provider and model or deployment to use for the AI assistant.",
5235
required=False,
5336
type=str,
5437
)
55-
c.argument(
56-
"api_key",
57-
help="API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY)",
58-
required=False,
59-
type=str,
60-
)
6138
c.argument(
6239
"no_interactive",
6340
help="Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.",
@@ -84,14 +61,26 @@ def load_arguments(self, _):
8461
action="store_true",
8562
help="Show AKS agent configuration and status information.",
8663
)
64+
65+
with self.argument_context("aks agent-init") as c:
8766
c.argument(
88-
"use_aks_mcp",
89-
options_list=["--aks-mcp"],
90-
default=False,
91-
arg_type=get_three_state_flag(),
92-
help=(
93-
"Enable AKS MCP integration for enhanced capabilities. "
94-
"Traditional mode is the default. Use --aks-mcp to enable MCP mode, or "
95-
"--no-aks-mcp to explicitly disable it."
96-
),
67+
"resource_group_name",
68+
options_list=["--resource-group", "-g"],
69+
help="Name of resource group.",
70+
)
71+
c.argument(
72+
"cluster_name",
73+
options_list=["--name", "-n"],
74+
help="Name of the managed cluster.",
75+
)
76+
with self.argument_context("aks agent-cleanup") as c:
77+
c.argument(
78+
"resource_group_name",
79+
options_list=["--resource-group", "-g"],
80+
help="Name of resource group.",
81+
)
82+
c.argument(
83+
"cluster_name",
84+
options_list=["--name", "-n"],
85+
help="Name of the managed cluster.",
9786
)

0 commit comments

Comments
 (0)