Skip to content

Commit 455e0ce

Browse files
authored
{AKS} Add interactive AI-powered debugging tool az aks agent (#9059)
1 parent aada7db commit 455e0ce

File tree

11 files changed

+722
-226
lines changed

11 files changed

+722
-226
lines changed

src/aks-preview/HISTORY.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ To release a new version, please select a new version number (usually plus 1 to
1212
Pending
1313
+++++++
1414

15+
18.0.0b28
16+
+++++++
17+
* Add interactive AI-powered debugging tool `az aks agent`.
18+
1519
18.0.0b27
1620
+++++++
1721
* Add framework for interactive AI-powered debugging tool.

src/aks-preview/azext_aks_preview/_consts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,4 +378,4 @@
378378
CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY = "HOLMES_CONFIGPATH_DIR"
379379
CONST_AGENT_NAME = "AKS AGENT"
380380
CONST_AGENT_NAME_ENV_KEY = "AGENT_NAME"
381-
CONST_AGENT_CONFIG_FILE_NAME = "aksAgent.config"
381+
CONST_AGENT_CONFIG_FILE_NAME = "aksAgent.yaml"

src/aks-preview/azext_aks_preview/_help.py

Lines changed: 95 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -3953,100 +3953,98 @@
39533953
short-summary: Name of the identity binding to show.
39543954
"""
39553955

3956-
# pylint: disable=line-too-long
3957-
# helps[
3958-
# "aks agent"
3959-
# ] = """
3960-
# type: command
3961-
# short-summary: Run AI assistant to analyze and troubleshoot Kubernetes clusters.
3962-
# long-summary: |-
3963-
# This command allows you to ask questions about your Azure Kubernetes cluster and get answers using AI models.
3964-
# Environment variables must be set to use the AI model, please refer to https://docs.litellm.ai/docs/providers to learn more about supported AI providers and models and required environment variables.
3965-
# parameters:
3966-
# - name: --name -n
3967-
# type: string
3968-
# short-summary: Name of the managed cluster.
3969-
# - name: --resource-group -g
3970-
# type: string
3971-
# short-summary: Name of the resource group.
3972-
# - name: --model
3973-
# type: string
3974-
# short-summary: Model to use for the LLM.
3975-
# - name: --api-key
3976-
# type: string
3977-
# short-summary: API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY).
3978-
# - name: --config-file
3979-
# type: string
3980-
# short-summary: Path to configuration file.
3981-
# - name: --max-steps
3982-
# type: int
3983-
# short-summary: Maximum number of steps the LLM can take to investigate the issue.
3984-
# - name: --no-interactive
3985-
# type: bool
3986-
# short-summary: Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.
3987-
# - name: --no-echo-request
3988-
# type: bool
3989-
# short-summary: Disable echoing back the question provided to AKS Agent in the output.
3990-
# - name: --show-tool-output
3991-
# type: bool
3992-
# short-summary: Show the output of each tool that was called during the analysis.
3993-
# - name: --refresh-toolsets
3994-
# type: bool
3995-
# short-summary: Refresh the toolsets status.
3996-
#
3997-
# examples:
3998-
# - name: Ask about pod issues in the cluster with Azure OpenAI
3999-
# text: |-
4000-
# export AZURE_API_BASE="https://my-azureopenai-service.openai.azure.com/"
4001-
# export AZURE_API_VERSION="2025-01-01-preview"
4002-
# export AZURE_API_KEY="sk-xxx"
4003-
# az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment
4004-
# - name: Ask about pod issues in the cluster with OpenAI
4005-
# text: |-
4006-
# export OPENAI_API_KEY="sk-xxx"
4007-
# az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model gpt-4o
4008-
# text: az aks agent "Why are my pods not starting?"
4009-
# - name: Run in interactive mode without a question
4010-
# text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment --api-key "sk-xxx"
4011-
# - name: Run in non-interactive batch mode
4012-
# text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/my-gpt4.1-deployment
4013-
# - name: Show detailed tool output during analysis
4014-
# text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/my-gpt4.1-deployment
4015-
# - name: Use custom configuration file
4016-
# text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.config --model azure/my-gpt4.1-deployment
4017-
# - name: Run agent with no echo of the original question
4018-
# text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/my-gpt4.1-deployment
4019-
# - name: Refresh toolsets to get the latest available tools
4020-
# text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/my-gpt4.1-deploymen
4021-
# - name: Run agent with config file
4022-
# text: |
4023-
# az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.config
4024-
# Here is an example of config file:
4025-
# ```json
4026-
# model: "gpt-4o"
4027-
# api_key: "..."
4028-
# # define a list of mcp servers, mcp server can be defined
4029-
# mcp_servers:
4030-
# aks_mcp:
4031-
# description: "The AKS-MCP is a Model Context Protocol (MCP) server that enables AI assistants to interact with Azure Kubernetes Service (AKS) clusters"
4032-
# url: "http://localhost:8003/sse"
4033-
#
4034-
# # try adding your own tools or toggle the built-in toolsets here
4035-
# # e.g. query company-specific data, fetch logs from your existing observability tools, etc
4036-
# # To check how to add a customized toolset, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/custom_toolsets.html#custom-toolsets
4037-
# # To find all built-in toolsets, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/builtin_toolsets.html
4038-
# toolsets:
4039-
# # add a new json processor toolset
4040-
# json_processor:
4041-
# description: "A toolset for processing JSON data using jq"
4042-
# prerequisites:
4043-
# - command: "jq --version" # Ensure jq is installed
4044-
# tools:
4045-
# - name: "process_json"
4046-
# description: "A tool that uses jq to process JSON input"
4047-
# command: "echo '{{ json_input }}' | jq '.'" # Example jq command to format JSON
4048-
# # disable a built-in toolsets
4049-
# aks/core:
4050-
# enabled: false
4051-
# ```
4052-
# """
3956+
helps[
3957+
"aks agent"
3958+
] = """
3959+
type: command
3960+
short-summary: Run AI assistant to analyze and troubleshoot Kubernetes clusters.
3961+
long-summary: |-
3962+
This command allows you to ask questions about your Azure Kubernetes cluster and get answers using AI models.
3963+
Environment variables must be set to use the AI model, please refer to https://docs.litellm.ai/docs/providers to learn more about supported AI providers and models and required environment variables.
3964+
parameters:
3965+
- name: --name -n
3966+
type: string
3967+
short-summary: Name of the managed cluster.
3968+
- name: --resource-group -g
3969+
type: string
3970+
short-summary: Name of the resource group.
3971+
- name: --model
3972+
type: string
3973+
short-summary: Model to use for the LLM.
3974+
- name: --api-key
3975+
type: string
3976+
short-summary: API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY).
3977+
- name: --config-file
3978+
type: string
3979+
short-summary: Path to configuration file.
3980+
- name: --max-steps
3981+
type: int
3982+
short-summary: Maximum number of steps the LLM can take to investigate the issue.
3983+
- name: --no-interactive
3984+
type: bool
3985+
short-summary: Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.
3986+
- name: --no-echo-request
3987+
type: bool
3988+
short-summary: Disable echoing back the question provided to AKS Agent in the output.
3989+
- name: --show-tool-output
3990+
type: bool
3991+
short-summary: Show the output of each tool that was called during the analysis.
3992+
- name: --refresh-toolsets
3993+
type: bool
3994+
short-summary: Refresh the toolsets status.
3995+
3996+
examples:
3997+
- name: Ask about pod issues in the cluster with Azure OpenAI
3998+
text: |-
3999+
export AZURE_API_BASE="https://my-azureopenai-service.openai.azure.com/"
4000+
export AZURE_API_VERSION="2025-01-01-preview"
4001+
export AZURE_API_KEY="sk-xxx"
4002+
az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment
4003+
- name: Ask about pod issues in the cluster with OpenAI
4004+
text: |-
4005+
export OPENAI_API_KEY="sk-xxx"
4006+
az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model gpt-4o
4007+
- name: Run in interactive mode with an initial question and an API key passed on the command line
4008+
text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment --api-key "sk-xxx"
4009+
- name: Run in non-interactive batch mode
4010+
text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/my-gpt4.1-deployment
4011+
- name: Show detailed tool output during analysis
4012+
text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/my-gpt4.1-deployment
4013+
- name: Use custom configuration file
4014+
text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml --model azure/my-gpt4.1-deployment
4015+
- name: Run agent with no echo of the original question
4016+
text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/my-gpt4.1-deployment
4017+
- name: Refresh toolsets to get the latest available tools
4018+
text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/my-gpt4.1-deployment
4019+
- name: Run agent with config file
4020+
text: |
4021+
az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.yaml
4022+
Here is an example of config file:
4023+
```yaml
4024+
model: "gpt-4o"
4025+
api_key: "..."
4026+
# define a list of mcp servers, mcp server can be defined
4027+
mcp_servers:
4028+
aks_mcp:
4029+
description: "The AKS-MCP is a Model Context Protocol (MCP) server that enables AI assistants to interact with Azure Kubernetes Service (AKS) clusters"
4030+
url: "http://localhost:8003/sse"
4031+
4032+
# try adding your own tools or toggle the built-in toolsets here
4033+
# e.g. query company-specific data, fetch logs from your existing observability tools, etc
4034+
# To check how to add a customized toolset, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/custom_toolsets.html#custom-toolsets
4035+
# To find all built-in toolsets, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/builtin_toolsets.html
4036+
toolsets:
4037+
# add a new json processor toolset
4038+
json_processor:
4039+
description: "A toolset for processing JSON data using jq"
4040+
prerequisites:
4041+
- command: "jq --version" # Ensure jq is installed
4042+
tools:
4043+
- name: "process_json"
4044+
description: "A tool that uses jq to process JSON input"
4045+
command: "echo '{{ json_input }}' | jq '.'" # Example jq command to format JSON
4046+
# disable a built-in toolset
4047+
aks/core:
4048+
enabled: false
4049+
```
4050+
"""

src/aks-preview/azext_aks_preview/_params.py

Lines changed: 67 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
validate_nat_gateway_idle_timeout,
2424
validate_nat_gateway_managed_outbound_ip_count,
2525
)
26-
# from azure.cli.core.api import get_config_dir
26+
from azure.cli.core.api import get_config_dir
2727
from azure.cli.core.commands.parameters import (
2828
edge_zone_type,
2929
file_type,
@@ -150,7 +150,8 @@
150150
CONST_ADVANCED_NETWORKPOLICIES_FQDN,
151151
CONST_ADVANCED_NETWORKPOLICIES_L7,
152152
CONST_TRANSIT_ENCRYPTION_TYPE_NONE,
153-
CONST_TRANSIT_ENCRYPTION_TYPE_WIREGUARD
153+
CONST_TRANSIT_ENCRYPTION_TYPE_WIREGUARD,
154+
CONST_AGENT_CONFIG_FILE_NAME,
154155
)
155156

156157
from azext_aks_preview._validators import (
@@ -224,7 +225,7 @@
224225
validate_max_blocked_nodes,
225226
validate_resource_group_parameter,
226227
validate_location_resource_group_cluster_parameters,
227-
# validate_agent_config_file,
228+
validate_agent_config_file,
228229
)
229230
from azext_aks_preview.azurecontainerstorage._consts import (
230231
CONST_ACSTOR_ALL,
@@ -2780,70 +2781,69 @@ def load_arguments(self, _):
27802781
action="store_true",
27812782
)
27822783

2783-
# pylint: disable=line-too-long
2784-
# with self.argument_context("aks agent") as c:
2785-
# c.positional(
2786-
# "prompt",
2787-
# help="Ask any question and answer using available tools.",
2788-
# )
2789-
# c.argument(
2790-
# "resource_group_name",
2791-
# options_list=["--resource-group", "-g"],
2792-
# help="Name of resource group.",
2793-
# required=False,
2794-
# )
2795-
# c.argument(
2796-
# "name",
2797-
# options_list=["--name", "-n"],
2798-
# help="Name of the managed cluster.",
2799-
# required=False,
2800-
# )
2801-
# c.argument(
2802-
# "max_steps",
2803-
# type=int,
2804-
# default=10,
2805-
# required=False,
2806-
# help="Maximum number of steps the LLM can take to investigate the issue.",
2807-
# )
2808-
# c.argument(
2809-
# "config_file",
2810-
# default=os.path.join(get_config_dir(), "aksAgent.config"),
2811-
# validator=validate_agent_config_file,
2812-
# required=False,
2813-
# help="Path to the config file.",
2814-
# )
2815-
# c.argument(
2816-
# "model",
2817-
# help="The model to use for the LLM.",
2818-
# required=False,
2819-
# type=str,
2820-
# )
2821-
# c.argument(
2822-
# "api-key",
2823-
# help="API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY)",
2824-
# required=False,
2825-
# type=str,
2826-
# )
2827-
# c.argument(
2828-
# "no_interactive",
2829-
# help="Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.",
2830-
# action="store_true",
2831-
# )
2832-
# c.argument(
2833-
# "no_echo_request",
2834-
# help="Disable echoing back the question provided to AKS Agent in the output.",
2835-
# action="store_true",
2836-
# )
2837-
# c.argument(
2838-
# "show_tool_output",
2839-
# help="Show the output of each tool that was called.",
2840-
# action="store_true",
2841-
# )
2842-
# c.argument(
2843-
# "refresh_toolsets",
2844-
# help="Refresh the toolsets status.",
2845-
# action="store_true",
2846-
# )
2784+
with self.argument_context("aks agent") as c:
2785+
c.positional(
2786+
"prompt",
2787+
help="Ask any question and answer using available tools.",
2788+
)
2789+
c.argument(
2790+
"resource_group_name",
2791+
options_list=["--resource-group", "-g"],
2792+
help="Name of resource group.",
2793+
required=False,
2794+
)
2795+
c.argument(
2796+
"name",
2797+
options_list=["--name", "-n"],
2798+
help="Name of the managed cluster.",
2799+
required=False,
2800+
)
2801+
c.argument(
2802+
"max_steps",
2803+
type=int,
2804+
default=10,
2805+
required=False,
2806+
help="Maximum number of steps the LLM can take to investigate the issue.",
2807+
)
2808+
c.argument(
2809+
"config_file",
2810+
default=os.path.join(get_config_dir(), CONST_AGENT_CONFIG_FILE_NAME),
2811+
validator=validate_agent_config_file,
2812+
required=False,
2813+
help="Path to the config file.",
2814+
)
2815+
c.argument(
2816+
"model",
2817+
help="The model to use for the LLM.",
2818+
required=False,
2819+
type=str,
2820+
)
2821+
c.argument(
2822+
"api-key",
2823+
help="API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY)",
2824+
required=False,
2825+
type=str,
2826+
)
2827+
c.argument(
2828+
"no_interactive",
2829+
help="Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.",
2830+
action="store_true",
2831+
)
2832+
c.argument(
2833+
"no_echo_request",
2834+
help="Disable echoing back the question provided to AKS Agent in the output.",
2835+
action="store_true",
2836+
)
2837+
c.argument(
2838+
"show_tool_output",
2839+
help="Show the output of each tool that was called.",
2840+
action="store_true",
2841+
)
2842+
c.argument(
2843+
"refresh_toolsets",
2844+
help="Refresh the toolsets status.",
2845+
action="store_true",
2846+
)
28472847

28482848

28492849
def _get_default_install_location(exe_name):

0 commit comments

Comments
 (0)