Azure · yanzhudd · Aug 15, 2025 · Jul 22, 2025 · Jul 29, 2025 · Jul 29, 2025
@@ -198,6 +198,11 @@ aks update:
     cluster_service_load_balancer_health_probe_mode:
       rule_exclusions:
       - option_length_too_long
+aks agent:
+  parameters:
+    prompt:
+      # `prompt` is registered with c.positional(...) for this command
+      rule_exclusions:
+      - no_positional_parameters
 arcdata dc config init:
   parameters:
     path:

@@ -11,10 +11,11 @@ To release a new version, please select a new version number (usually plus 1 to
 
 Pending
 +++++++
+* Add framework for interactive AI-powered debugging tool.
 
 18.0.0b26
 +++++++
-* Add `az aks identity-binding` command group for identity binding feataure.
+* Add `az aks identity-binding` command group for identity binding feature.
 
 18.0.0b25
 +++++++

@@ -373,3 +373,9 @@
 CONST_K8S_EXTENSION_NAME = "k8s-extension"
 CONST_K8S_EXTENSION_ACTION_MOD_NAME = "azext_k8s_extension.action"
 CONST_K8S_EXTENSION_FORMAT_MOD_NAME = "azext_k8s_extension._format"
+
+# aks agent constants
+# NOTE(review): the two *_ENV_KEY values look like environment variable names
+# used when the agent starts -- confirm against the code that consumes them.
+CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY = "HOLMES_CONFIGPATH_DIR"
+CONST_AGENT_NAME = "AKS AGENT"
+CONST_AGENT_NAME_ENV_KEY = "AGENT_NAME"
+# File name joined with get_config_dir() to build the default --config-file
+# path (see validate_agent_config_file).
+CONST_AGENT_CONFIG_FILE_NAME = "aksAgent.config"
@@ -3943,3 +3943,101 @@
           type: string
           short-summary: Name of the identity binding to show.
 """
+
+# pylint: disable=line-too-long
+# helps[
+#     "aks agent"
+# ] = """
+#     type: command
+#     short-summary: Run AI assistant to analyze and troubleshoot Kubernetes clusters.
+#     long-summary: |-
+#       This command allows you to ask questions about your Azure Kubernetes cluster and get answers using AI models.
+#       Environment variables must be set to use the AI model, please refer to https://docs.litellm.ai/docs/providers to learn more about supported AI providers and models and required environment variables.
+#     parameters:
+#         - name: --name -n
+#           type: string
+#           short-summary: Name of the managed cluster.
+#         - name: --resource-group -g
+#           type: string
+#           short-summary: Name of the resource group.
+#         - name: --model
+#           type: string
+#           short-summary: Model to use for the LLM.
+#         - name: --api-key
+#           type: string
+#           short-summary: API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY).
+#         - name: --config-file
+#           type: string
+#           short-summary: Path to configuration file.
+#         - name: --max-steps
+#           type: int
+#           short-summary: Maximum number of steps the LLM can take to investigate the issue.
+#         - name: --no-interactive
+#           type: bool
+#           short-summary: Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.
+#         - name: --no-echo-request
+#           type: bool
+#           short-summary: Disable echoing back the question provided to AKS Agent in the output.
+#         - name: --show-tool-output
+#           type: bool
+#           short-summary: Show the output of each tool that was called during the analysis.
+#         - name: --refresh-toolsets
+#           type: bool
+#           short-summary: Refresh the toolsets status.
+#
+#     examples:
+#         - name: Ask about pod issues in the cluster with Azure OpenAI
+#           text: |-
+#             export AZURE_API_BASE="https://my-azureopenai-service.openai.azure.com/"
+#             export AZURE_API_VERSION="2025-01-01-preview"
+#             export AZURE_API_KEY="sk-xxx"
+#             az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment
+#         - name: Ask about pod issues in the cluster with OpenAI
+#           text: |-
+#             export OPENAI_API_KEY="sk-xxx"
+#             az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model gpt-4o
+#         - name: Ask about pod issues using the default config file and environment variables
+#           text: az aks agent "Why are my pods not starting?"
+#         - name: Start an interactive session with an initial question and an explicit API key
+#           text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment --api-key "sk-xxx"
+#         - name: Run in non-interactive batch mode
+#           text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/my-gpt4.1-deployment
+#         - name: Show detailed tool output during analysis
+#           text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/my-gpt4.1-deployment
+#         - name: Use custom configuration file
+#           text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.config --model azure/my-gpt4.1-deployment
+#         - name: Run agent with no echo of the original question
+#           text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/my-gpt4.1-deployment
+#         - name: Refresh toolsets to get the latest available tools
+#           text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/my-gpt4.1-deployment
+#         - name: Run agent with config file
+#           text: |
+#             az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.config
+#             Here is an example of config file:
+#             ```yaml
+#             model: "gpt-4o"
+#             api_key: "..."
+#             # define a list of mcp servers
+#             mcp_servers:
+#               aks_mcp:
+#                 description: "The AKS-MCP is a Model Context Protocol (MCP) server that enables AI assistants to interact with Azure Kubernetes Service (AKS) clusters"
+#                 url: "http://localhost:8003/sse"
+#
+#             # try adding your own tools or toggle the built-in toolsets here
+#             # e.g. query company-specific data, fetch logs from your existing observability tools, etc
+#             # To check how to add a customized toolset, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/custom_toolsets.html#custom-toolsets
+#             # To find all built-in toolsets, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/builtin_toolsets.html
+#             toolsets:
+#               # add a new json processor toolset
+#               json_processor:
+#                 description: "A toolset for processing JSON data using jq"
+#                 prerequisites:
+#                   - command: "jq --version"  # Ensure jq is installed
+#                 tools:
+#                   - name: "process_json"
+#                     description: "A tool that uses jq to process JSON input"
+#                     command: "echo '{{ json_input }}' | jq '.'"  # Example jq command to format JSON
+#               # disable a built-in toolset
+#               aks/core:
+#                 enabled: false
+#             ```
+# """
@@ -23,6 +23,7 @@
     validate_nat_gateway_idle_timeout,
     validate_nat_gateway_managed_outbound_ip_count,
 )
+# from azure.cli.core.api import get_config_dir
 from azure.cli.core.commands.parameters import (
     edge_zone_type,
     file_type,
@@ -223,6 +224,7 @@
     validate_max_blocked_nodes,
     validate_resource_group_parameter,
     validate_location_resource_group_cluster_parameters,
+    # validate_agent_config_file,
 )
 from azext_aks_preview.azurecontainerstorage._consts import (
     CONST_ACSTOR_ALL,
@@ -2775,6 +2777,71 @@ def load_arguments(self, _):
             action="store_true",
         )
 
+# pylint: disable=line-too-long
+#     with self.argument_context("aks agent") as c:
+#         c.positional(
+#             "prompt",
+#             help="Ask any question and answer using available tools.",
+#         )
+#         c.argument(
+#             "resource_group_name",
+#             options_list=["--resource-group", "-g"],
+#             help="Name of resource group.",
+#             required=False,
+#         )
+#         c.argument(
+#             "name",
+#             options_list=["--name", "-n"],
+#             help="Name of the managed cluster.",
+#             required=False,
+#         )
+#         c.argument(
+#             "max_steps",
+#             type=int,
+#             default=10,
+#             required=False,
+#             help="Maximum number of steps the LLM can take to investigate the issue.",
+#         )
+#         c.argument(
+#             "config_file",
+#             default=os.path.join(get_config_dir(), "aksAgent.config"),
+#             validator=validate_agent_config_file,
+#             required=False,
+#             help="Path to the config file.",
+#         )
+#         c.argument(
+#             "model",
+#             help="The model to use for the LLM.",
+#             required=False,
+#             type=str,
+#         )
+#         c.argument(
+#             "api_key",
+#             help="API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY)",
+#             required=False,
+#             type=str,
+#         )
+#         c.argument(
+#             "no_interactive",
+#             help="Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.",
+#             action="store_true",
+#         )
+#         c.argument(
+#             "no_echo_request",
+#             help="Disable echoing back the question provided to AKS Agent in the output.",
+#             action="store_true",
+#         )
+#         c.argument(
+#             "show_tool_output",
+#             help="Show the output of each tool that was called.",
+#             action="store_true",
+#         )
+#         c.argument(
+#             "refresh_toolsets",
+#             help="Refresh the toolsets status.",
+#             action="store_true",
+#         )
+
 
 def _get_default_install_location(exe_name):
     system = platform.system()

@@ -8,10 +8,12 @@
 import os
 import os.path
 import re
+import yaml
 from ipaddress import ip_network
 from math import isclose, isnan
 
 from azure.cli.core import keys
+from azure.cli.core.api import get_config_dir
 from azure.cli.core.azclierror import (
     ArgumentUsageError,
     InvalidArgumentValueError,
@@ -35,6 +37,7 @@
     CONST_NETWORK_POD_IP_ALLOCATION_MODE_STATIC_BLOCK,
     CONST_NODEPOOL_MODE_GATEWAY,
     CONST_AZURE_SERVICE_MESH_MAX_EGRESS_NAME_LENGTH,
+    CONST_AGENT_CONFIG_FILE_NAME,
 )
 from azext_aks_preview._helpers import _fuzzy_match
 from knack.log import get_logger
@@ -977,3 +980,38 @@ def validate_location_resource_group_cluster_parameters(namespace):
         raise MutuallyExclusiveArgumentError(
             "Cannot specify --location and --resource-group and --cluster at the same time."
         )
+
+
+def _validate_param_yaml_file(yaml_path, param_name):
+    """Check that ``yaml_path`` points to a readable, parseable YAML file.
+
+    An empty or ``None`` path is treated as "not provided" and is skipped.
+
+    :param yaml_path: path of the file to check.
+    :param param_name: option name without the leading ``--``; only used to
+        build the error messages.
+    :raises InvalidArgumentValueError: if the path does not exist, is not
+        readable, or its content cannot be read or parsed as YAML.
+    """
+    if not yaml_path:
+        return
+    if not os.path.exists(yaml_path):
+        raise InvalidArgumentValueError(
+            f"--{param_name}={yaml_path}: file is not found."
+        )
+    if not os.access(yaml_path, os.R_OK):
+        raise InvalidArgumentValueError(
+            f"--{param_name}={yaml_path}: file is not readable."
+        )
+    try:
+        # Pin UTF-8 so parsing does not depend on the platform's locale encoding.
+        with open(yaml_path, "r", encoding="utf-8") as file:
+            # Parsed only to surface syntax errors; the result is discarded.
+            yaml.safe_load(file)
+    except yaml.YAMLError as e:
+        # Chain the cause so the parser's traceback is kept for debugging.
+        raise InvalidArgumentValueError(
+            f"--{param_name}={yaml_path}: file is not a valid YAML file: {e}"
+        ) from e
+    except Exception as e:
+        raise InvalidArgumentValueError(
+            f"--{param_name}={yaml_path}: An error occurred while reading the config file: {e}"
+        ) from e
+
+
+def validate_agent_config_file(namespace):
+    """Validate the file given by ``namespace.config_file`` as YAML.
+
+    Nothing is checked when no path is set, or when the path equals the
+    default location (``get_config_dir()`` joined with
+    ``CONST_AGENT_CONFIG_FILE_NAME``) and that file does not exist.
+    Any other path is passed to ``_validate_param_yaml_file``.
+    """
+    path = namespace.config_file
+    if path:
+        default_path = os.path.join(get_config_dir(), CONST_AGENT_CONFIG_FILE_NAME)
+        # A missing file is tolerated only at the default location.
+        default_is_absent = path == default_path and not os.path.exists(path)
+        if not default_is_absent:
+            _validate_param_yaml_file(path, "config-file")
-Original file line number
+Diff line change
@@ Expand Up @@
     Pending
     +++++++
+    * Add framework for interactive AI-powered debugging tool.
 .0.0b26
     +++++++
-    * Add `az aks identity-binding` command group for identity binding feataure.
+    * Add `az aks identity-binding` command group for identity binding feature.
 .0.0b25
     +++++++
@@ Expand Down @@