enable aks agent command

mainred · mainred · commit 0f0fbd4d8038 · 2025-08-15T08:00:18.000Z
diff --git a/src/aks-preview/HISTORY.rst b/src/aks-preview/HISTORY.rst
@@ -13,6 +13,11 @@ Pending
 +++++++
 * Add framework for interactive AI-powered debugging tool.
 
+18.0.0b27
++++++++
+* Add interactive AI-powered debugging tool `az aks agent`.
+* Add framework for interactive AI-powered debugging tool.
+
 18.0.0b26
 +++++++
 * Add `az aks identity-binding` command group for identity binding feature.
diff --git a/src/aks-preview/azext_aks_preview/_help.py b/src/aks-preview/azext_aks_preview/_help.py
@@ -3944,100 +3944,99 @@
           short-summary: Name of the identity binding to show.
 """
 
-# pylint: disable=line-too-long
-# helps[
-#     "aks agent"
-# ] = """
-#     type: command
-#     short-summary: Run AI assistant to analyze and troubleshoot Kubernetes clusters.
-#     long-summary: |-
-#       This command allows you to ask questions about your Azure Kubernetes cluster and get answers using AI models.
-#       Environment variables must be set to use the AI model, please refer to https://docs.litellm.ai/docs/providers to learn more about supported AI providers and models and required environment variables.
-#     parameters:
-#         - name: --name -n
-#           type: string
-#           short-summary: Name of the managed cluster.
-#         - name: --resource-group -g
-#           type: string
-#           short-summary: Name of the resource group.
-#         - name: --model
-#           type: string
-#           short-summary: Model to use for the LLM.
-#         - name: --api-key
-#           type: string
-#           short-summary: API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY).
-#         - name: --config-file
-#           type: string
-#           short-summary: Path to configuration file.
-#         - name: --max-steps
-#           type: int
-#           short-summary: Maximum number of steps the LLM can take to investigate the issue.
-#         - name: --no-interactive
-#           type: bool
-#           short-summary: Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.
-#         - name: --no-echo-request
-#           type: bool
-#           short-summary: Disable echoing back the question provided to AKS Agent in the output.
-#         - name: --show-tool-output
-#           type: bool
-#           short-summary: Show the output of each tool that was called during the analysis.
-#         - name: --refresh-toolsets
-#           type: bool
-#           short-summary: Refresh the toolsets status.
-#
-#     examples:
-#         - name: Ask about pod issues in the cluster with Azure OpenAI
-#           text: |-
-#             export AZURE_API_BASE="https://my-azureopenai-service.openai.azure.com/"
-#             export AZURE_API_VERSION="2025-01-01-preview"
-#             export AZURE_API_KEY="sk-xxx"
-#             az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment
-#         - name: Ask about pod issues in the cluster with OpenAI
-#           text: |-
-#             export OPENAI_API_KEY="sk-xxx"
-#             az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model gpt-4o
-#           text: az aks agent "Why are my pods not starting?"
-#         - name: Run in interactive mode without a question
-#           text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment --api-key "sk-xxx"
-#         - name: Run in non-interactive batch mode
-#           text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/my-gpt4.1-deployment
-#         - name: Show detailed tool output during analysis
-#           text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/my-gpt4.1-deployment
-#         - name: Use custom configuration file
-#           text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.config --model azure/my-gpt4.1-deployment
-#         - name: Run agent with no echo of the original question
-#           text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/my-gpt4.1-deployment
-#         - name: Refresh toolsets to get the latest available tools
-#           text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/my-gpt4.1-deploymen
-#         - name: Run agent with config file
-#           text: |
-#             az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.config
-#             Here is an example of config file:
-#             ```json
-#             model: "gpt-4o"
-#             api_key: "..."
-#             # define a list of mcp servers, mcp server can be defined
-#             mcp_servers:
-#               aks_mcp:
-#                 description: "The AKS-MCP is a Model Context Protocol (MCP) server that enables AI assistants to interact with Azure Kubernetes Service (AKS) clusters"
-#                 url: "http://localhost:8003/sse"
-#
-#             # try adding your own tools or toggle the built-in toolsets here
-#             # e.g. query company-specific data, fetch logs from your existing observability tools, etc
-#             # To check how to add a customized toolset, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/custom_toolsets.html#custom-toolsets
-#             # To find all built-in toolsets, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/builtin_toolsets.html
-#             toolsets:
-#               # add a new json processor toolset
-#               json_processor:
-#                 description: "A toolset for processing JSON data using jq"
-#                 prerequisites:
-#                   - command: "jq --version"  # Ensure jq is installed
-#                 tools:
-#                   - name: "process_json"
-#                     description: "A tool that uses jq to process JSON input"
-#                     command: "echo '{{ json_input }}' | jq '.'"  # Example jq command to format JSON
-#               # disable a built-in toolsets
-#               aks/core:
-#                 enabled: false
-#               ```
-# """
+helps[
+    "aks agent"
+] = """
+    type: command
+    short-summary: Run AI assistant to analyze and troubleshoot Kubernetes clusters.
+    long-summary: |-
+      This command allows you to ask questions about your Azure Kubernetes cluster and get answers using AI models.
+      Environment variables must be set to use the AI model, please refer to https://docs.litellm.ai/docs/providers to learn more about supported AI providers and models and required environment variables.
+    parameters:
+        - name: --name -n
+          type: string
+          short-summary: Name of the managed cluster.
+        - name: --resource-group -g
+          type: string
+          short-summary: Name of the resource group.
+        - name: --model
+          type: string
+          short-summary: Model to use for the LLM.
+        - name: --api-key
+          type: string
+          short-summary: API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY).
+        - name: --config-file
+          type: string
+          short-summary: Path to configuration file.
+        - name: --max-steps
+          type: int
+          short-summary: Maximum number of steps the LLM can take to investigate the issue.
+        - name: --no-interactive
+          type: bool
+          short-summary: Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.
+        - name: --no-echo-request
+          type: bool
+          short-summary: Disable echoing back the question provided to AKS Agent in the output.
+        - name: --show-tool-output
+          type: bool
+          short-summary: Show the output of each tool that was called during the analysis.
+        - name: --refresh-toolsets
+          type: bool
+          short-summary: Refresh the toolsets status.
+
+    examples:
+        - name: Ask about pod issues in the cluster with Azure OpenAI
+          text: |-
+            export AZURE_API_BASE="https://my-azureopenai-service.openai.azure.com/"
+            export AZURE_API_VERSION="2025-01-01-preview"
+            export AZURE_API_KEY="sk-xxx"
+            az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment
+        - name: Ask about pod issues in the cluster with OpenAI
+          text: |-
+            export OPENAI_API_KEY="sk-xxx"
+            az aks agent "Why are my pods not starting?" --name MyManagedCluster --resource-group MyResourceGroup --model gpt-4o
+          text: az aks agent "Why are my pods not starting?"
+        - name: Run in interactive mode without a question
+          text: az aks agent "Check the pod status in my cluster" --name MyManagedCluster --resource-group MyResourceGroup --model azure/my-gpt4.1-deployment --api-key "sk-xxx"
+        - name: Run in non-interactive batch mode
+          text: az aks agent "Diagnose networking issues" --no-interactive --max-steps 15 --model azure/my-gpt4.1-deployment
+        - name: Show detailed tool output during analysis
+          text: az aks agent "Why is my service workload unavailable in namespace workload-ns?" --show-tool-output --model azure/my-gpt4.1-deployment
+        - name: Use custom configuration file
+          text: az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.config --model azure/my-gpt4.1-deployment
+        - name: Run agent with no echo of the original question
+          text: az aks agent "What is the status of my cluster?" --no-echo-request --model azure/my-gpt4.1-deployment
+        - name: Refresh toolsets to get the latest available tools
+          text: az aks agent "What is the status of my cluster?" --refresh-toolsets --model azure/my-gpt4.1-deployment
+        - name: Run agent with config file
+          text: |
+            az aks agent "Check kubernetes pod resource usage" --config-file /path/to/custom.config
+            Here is an example of config file:
+            ```yaml
+            model: "gpt-4o"
+            api_key: "..."
+            # define a list of mcp servers, mcp server can be defined
+            mcp_servers:
+              aks_mcp:
+                description: "The AKS-MCP is a Model Context Protocol (MCP) server that enables AI assistants to interact with Azure Kubernetes Service (AKS) clusters"
+                url: "http://localhost:8003/sse"
+
+            # try adding your own tools or toggle the built-in toolsets here
+            # e.g. query company-specific data, fetch logs from your existing observability tools, etc
+            # To check how to add a customized toolset, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/custom_toolsets.html#custom-toolsets
+            # To find all built-in toolsets, please refer to https://docs.robusta.dev/master/configuration/holmesgpt/builtin_toolsets.html
+            toolsets:
+              # add a new json processor toolset
+              json_processor:
+                description: "A toolset for processing JSON data using jq"
+                prerequisites:
+                  - command: "jq --version"  # Ensure jq is installed
+                tools:
+                  - name: "process_json"
+                    description: "A tool that uses jq to process JSON input"
+                    command: "echo '{{ json_input }}' | jq '.'"  # Example jq command to format JSON
+              # disable a built-in toolsets
+              aks/core:
+                enabled: false
+              ```
+"""
diff --git a/src/aks-preview/azext_aks_preview/_params.py b/src/aks-preview/azext_aks_preview/_params.py
@@ -23,7 +23,7 @@
     validate_nat_gateway_idle_timeout,
     validate_nat_gateway_managed_outbound_ip_count,
 )
-# from azure.cli.core.api import get_config_dir
+from azure.cli.core.api import get_config_dir
 from azure.cli.core.commands.parameters import (
     edge_zone_type,
     file_type,
@@ -224,7 +224,7 @@
     validate_max_blocked_nodes,
     validate_resource_group_parameter,
     validate_location_resource_group_cluster_parameters,
-    # validate_agent_config_file,
+    validate_agent_config_file,
 )
 from azext_aks_preview.azurecontainerstorage._consts import (
     CONST_ACSTOR_ALL,
@@ -2777,70 +2777,69 @@ def load_arguments(self, _):
             action="store_true",
         )
 
-# pylint: disable=line-too-long
-#     with self.argument_context("aks agent") as c:
-#         c.positional(
-#             "prompt",
-#             help="Ask any question and answer using available tools.",
-#         )
-#         c.argument(
-#             "resource_group_name",
-#             options_list=["--resource-group", "-g"],
-#             help="Name of resource group.",
-#             required=False,
-#         )
-#         c.argument(
-#             "name",
-#             options_list=["--name", "-n"],
-#             help="Name of the managed cluster.",
-#             required=False,
-#         )
-#         c.argument(
-#             "max_steps",
-#             type=int,
-#             default=10,
-#             required=False,
-#             help="Maximum number of steps the LLM can take to investigate the issue.",
-#         )
-#         c.argument(
-#             "config_file",
-#             default=os.path.join(get_config_dir(), "aksAgent.config"),
-#             validator=validate_agent_config_file,
-#             required=False,
-#             help="Path to the config file.",
-#         )
-#         c.argument(
-#             "model",
-#             help="The model to use for the LLM.",
-#             required=False,
-#             type=str,
-#         )
-#         c.argument(
-#             "api-key",
-#             help="API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY)",
-#             required=False,
-#             type=str,
-#         )
-#         c.argument(
-#             "no_interactive",
-#             help="Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.",
-#             action="store_true",
-#         )
-#         c.argument(
-#             "no_echo_request",
-#             help="Disable echoing back the question provided to AKS Agent in the output.",
-#             action="store_true",
-#         )
-#         c.argument(
-#             "show_tool_output",
-#             help="Show the output of each tool that was called.",
-#             action="store_true",
-#         )
-#         c.argument(
-#             "refresh_toolsets",
-#             help="Refresh the toolsets status.",
-#             action="store_true",
-#         )
+    with self.argument_context("aks agent") as c:
+        c.positional(
+            "prompt",
+            help="Ask any question and answer using available tools.",
+        )
+        c.argument(
+            "resource_group_name",
+            options_list=["--resource-group", "-g"],
+            help="Name of resource group.",
+            required=False,
+        )
+        c.argument(
+            "name",
+            options_list=["--name", "-n"],
+            help="Name of the managed cluster.",
+            required=False,
+        )
+        c.argument(
+            "max_steps",
+            type=int,
+            default=10,
+            required=False,
+            help="Maximum number of steps the LLM can take to investigate the issue.",
+        )
+        c.argument(
+            "config_file",
+            default=os.path.join(get_config_dir(), "aksAgent.config"),
+            validator=validate_agent_config_file,
+            required=False,
+            help="Path to the config file.",
+        )
+        c.argument(
+            "model",
+            help="The model to use for the LLM.",
+            required=False,
+            type=str,
+        )
+        c.argument(
+            "api-key",
+            help="API key to use for the LLM (if not given, uses environment variables AZURE_API_KEY, OPENAI_API_KEY)",
+            required=False,
+            type=str,
+        )
+        c.argument(
+            "no_interactive",
+            help="Disable interactive mode. When set, the agent will not prompt for input and will run in batch mode.",
+            action="store_true",
+        )
+        c.argument(
+            "no_echo_request",
+            help="Disable echoing back the question provided to AKS Agent in the output.",
+            action="store_true",
+        )
+        c.argument(
+            "show_tool_output",
+            help="Show the output of each tool that was called.",
+            action="store_true",
+        )
+        c.argument(
+            "refresh_toolsets",
+            help="Refresh the toolsets status.",
+            action="store_true",
+        )
 
 
 def _get_default_install_location(exe_name):
diff --git a/src/aks-preview/azext_aks_preview/agent/prompt.py b/src/aks-preview/azext_aks_preview/agent/prompt.py
@@ -37,17 +37,10 @@
 1. **IMMEDIATELY STOP ALL OPERATIONS** - Do not proceed with any investigation
 2. **DO NOT ATTEMPT ANY TROUBLESHOOTING** - No kubectl commands, no Azure commands, nothing
 3. **DO NOT INFER THE RESOURCE NAME** - Do not assume any resource name, resource group, or subscription ID
-4. **ONLY display the context failure message** on separate lines:
-```
-Cluster name: <detected_or_not_found>
-Resource group: <detected_or_not_found>
-Subscription ID: <detected_or_not_found>
-
-Please provide the correct cluster context. You can either:
-1. Specify the context in this session: "Please use cluster 'my-cluster' in resource group 'my-rg' under subscription 'my-subscription'"
-2. Or restart with context: `az aks agent --name <cluster-name> --resource-group <rg-name> --subscription <subscription-id>`
-```
-**IMPORTANT**: When displaying the CLI command example above, use it EXACTLY as written with the placeholder format `<cluster-name>`, `<rg-name>`, `<subscription-id>`.
+4. **ONLY display the context failure message** exactly as follows with no extra blank lines (replace the first three placeholders with actual detected values or None):
+   - list "Cluster name", "Resource group", "Subscription ID" with detected value or None
+   - prompt the user to either provide the cluster context in the prompt including "Cluster name", "Resource group" and "Subscription ID", or
+   - restart the command specifying the cluster info in flags with examples (e.g., --name <cluster_name> --resource-group <resource_group> --subscription <subscription_id>)
 
 {% endif %}
 
diff --git a/src/aks-preview/azext_aks_preview/commands.py b/src/aks-preview/azext_aks_preview/commands.py
@@ -188,7 +188,7 @@ def load_command_table(self, _):
             "operation-abort", "aks_operation_abort", supports_no_wait=True
         )
         g.custom_command("bastion", "aks_bastion")
-        # g.custom_command("agent", "aks_agent")
+        g.custom_command("agent", "aks_agent")
 
     # AKS maintenance configuration commands
     with self.command_group(

Original file line number	Diff line number	Diff line change
`@@ -188,7 +188,7 @@ def load_command_table(self, _):`
`188`	`188`	`"operation-abort", "aks_operation_abort", supports_no_wait=True`
`189`	`189`	`)`
`190`	`190`	`g.custom_command("bastion", "aks_bastion")`
`191`		`- # g.custom_command("agent", "aks_agent")`
	`191`	`+ g.custom_command("agent", "aks_agent")`
`192`	`192`
`193`	`193`	`# AKS maintenance configuration commands`
`194`	`194`	`with self.command_group(`