6 changes: 6 additions & 0 deletions src/aks-agent/HISTORY.rst
@@ -11,6 +11,12 @@ To release a new version, please select a new version number (usually plus 1 to

Pending
+++++++

1.0.0b8
+++++++
* Error handling: don't raise a traceback for the init prompt and holmesgpt interaction.
* Improve the aks agent-init user experience.
* Improve error handling for user interaction with holmesgpt.
* Fix a stdin read hang in CI/CD pipelines by using select with a timeout in non-interactive mode.
* Update pytest marker registration and fix the datetime.utcnow() deprecation warning in tests.
* Improve the test framework with real-time stderr output visibility and subprocess timeouts.
92 changes: 92 additions & 0 deletions src/aks-agent/README.rst
@@ -37,6 +37,98 @@ For more details about supported model providers and required
variables, see: https://docs.litellm.ai/docs/providers


LLM Configuration Explained
---------------------------

The AKS Agent uses YAML configuration files to define LLM connections. Each configuration contains a provider specification and the required environment variables for that provider.

Configuration Structure
^^^^^^^^^^^^^^^^^^^^^^^^

.. code-block:: yaml

   llms:
     - provider: azure
       MODEL_NAME: gpt-4.1
       AZURE_API_KEY: *******
       AZURE_API_BASE: https://{azure-openai-service}.openai.azure.com/
       AZURE_API_VERSION: 2025-04-01-preview

Field Explanations
^^^^^^^^^^^^^^^^^^

**provider**
The LiteLLM provider route that determines which LLM service to use. This follows the LiteLLM provider specification from https://docs.litellm.ai/docs/providers.

Common values:

* ``azure`` - Azure OpenAI Service
* ``openai`` - OpenAI API
* ``anthropic`` - Anthropic Claude
* ``gemini`` - Google's Gemini
* ``openai_compatible`` - OpenAI-compatible APIs (e.g., local models, other services)

**MODEL_NAME**
The specific model or deployment name to use. This varies by provider:

* For Azure OpenAI: Your deployment name (e.g., ``gpt-4.1``, ``gpt-35-turbo``)
* For OpenAI: Model name (e.g., ``gpt-4``, ``gpt-3.5-turbo``)
* For other providers: Check the specific model names in LiteLLM documentation

**Environment Variables by Provider**

The remaining fields are environment variables required by each provider. These correspond to the authentication and configuration requirements of each LLM service:

**Azure OpenAI (provider: azure)**
* ``AZURE_API_KEY`` - Your Azure OpenAI API key
* ``AZURE_API_BASE`` - Your Azure OpenAI endpoint URL (e.g., https://your-resource.openai.azure.com/)
* ``AZURE_API_VERSION`` - API version (e.g., 2024-02-01, 2025-04-01-preview)

**OpenAI (provider: openai)**
* ``OPENAI_API_KEY`` - Your OpenAI API key (starts with sk-)

**Gemini (provider: gemini)**
* ``GOOGLE_API_KEY`` - Your Google Cloud API key
* ``GOOGLE_API_ENDPOINT`` - Base URL for the Gemini API endpoint

**Anthropic (provider: anthropic)**
* ``ANTHROPIC_API_KEY`` - Your Anthropic API key

**OpenAI Compatible (provider: openai_compatible)**
* ``OPENAI_API_BASE`` - Base URL for the API endpoint
* ``OPENAI_API_KEY`` - API key (if required by the service)
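
For example, a minimal ``openai_compatible`` entry might look like the sketch below. The endpoint URL, model name, and key are illustrative placeholders for a self-hosted OpenAI-compatible service, not values defined by this extension:

.. code-block:: yaml

   llms:
     - provider: openai_compatible
       MODEL_NAME: llama-3-8b-instruct
       OPENAI_API_BASE: http://localhost:8000/v1
       OPENAI_API_KEY: dummy-key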

Multiple Model Configuration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

You can configure multiple models in a single file:

.. code-block:: yaml

   llms:
     - provider: azure
       MODEL_NAME: gpt-4
       AZURE_API_KEY: your-azure-key
       AZURE_API_BASE: https://your-azure-endpoint.openai.azure.com/
       AZURE_API_VERSION: 2024-02-01
     - provider: openai
       MODEL_NAME: gpt-4
       OPENAI_API_KEY: your-openai-key
     - provider: anthropic
       MODEL_NAME: claude-3-sonnet-20240229
       ANTHROPIC_API_KEY: your-anthropic-key

When using ``--model``, specify the provider and model as ``provider/model_name`` (e.g., ``azure/gpt-4``, ``openai/gpt-4``).

Security Note
^^^^^^^^^^^^^

API keys and credentials in configuration files should be kept secure. Consider using:

* Restricted file permissions (``chmod 600 config.yaml``)
* Environment variable substitution where supported (see the sketch after this list)
* Separate configuration files for different environments (dev/prod)
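
For the substitution option, here is a minimal sketch. It assumes your own deployment tooling (for example, envsubst or a templating step) expands ``${VAR}`` references before the agent reads the file; the agent itself is not guaranteed to perform this expansion:

.. code-block:: yaml

   llms:
     - provider: azure
       MODEL_NAME: gpt-4.1
       # Expanded by your tooling before use; the raw secret is never committed.
       AZURE_API_KEY: ${AZURE_API_KEY}
       AZURE_API_BASE: https://your-resource.openai.azure.com/
       AZURE_API_VERSION: 2024-02-01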

Quick start and examples
=========================

23 changes: 22 additions & 1 deletion src/aks-agent/azext_aks_agent/__init__.py
@@ -4,10 +4,20 @@
# --------------------------------------------------------------------------------------------


from azure.cli.core import AzCommandsLoader
import os

# pylint: disable=unused-import
import azext_aks_agent._help
from azext_aks_agent._consts import (
    CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY,
    CONST_AGENT_NAME,
    CONST_AGENT_NAME_ENV_KEY,
    CONST_DISABLE_PROMETHEUS_TOOLSET_ENV_KEY,
    CONST_PRIVACY_NOTICE_BANNER,
    CONST_PRIVACY_NOTICE_BANNER_ENV_KEY,
)
from azure.cli.core import AzCommandsLoader
from azure.cli.core.api import get_config_dir


class ContainerServiceCommandsLoader(AzCommandsLoader):
@@ -34,3 +44,14 @@ def load_arguments(self, command):


COMMAND_LOADER_CLS = ContainerServiceCommandsLoader


# NOTE(mainred): holmesgpt leverages the environment variables to customize its behavior.
def customize_holmesgpt():
    os.environ[CONST_DISABLE_PROMETHEUS_TOOLSET_ENV_KEY] = "true"
    os.environ[CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY] = get_config_dir()
    os.environ[CONST_AGENT_NAME_ENV_KEY] = CONST_AGENT_NAME
    os.environ[CONST_PRIVACY_NOTICE_BANNER_ENV_KEY] = CONST_PRIVACY_NOTICE_BANNER


customize_holmesgpt()
10 changes: 5 additions & 5 deletions src/aks-agent/azext_aks_agent/_help.py
@@ -8,7 +8,6 @@

from knack.help_files import helps


helps[
"aks agent"
] = """
@@ -78,10 +77,11 @@
Here is an example of the config file:
```yaml
llms:
- provider: "azure"
MODEL_NAME: "gpt-4.1"
AZURE_API_BASE: "https://<your-base-url>"
AZURE_API_KEY: "<your-api-key>"
- provider: azure
MODEL_NAME: gpt-4.1
AZURE_API_KEY: *******
AZURE_API_BASE: https://{azure-openai-service-name}.openai.azure.com/
AZURE_API_VERSION: 2025-04-01-preview
# define a list of mcp servers, mcp server can be defined
mcp_servers:
aks_mcp:
170 changes: 68 additions & 102 deletions src/aks-agent/azext_aks_agent/agent/agent.py
@@ -3,20 +3,13 @@
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

import logging
import os
import select
import sys

from azext_aks_agent._consts import (
    CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY,
    CONST_AGENT_NAME,
    CONST_AGENT_NAME_ENV_KEY,
    CONST_DISABLE_PROMETHEUS_TOOLSET_ENV_KEY,
    CONST_PRIVACY_NOTICE_BANNER,
    CONST_PRIVACY_NOTICE_BANNER_ENV_KEY,
)
from azext_aks_agent.agent.logging import init_log
from azure.cli.core.api import get_config_dir
from azure.cli.core.azclierror import CLIInternalError
from azure.cli.core.commands.client_factory import get_subscription_id
from knack.util import CLIError

@@ -25,34 +18,6 @@
from .telemetry import CLITelemetryClient


# NOTE(mainred): environment variables to disable prometheus toolset loading should be set before importing holmes.
def customize_holmesgpt():
    os.environ[CONST_DISABLE_PROMETHEUS_TOOLSET_ENV_KEY] = "true"
    os.environ[CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY] = get_config_dir()
    os.environ[CONST_AGENT_NAME_ENV_KEY] = CONST_AGENT_NAME
    os.environ[CONST_PRIVACY_NOTICE_BANNER_ENV_KEY] = CONST_PRIVACY_NOTICE_BANNER


# NOTE(mainred): holmes leverages the log handler RichHandler to provide colorful, readable and well-formatted
# logs, making the interactive mode more user-friendly.
# We removed existing log handlers to avoid duplicate logs.
# Also, to keep the console log consistent, we remove the telemetry and data loggers to skip redundant logs.
def init_log():
    # NOTE(mainred): we need to disable INFO logs from LiteLLM before the LiteLLM library is loaded, to avoid
    # logging the debug output emitted while LiteLLM loads.
    logging.getLogger("LiteLLM").setLevel(logging.WARNING)
    logging.getLogger("telemetry.main").setLevel(logging.WARNING)
    logging.getLogger("telemetry.process").setLevel(logging.WARNING)
    logging.getLogger("telemetry.save").setLevel(logging.WARNING)
    logging.getLogger("telemetry.client").setLevel(logging.WARNING)
    logging.getLogger("az_command_data_logger").setLevel(logging.WARNING)

    from holmes.utils.console.logging import init_logging

    # TODO: make log verbosity configurable; currently disabled by passing [].
    return init_logging([])


def _get_mode_state_file() -> str:
    """Get the path to the mode state file."""
    config_dir = get_config_dir()
@@ -168,8 +133,6 @@ def aks_agent(
raise CLIError(
"Please upgrade the python version to 3.10 or above to use aks agent."
)
# customizing holmesgpt should be called before importing holmes
customize_holmesgpt()

# Initialize variables
interactive = not no_interactive
Expand Down Expand Up @@ -213,85 +176,88 @@ def aks_agent(
# MCP Lifecycle Manager
mcp_lifecycle = MCPLifecycleManager()

try:
config = None
config = None

if use_aks_mcp:
try:
config_params = {
'config_file': config_file,
'model': model,
'api_key': api_key,
'max_steps': max_steps,
'verbose': show_tool_output
}
mcp_info = mcp_lifecycle.setup_mcp_sync(config_params)
config = mcp_info['config']

if show_tool_output:
from .user_feedback import ProgressReporter
ProgressReporter.show_status_message("MCP mode active - enhanced capabilities enabled", "info")

except Exception as e: # pylint: disable=broad-exception-caught
# Fallback to traditional mode on any MCP setup failure
from .error_handler import AgentErrorHandler
mcp_error = AgentErrorHandler.handle_mcp_setup_error(e, "MCP initialization")
if show_tool_output:
console.print(f"[yellow]MCP setup failed, using traditional mode: {mcp_error.message}[/yellow]")
if mcp_error.suggestions:
console.print("[dim]Suggestions for next time:[/dim]")
for suggestion in mcp_error.suggestions[:3]: # Show only first 3 suggestions
console.print(f"[dim] • {suggestion}[/dim]")
use_aks_mcp = False
current_mode = "traditional"

# Fallback to traditional mode if MCP setup failed or was disabled
if not config:
config = _setup_traditional_mode_sync(config_file, model, api_key, max_steps, show_tool_output)
if show_tool_output:
console.print("[yellow]Traditional mode active (MCP disabled)[/yellow]")
if use_aks_mcp:
try:
config_params = {
'config_file': config_file,
'model': model,
'api_key': api_key,
'max_steps': max_steps,
'verbose': show_tool_output
}
mcp_info = mcp_lifecycle.setup_mcp_sync(config_params)
config = mcp_info['config']

# Save the current mode to state file for next run
_save_current_mode(current_mode)
if show_tool_output:
from .user_feedback import ProgressReporter
ProgressReporter.show_status_message("MCP mode active - enhanced capabilities enabled", "info")

# Use smart refresh logic
effective_refresh_toolsets = smart_refresh
except Exception as e: # pylint: disable=broad-exception-caught
# Fallback to traditional mode on any MCP setup failure
from .error_handler import AgentErrorHandler
mcp_error = AgentErrorHandler.handle_mcp_setup_error(e, "MCP initialization")
if show_tool_output:
console.print(f"[yellow]MCP setup failed, using traditional mode: {mcp_error.message}[/yellow]")
if mcp_error.suggestions:
console.print("[dim]Suggestions for next time:[/dim]")
for suggestion in mcp_error.suggestions[:3]: # Show only first 3 suggestions
console.print(f"[dim] • {suggestion}[/dim]")
use_aks_mcp = False
current_mode = "traditional"

# Fallback to traditional mode if MCP setup failed or was disabled
if not config:
config = _setup_traditional_mode_sync(config_file, model, api_key, max_steps, show_tool_output)
if show_tool_output:
from .user_feedback import ProgressReporter
ProgressReporter.show_status_message(
f"Toolset refresh: {effective_refresh_toolsets} (Mode: {current_mode})", "info"
)
console.print("[yellow]Traditional mode active (MCP disabled)[/yellow]")

# Create AI client once with proper refresh settings
# Save the current mode to state file for next run
_save_current_mode(current_mode)

# Use smart refresh logic
effective_refresh_toolsets = smart_refresh
if show_tool_output:
from .user_feedback import ProgressReporter
ProgressReporter.show_status_message(
f"Toolset refresh: {effective_refresh_toolsets} (Mode: {current_mode})", "info"
)

# Validate inputs
if not prompt and not interactive and not piped_data:
raise CLIError(
"Either the 'prompt' argument must be provided (unless using --interactive mode)."
)
try:
# prepare the toolsets
ai = config.create_console_toolcalling_llm(
dal=None,
refresh_toolsets=effective_refresh_toolsets,
)
except Exception as e:
raise CLIError(f"Failed to create AI executor: {str(e)}")

# Handle piped data
if piped_data:
if prompt:
# User provided both piped data and a prompt
prompt = f"Here's some piped output:\n\n{piped_data}\n\n{prompt}"
else:
# Only piped data, no prompt - ask what to do with it
prompt = f"Here's some piped output:\n\n{piped_data}\n\nWhat can you tell me about this output?"

# Validate inputs
if not prompt and not interactive and not piped_data:
raise CLIError(
"Either the 'prompt' argument must be provided (unless using --interactive mode)."
)

# Handle piped data
if piped_data:
if prompt:
# User provided both piped data and a prompt
prompt = f"Here's some piped output:\n\n{piped_data}\n\n{prompt}"
else:
# Only piped data, no prompt - ask what to do with it
prompt = f"Here's some piped output:\n\n{piped_data}\n\nWhat can you tell me about this output?"

# Phase 2: Holmes Execution (synchronous - no event loop conflicts)
is_mcp_mode = current_mode == "mcp"
# Phase 2: Holmes Execution (synchronous - no event loop conflicts)
is_mcp_mode = current_mode == "mcp"
try:
if interactive:
_run_interactive_mode_sync(ai, cmd, resource_group_name, name,
prompt, console, show_tool_output, is_mcp_mode, telemetry)
else:
_run_noninteractive_mode_sync(ai, config, cmd, resource_group_name, name,
prompt, console, echo, show_tool_output, is_mcp_mode)

except Exception as e: # pylint: disable=broad-exception-caught
raise CLIInternalError(f"Error occurred during execution: {str(e)}")
finally:
# Phase 3: MCP Cleanup (isolated async if needed)
mcp_lifecycle.cleanup_mcp_sync()