#!/bin/bash
#
# Deployment Monitor Hook
# Monitors long-running deployment processes and notifies when complete.
#
# Usage:
#   deployment-monitor.sh monitor <task-id>          # follow a deployment task
#   deployment-monitor.sh notify <status> <message>  # send a one-off notification
#
# Dependencies:
#   Required: bash, wc, tail, sed, grep, cat, tr, date, sleep
#   Optional: curl (Slack notifications), osascript (macOS notifications),
#             notify-send (Linux notifications)
#
# Configuration (environment variables):
#   SLACK_WEBHOOK_URL - optional Slack webhook URL for notifications
#   MONITOR_TIMEOUT   - max seconds to wait for completion (default 7200)

set -e

HOOK_NAME="deployment-monitor"

# check_command <cmd> <required:true|false>
# Prints an ERROR and returns 1 when a required command is missing;
# a missing optional command only produces a WARNING and returns 0.
check_command() {
    local cmd=$1
    local required=$2

    if ! command -v "$cmd" &> /dev/null; then
        if [ "$required" = "true" ]; then
            echo "[$HOOK_NAME] ERROR: Required command '$cmd' is not installed"
            return 1
        else
            echo "[$HOOK_NAME] WARNING: Optional command '$cmd' is not installed"
            return 0
        fi
    fi
    return 0
}

# monitor_deployment <task-id>
# Polls /tmp/claude/-<sanitized-cwd>/tasks/<task-id>.{output,exit_code} until
# the task writes its exit code or MONITOR_TIMEOUT elapses, printing progress
# along the way. Returns 0 on success, 1 on failure, 2 on timeout.
monitor_deployment() {
    local task_id=$1

    if [ -z "$task_id" ]; then
        echo "[$HOOK_NAME] ERROR: Task ID required"
        echo "Usage: $0 monitor <task-id>"
        exit 1
    fi

    # Build task output paths dynamically based on current working directory
    # (e.g. /home/me/repo -> /tmp/claude/-home-me-repo/tasks).
    local cwd_sanitized
    cwd_sanitized=$(pwd | tr '/' '-' | sed 's/^-//')
    local task_dir="/tmp/claude/-${cwd_sanitized}/tasks"

    # Ensure task directory exists
    if [ ! -d "$task_dir" ]; then
        echo "[$HOOK_NAME] WARNING: Task directory does not exist: $task_dir"
        echo "[$HOOK_NAME] Creating directory..."
        mkdir -p "$task_dir"
    fi

    local output_file="${task_dir}/${task_id}.output"
    local exit_code_file="${task_dir}/${task_id}.exit_code"
    local start_time
    start_time=$(date +%s)

    echo "[$HOOK_NAME] Monitoring deployment task: $task_id"
    echo "[$HOOK_NAME] Started at: $(date)"
    echo ""

    # Wait for the deployment to complete
    local last_line_count=0
    local max_wait_seconds=${MONITOR_TIMEOUT:-7200} # Default 2 hours
    local exit_code
    while true; do
        # Check for timeout
        local elapsed=$(($(date +%s) - start_time))
        if [ "$elapsed" -ge "$max_wait_seconds" ]; then
            local minutes=$((max_wait_seconds / 60))
            echo ""
            echo "[$HOOK_NAME] ERROR: Maximum wait time (${minutes} minutes) reached"
            notify_completion "FAILED" "Deployment monitoring timed out after ${minutes} minutes without completion"
            return 2
        fi

        # The exit-code file appearing is the completion signal
        if [ -f "$exit_code_file" ]; then
            exit_code=$(cat "$exit_code_file")
            echo "[$HOOK_NAME] Deployment process finished with exit code: $exit_code"
            break
        fi

        # Show progress whenever the output file has grown
        if [ -f "$output_file" ]; then
            local current_lines
            current_lines=$(wc -l < "$output_file" | tr -d ' ')
            if [ "$current_lines" != "$last_line_count" ]; then
                local minutes=$((elapsed / 60))
                local seconds=$((elapsed % 60))
                echo "[$HOOK_NAME] Progress: $current_lines lines | Elapsed: ${minutes}m ${seconds}s | $(date +%H:%M:%S)"

                # Show latest non-empty activity line, ANSI color codes stripped
                tail -3 "$output_file" | sed 's/\x1b\[[0-9;]*m//g' | grep -v "^$" | tail -1 | sed "s/^/[$HOOK_NAME] Latest: /"

                last_line_count=$current_lines
            fi
        fi

        # Sleep before next check
        sleep 15
    done

    # Calculate total time
    local end_time
    end_time=$(date +%s)
    local total_time=$((end_time - start_time))
    local minutes=$((total_time / 60))
    local seconds=$((total_time % 60))

    # Determine status and send notification
    if [ "$exit_code" -eq 0 ]; then
        notify_completion "COMPLETE" "Maestro cluster deployment completed successfully in ${minutes}m ${seconds}s!"
        echo ""
        echo "[$HOOK_NAME] Total deployment time: ${minutes}m ${seconds}s"
        echo "[$HOOK_NAME] Output file: $output_file"
        return 0
    else
        notify_completion "FAILED" "Deployment failed with exit code $exit_code after ${minutes}m ${seconds}s"
        echo ""
        echo "[$HOOK_NAME] Total deployment time: ${minutes}m ${seconds}s"
        echo "[$HOOK_NAME] Output file: $output_file"
        return 1
    fi
}

# send_slack_notification <status> <message> <webhook-url>
# Posts a color-coded attachment to a Slack incoming webhook. Uses jq (or
# python3) for safe JSON escaping, falling back to manual escaping of the
# common control characters. Returns 1 if no webhook/curl or the POST fails.
send_slack_notification() {
    local status=$1
    local message=$2
    local webhook_url=$3

    if [ -z "$webhook_url" ]; then
        return 1
    fi

    # Check if curl is available
    if ! check_command "curl" "false"; then
        echo "[$HOOK_NAME] Skipping Slack notification - curl not available"
        return 1
    fi

    # Determine color based on status
    local color="good"
    local emoji=":white_check_mark:"
    if [[ "$status" == "FAILED" ]]; then
        color="danger"
        emoji=":x:"
    elif [[ "$status" == "COMPLETE" ]]; then
        color="good"
        emoji=":white_check_mark:"
    fi

    # Create JSON payload using jq for safe escaping
    local payload
    if command -v jq &> /dev/null; then
        # Use jq for safe JSON construction
        payload=$(jq -n \
            --arg color "$color" \
            --arg title "$emoji Maestro Deployment $status" \
            --arg text "$message" \
            --arg footer "Maestro Deployment Monitor" \
            --argjson ts "$(date +%s)" \
            '{attachments: [{color: $color, title: $title, text: $text, footer: $footer, ts: $ts}]}')
    elif command -v python3 &> /dev/null; then
        # Fallback: Use Python for proper JSON encoding
        payload=$(python3 -c "import json, sys; print(json.dumps({'attachments': [{'color': sys.argv[1], 'title': sys.argv[2] + ' Maestro Deployment ' + sys.argv[3], 'text': sys.argv[4], 'footer': 'Maestro Deployment Monitor', 'ts': int(sys.argv[5])}]}))" "$color" "$emoji" "$status" "$message" "$(date +%s)")
    else
        # Last resort: Extended manual escaping for all control characters
        local escaped_message="${message//\\/\\\\}"           # Escape backslashes
        escaped_message="${escaped_message//\"/\\\"}"         # Escape quotes
        escaped_message="${escaped_message//$'\n'/\\n}"       # Escape newlines
        escaped_message="${escaped_message//$'\r'/\\r}"       # Escape carriage returns
        escaped_message="${escaped_message//$'\t'/\\t}"       # Escape tabs

        local escaped_status="${status//\\/\\\\}"
        escaped_status="${escaped_status//\"/\\\"}"
        escaped_status="${escaped_status//$'\n'/\\n}"
        escaped_status="${escaped_status//$'\r'/\\r}"
        escaped_status="${escaped_status//$'\t'/\\t}"

        # NOTE(review): the heredoc below reconstructs content stripped from the
        # original patch text ("payload=$(cat <" residue) — confirm against the
        # original file.
        payload=$(cat <<EOF
{"attachments": [{"color": "$color", "title": "$emoji Maestro Deployment $escaped_status", "text": "$escaped_message", "footer": "Maestro Deployment Monitor", "ts": $(date +%s)}]}
EOF
)
    fi

    # Post to Slack; failure must not abort the monitor (set -e)
    if curl -s -X POST -H 'Content-Type: application/json' --data "$payload" "$webhook_url" > /dev/null; then
        echo "[$HOOK_NAME] Slack notification sent"
        return 0
    else
        echo "[$HOOK_NAME] WARNING: Failed to send Slack notification"
        return 1
    fi
}

# notify_completion <status> <message>
# Fan-out notifier: echoes the status, sends a Slack notification when
# SLACK_WEBHOOK_URL is set, and raises a desktop notification via osascript
# (macOS) or notify-send (Linux) when available.
# NOTE(review): the function header and Slack branch were stripped from the
# original patch text; reconstructed from the README's description — confirm.
notify_completion() {
    local status=$1
    local message=$2

    echo "[$HOOK_NAME] $status: $message"

    # Slack notification (if webhook configured); best-effort under set -e
    if [ -n "${SLACK_WEBHOOK_URL:-}" ]; then
        send_slack_notification "$status" "$message" "$SLACK_WEBHOOK_URL" || true
    fi

    if command -v osascript &> /dev/null; then
        # macOS notification - escape message for AppleScript
        local safe_message="${message//\\/\\\\}"
        safe_message="${safe_message//\"/\\\"}"
        osascript -e "display notification \"$safe_message\" with title \"Maestro Deployment $status\"" || true
    elif command -v notify-send &> /dev/null; then
        # Linux notification - use safe argument passing ('--' stops option parsing)
        notify-send -- "Maestro Deployment $status" "$message" || true
    fi
}

# Main execution
case "${1:-notify}" in
    monitor)
        monitor_deployment "$2"
        exit $?
        ;;
    notify)
        notify_completion "${2:-COMPLETE}" "${3:-Deployment finished}"
        ;;
    *)
        echo "Usage: $0 {monitor <task-id>|notify <status> <message>}"
        exit 1
        ;;
esac
+ +## Skills Directory Structure + +Each skill is organized in its own folder with a `SKILL.md` file that defines the skill implementation: + +``` +.claude/skills/ +├── README.md +├── setup-maestro-cluster/ +│ ├── SKILL.md +│ └── scripts/ +│ └── setup.sh +├── run-e2e-tests/ +│ ├── SKILL.md +│ └── scripts/ +│ └── run-tests.sh +└── diagnose-maestro-deployment/ + ├── SKILL.md + └── scripts/ + └── diagnose.sh +``` + +## Available Skills + +### 1. setup-maestro-cluster + +Sets up a long-running Maestro cluster environment using Azure ARO-HCP infrastructure. + +**Usage:** +```bash +/setup-maestro-cluster +``` + +**What it does:** +1. Verifies Azure CLI installation and login status +2. Checks that you're logged into the "ARO Hosted Control Planes" Azure account +3. Clones the ARO-HCP repository to a temporary location +4. Sets required environment variables (USER, PERSIST, GITHUB_ACTIONS, GOTOOLCHAIN) +5. Runs `make personal-dev-env` to deploy the environment +6. Monitors and reports deployment status + +**Prerequisites:** +- Azure CLI installed (`brew install azure-cli` on macOS) +- Logged into correct Azure account: `az login` +- Valid Azure permissions for resource creation + +**Environment Variables Set:** +- `USER=oasis` (only if not already set) +- `PERSIST=true` +- `GITHUB_ACTIONS=true` +- `GOTOOLCHAIN=go1.24.4` + +**Documentation:** See [setup-maestro-cluster/SKILL.md](setup-maestro-cluster/SKILL.md) + +--- + +### 2. run-e2e-tests + +Runs end-to-end or upgrade tests on existing long-running Maestro clusters deployed in Azure AKS. + +**Usage:** +```bash +/run-e2e-tests [test-type] +``` + +Where `test-type` can be: +- `upgrade`: Run upgrade tests (default) +- `e2e`: Run standard E2E tests with Istio +- `all`: Run both upgrade and e2e tests + +**What it does:** +1. Verifies required tools (az, kubectl, kubelogin, jq) +2. Fetches AKS credentials for svc-cluster and mgmt-cluster +3. Converts kubeconfig for azurecli authentication +4. 
Generates in-cluster kubeconfig with service account tokens +5. Extracts deployment information (commit SHA, consumer name) +6. Runs selected test type(s) +7. Summarizes test results and failures +8. Cleans up test resources + +**Prerequisites:** +- Azure CLI, kubectl, kubelogin, jq must be installed +- Logged into Azure with cluster access +- Long-running clusters must be already deployed +- Required environment variables: + ```bash + export SVC_RESOURCE_GROUP="your-svc-rg" + export SVC_CLUSTER_NAME="your-svc-cluster" + export MGMT_RESOURCE_GROUP="your-mgmt-rg" + export MGMT_CLUSTER_NAME="your-mgmt-cluster" + ``` + +**Test Types:** +- **upgrade**: Pre-upgrade tests, server upgrade, post-upgrade tests, agent upgrade +- **e2e**: E2E tests with Istio service mesh +- **all**: Runs both upgrade and e2e tests sequentially + +**Documentation:** See [run-e2e-tests/SKILL.md](run-e2e-tests/SKILL.md) + +--- + +### 3. diagnose-maestro-deployment + +Automatically diagnoses failed Maestro cluster deployments by analyzing Helm releases, pod status, and resource conflicts. + +**Usage:** +```bash +# Diagnose using deployment output file +/diagnose-maestro-deployment /path/to/deployment.output + +# Diagnose using cluster information directly +/diagnose-maestro-deployment --svc-rg --svc-cluster --mgmt-rg --mgmt-cluster +``` + +**What it does:** +1. Analyzes deployment output to identify resource groups and cluster names +2. Retrieves credentials for both service and management clusters +3. Lists all Helm releases and identifies failed ones +4. Inspects pod states in critical namespaces +5. Checks for known issues (e.g., ClusterSizingConfiguration conflicts) +6. Identifies resource conflicts and timing issues +7. Generates a comprehensive diagnostic report +8. 
Saves the report to a timestamped file + +**Prerequisites:** +- Azure CLI, kubectl, helm must be installed +- Logged into Azure with cluster access +- jq installed for JSON parsing +- Access to deployment output or cluster information + +**Known Issues Detected:** +- **Hypershift ClusterSizingConfiguration conflict**: Helm post-install hook conflicts with operator-managed resources +- **MCE deployment failures**: Multicluster Engine Helm release issues +- **Missing Maestro in service cluster**: Deployment halted before service cluster setup + +**Output:** +The skill generates a detailed report saved as `maestro-diagnosis-YYYYMMDD-HHMMSS.txt` containing: +- Helm release status for both clusters +- Pod status in critical namespaces +- Failed release details +- Resource conflict analysis +- Root cause identification +- Recommended remediation steps + +**Exit Codes:** +- `0`: No critical issues found +- `1`: Issues detected (see report for details) + +**Documentation:** See [diagnose-maestro-deployment/SKILL.md](diagnose-maestro-deployment/SKILL.md) + +--- + +## Hooks + +### deployment-monitor.sh + +A hook that monitors long-running deployment processes and sends notifications. + +**Features:** +- Desktop notifications (macOS/Linux) +- Slack notifications via webhook +- Customizable status messages +- Real-time deployment monitoring +- Configurable timeout (default: 2 hours) + +**Dependencies:** +- Required: `bash`, `wc`, `tail`, `sed`, `grep`, `cat`, `tr`, `date`, `sleep` (standard Unix tools) +- Optional: `curl` (for Slack notifications), `osascript` (for macOS notifications), `notify-send` (for Linux notifications) + +**Configuration:** + +To enable Slack notifications: + +1. Create a Slack webhook: + - Go to + - Create an Incoming Webhook for your channel + - Copy the webhook URL + +2. Set the webhook URL as an environment variable: + ```bash + export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/YOUR/WEBHOOK/URL" + ``` + +3. 
(Optional) Configure timeout: + ```bash + export MONITOR_TIMEOUT=3600 # 1 hour in seconds + ``` + +**Usage:** +```bash +# Monitor a deployment task in real-time +.claude/hooks/deployment-monitor.sh monitor + +# Example: +.claude/hooks/deployment-monitor.sh monitor b4ac6c1 + +# Send a manual completion notification +.claude/hooks/deployment-monitor.sh notify "COMPLETE" "Deployment finished successfully" + +# Send a failure notification +.claude/hooks/deployment-monitor.sh notify "FAILED" "Deployment failed with errors" +``` + +**What the monitor does:** +1. Tracks the deployment task by its task ID +2. Shows real-time progress updates (line count, elapsed time) +3. Displays the latest deployment activity +4. Detects when the task completes or times out +5. Automatically sends notifications (Slack + desktop) when done +6. Reports final status and deployment duration + +The hook will: +1. Send a Slack notification (if configured) with color-coded messages +2. Send desktop notifications on macOS (via osascript) or Linux (via notify-send) +3. Return proper exit codes: 0 for success, 1 for failure, 2 for timeout + +--- + +## How Skills Work + +Skills are invoked in Claude Code using the `/` prefix followed by the skill name. When you run a skill: + +1. Claude Code reads the `SKILL.md` file from the skill's folder +2. Executes the bash script in the Implementation section +3. Returns the output to you in the chat + +Skills are a powerful way to automate complex, multi-step workflows that you perform frequently. + +## Creating New Skills + +To create a new skill: + +1. Create a new folder in `.claude/skills/` with a descriptive name: + ```bash + mkdir -p .claude/skills/my-new-skill + ``` + +2. Create a `SKILL.md` file in that folder with these sections: + - Title and description + - Prerequisites + - Usage example + - Steps (what the skill does) + - Implementation (bash script in a code block) + - Notes + +3. 
Make sure the bash script is well-commented and handles errors + +4. Update this README.md to document the new skill + +See existing skills as examples: +- [setup-maestro-cluster/SKILL.md](setup-maestro-cluster/SKILL.md) +- [run-e2e-tests/SKILL.md](run-e2e-tests/SKILL.md) + +## Tips for Writing Skills + +- **Error Handling**: Always check exit codes and provide clear error messages +- **Prerequisites**: Document all required tools and environment variables +- **Idempotency**: Skills should be safe to run multiple times +- **Cleanup**: Clean up temporary files and resources +- **Progress Updates**: Provide clear progress indicators (✓, step numbers, etc.) +- **Exit Codes**: Use proper exit codes (0 for success, non-zero for failures) +- **Environment Variables**: Use environment variables for configuration instead of hard-coded values diff --git a/.claude/skills/diagnose-maestro-deployment/SKILL.md b/.claude/skills/diagnose-maestro-deployment/SKILL.md new file mode 100644 index 00000000..b010ff98 --- /dev/null +++ b/.claude/skills/diagnose-maestro-deployment/SKILL.md @@ -0,0 +1,36 @@ +--- +name: diagnose-maestro-deployment +description: Diagnoses failed Maestro cluster deployments by analyzing Helm releases, pod status, and resource conflicts +category: Troubleshooting +tags: [azure, aks, maestro, troubleshooting, debugging, helm, kubernetes] +--- + +# Diagnose Maestro Deployment + +Automatically diagnoses failed Maestro cluster deployments by: +- Analyzing deployment output to identify resource groups and cluster names +- Checking Helm release status in both service and management clusters +- Inspecting pod states and error conditions +- Identifying resource conflicts and timing issues +- Generating a detailed analysis report with root cause and recommendations + +**Prerequisites:** +- Azure CLI installed and logged in +- kubectl and kubelogin installed +- Access to the failed deployment output or cluster information + +**Usage:** +```bash +# Diagnose using 
deployment output file +diagnose-maestro-deployment /path/to/deployment.output + +# Diagnose using cluster names directly +diagnose-maestro-deployment --svc-rg --svc-cluster --mgmt-rg --mgmt-cluster +``` + +```bash +#!/bin/bash +# Execute the diagnostic script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +exec "$SCRIPT_DIR/scripts/diagnose.sh" "$@" +``` diff --git a/.claude/skills/diagnose-maestro-deployment/scripts/analyze-logs.sh b/.claude/skills/diagnose-maestro-deployment/scripts/analyze-logs.sh new file mode 100755 index 00000000..8170fdd7 --- /dev/null +++ b/.claude/skills/diagnose-maestro-deployment/scripts/analyze-logs.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# Log Analysis Module for Maestro Deployment Diagnostics +# This module intelligently parses deployment logs to identify issues + +set -e + +# Extract failed Helm releases from deployment logs +extract_failed_helm_releases() { + local log_file=$1 + local temp_dir=$2 + + # Look for Helm deployment errors in logs + grep -i "failed to deploy helm release\|helm release.*failed\|error running Helm" "$log_file" 2>/dev/null | \ + grep -o "helm release: [a-zA-Z0-9-]*\|release [a-zA-Z0-9-]*" | \ + awk '{print $NF}' | \ + sort -u > "$temp_dir/failed_helm_releases.txt" || true + + # Also extract from error messages with release names + grep -oP '(?<=aro-hcp-)[a-zA-Z0-9-]+(?=/templates)' "$log_file" 2>/dev/null | \ + sort -u >> "$temp_dir/failed_helm_releases.txt" || true +} + +# Extract resource conflicts from logs +extract_resource_conflicts() { + local log_file=$1 + local temp_dir=$2 + + # Look for resource conflict errors + if grep -q "Apply failed with.*conflicts\|conflict occurred while applying" "$log_file"; then + python3 -c " +import re +import sys + +try: + with open('$log_file', 'r') as f: + content = f.read() + + conflicts = {} + + # Find conflict patterns + for line in content.split('\n'): + if 'Apply failed with' in line or 'conflict occurred while applying' in line: + # Extract resource 
type and name + resource_match = re.search(r'(?:object|resource)\s+/([a-zA-Z0-9-]+)\s+([a-zA-Z0-9./]+)', line) + if resource_match: + resource_name = resource_match.group(1) + resource_type = resource_match.group(2) + + # Extract conflicting fields + fields = [] + + # Pattern 1: Field paths in error message + field_matches = re.findall(r'\.spec\.[a-zA-Z0-9.\[\]=\"]+', line) + fields.extend(field_matches) + + # Pattern 2: conflicts with manager + manager_match = re.search(r'conflicts with\\\\\"([^\\\\]+)\\\\\"', line) + manager = manager_match.group(1) if manager_match else 'unknown' + + if resource_name not in conflicts: + conflicts[resource_name] = { + 'type': resource_type, + 'fields': set(), + 'manager': manager + } + + conflicts[resource_name]['fields'].update(fields) + + # Output conflicts in structured format + for resource, info in conflicts.items(): + print(f'CONFLICT:{resource}:{info[\"type\"]}:{info[\"manager\"]}:{\"|\".join(sorted(info[\"fields\"]))}') + +except Exception as e: + print(f'ERROR: Failed to parse conflicts: {e}', file=sys.stderr) +" > "$temp_dir/resource_conflicts.txt" 2>/dev/null || echo "ERROR:parse_failed" > "$temp_dir/resource_conflicts.txt" + fi +} + +# Extract deployment timeline from logs +extract_deployment_timeline() { + local log_file=$1 + local temp_dir=$2 + + # Extract timestamped events + grep -E '^\[?[0-9]{2}:[0-9]{2}:[0-9]{2}' "$log_file" | \ + grep -i "error\|failed\|success\|complete\|deployed\|installing" | \ + tail -50 > "$temp_dir/timeline.txt" || true +} + +# Identify root cause from error patterns +identify_root_cause() { + local log_file=$1 + local temp_dir=$2 + + # Common error patterns and their interpretations + python3 -c " +import re + +error_patterns = { + 'timing_conflict': r'conflict occurred while applying.*hook', + 'resource_exists': r'already exists', + 'timeout': r'context (deadline exceeded|canceled)|timed? 
out', + 'authentication': r'authentication|unauthorized|forbidden', + 'network': r'connection refused|network.*unreachable|dial tcp', + 'resource_limit': r'(insufficient|exceeded).*resources', + 'dependency_missing': r'not found.*required|missing.*dependency', + 'api_error': r'Internal error occurred|API.*error', + 'helm_hook_failed': r'Hook.*failed|post-install.*failed', +} + +with open('$log_file', 'r') as f: + content = f.read() + +detected_patterns = [] +for pattern_name, pattern_regex in error_patterns.items(): + if re.search(pattern_regex, content, re.IGNORECASE): + detected_patterns.append(pattern_name) + + # Find specific error context + matches = re.finditer(pattern_regex, content, re.IGNORECASE) + for match in list(matches)[:3]: # Limit to first 3 + start = max(0, match.start() - 200) + end = min(len(content), match.end() + 200) + context = content[start:end].replace('\n', ' ') + print(f'{pattern_name}:::{context}') +" > "$temp_dir/error_patterns.txt" 2>/dev/null || true +} + +# Extract component status from logs +extract_component_status() { + local log_file=$1 + local temp_dir=$2 + + # Look for explicit status messages + grep -i "status.*complete\|deployment.*success\|installed.*successfully" "$log_file" | \ + tail -20 > "$temp_dir/success_components.txt" || true + + grep -i "status.*fail\|deployment.*fail\|installation.*fail" "$log_file" | \ + tail -20 > "$temp_dir/failed_components.txt" || true +} + +# Main analysis function +analyze_deployment_logs() { + local log_file=$1 + local output_dir=$2 + + if [ ! 
-f "$log_file" ]; then + echo "ERROR: Log file not found: $log_file" + return 1 + fi + + mkdir -p "$output_dir" + + echo "Analyzing deployment logs: $log_file" + echo "Output directory: $output_dir" + echo "" + + # Run all analysis functions + extract_failed_helm_releases "$log_file" "$output_dir" + extract_resource_conflicts "$log_file" "$output_dir" + extract_deployment_timeline "$log_file" "$output_dir" + identify_root_cause "$log_file" "$output_dir" + extract_component_status "$log_file" "$output_dir" + + echo "Log analysis complete. Results in: $output_dir" +} + +# If script is executed directly, run analysis +if [ "${BASH_SOURCE[0]}" = "${0}" ]; then + if [ $# -lt 2 ]; then + echo "Usage: $0 " + exit 1 + fi + + analyze_deployment_logs "$1" "$2" +fi diff --git a/.claude/skills/diagnose-maestro-deployment/scripts/diagnose.sh b/.claude/skills/diagnose-maestro-deployment/scripts/diagnose.sh new file mode 100755 index 00000000..e5dd0cb7 --- /dev/null +++ b/.claude/skills/diagnose-maestro-deployment/scripts/diagnose.sh @@ -0,0 +1,643 @@ +#!/bin/bash +set -e + +echo "==========================================" +echo "Maestro Deployment Diagnostic Tool" +echo "==========================================" +echo "" + +# Initialize variables +DEPLOYMENT_OUTPUT="" +SVC_RESOURCE_GROUP="" +SVC_CLUSTER_NAME="" +MGMT_RESOURCE_GROUP="" +MGMT_CLUSTER_NAME="" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --svc-rg) + SVC_RESOURCE_GROUP="$2" + shift 2 + ;; + --svc-cluster) + SVC_CLUSTER_NAME="$2" + shift 2 + ;; + --mgmt-rg) + MGMT_RESOURCE_GROUP="$2" + shift 2 + ;; + --mgmt-cluster) + MGMT_CLUSTER_NAME="$2" + shift 2 + ;; + *) + if [ -z "$DEPLOYMENT_OUTPUT" ] && [ -f "$1" ]; then + DEPLOYMENT_OUTPUT="$1" + fi + shift + ;; + esac +done + +# Function to extract cluster info from deployment output +extract_cluster_info() { + local output_file=$1 + + echo "Analyzing deployment output: $output_file" + echo "" + + # Try to find cluster names from the output + if grep -q 
"pers-usw3" "$output_file"; then + # Extract cluster name pattern + local cluster_base=$(grep -o "pers-usw3[a-z]*" "$output_file" | head -1) + + if [ -n "$cluster_base" ]; then + SVC_CLUSTER_NAME="${cluster_base}-svc" + SVC_RESOURCE_GROUP="hcp-underlay-${cluster_base}-svc" + MGMT_CLUSTER_NAME="${cluster_base}-mgmt-1" + MGMT_RESOURCE_GROUP="hcp-underlay-${cluster_base}-mgmt-1" + + echo "Detected clusters:" + echo " Service: $SVC_RESOURCE_GROUP / $SVC_CLUSTER_NAME" + echo " Management: $MGMT_RESOURCE_GROUP / $MGMT_CLUSTER_NAME" + echo "" + fi + fi +} + +# Extract cluster info if deployment output provided +if [ -n "$DEPLOYMENT_OUTPUT" ]; then + extract_cluster_info "$DEPLOYMENT_OUTPUT" +fi + +# Validate we have cluster information +if [ -z "$SVC_RESOURCE_GROUP" ] || [ -z "$SVC_CLUSTER_NAME" ] || \ + [ -z "$MGMT_RESOURCE_GROUP" ] || [ -z "$MGMT_CLUSTER_NAME" ]; then + echo "ERROR: Could not determine cluster information." + echo "" + echo "Usage:" + echo " $0 " + echo " $0 --svc-rg --svc-cluster --mgmt-rg --mgmt-cluster " + exit 1 +fi + +# Check prerequisites +echo "Step 1: Checking prerequisites..." +if ! command -v az &> /dev/null; then + echo "ERROR: Azure CLI not installed" + exit 1 +fi + +if ! command -v kubectl &> /dev/null; then + echo "ERROR: kubectl not installed" + exit 1 +fi + +if ! command -v helm &> /dev/null; then + echo "ERROR: helm not installed" + exit 1 +fi + +if ! command -v jq &> /dev/null; then + echo "ERROR: jq not installed (required for JSON parsing)" + echo "Install with: brew install jq (macOS) or apt-get install jq (Linux)" + exit 1 +fi + +if ! az account show &> /dev/null; then + echo "ERROR: Not logged into Azure" + exit 1 +fi + +# kubelogin is optional but recommended for Azure AD authentication +if ! 
command -v kubelogin &> /dev/null; then + echo "WARNING: kubelogin not installed (Azure AD authentication may fail)" + echo "Install with: brew install Azure/kubelogin/kubelogin (macOS)" +fi + +echo "✓ All prerequisites met" +echo "" + +# Create temporary directory for kubeconfigs +TEMP_DIR=$(mktemp -d) +trap 'rm -rf "$TEMP_DIR"' EXIT + +SVC_KUBECONFIG="$TEMP_DIR/svc.kubeconfig" +MGMT_KUBECONFIG="$TEMP_DIR/mgmt.kubeconfig" + +# Get cluster credentials +echo "Step 2: Retrieving cluster credentials..." + +# Initialize issue tracking early (will be used if credentials fail) +CREDENTIAL_ISSUES=0 + +if az aks get-credentials \ + --resource-group "$SVC_RESOURCE_GROUP" \ + --name "$SVC_CLUSTER_NAME" \ + --overwrite-existing \ + -f "$SVC_KUBECONFIG" 2>/dev/null; then + echo "✓ Service cluster credentials retrieved" + # kubelogin may fail but shouldn't stop the script + if command -v kubelogin &> /dev/null; then + kubelogin convert-kubeconfig -l azurecli --kubeconfig "$SVC_KUBECONFIG" 2>/dev/null || true + fi +else + echo "✗ Failed to get service cluster credentials" + SVC_KUBECONFIG="" + CREDENTIAL_ISSUES=$((CREDENTIAL_ISSUES + 1)) +fi + +if az aks get-credentials \ + --resource-group "$MGMT_RESOURCE_GROUP" \ + --name "$MGMT_CLUSTER_NAME" \ + --overwrite-existing \ + -f "$MGMT_KUBECONFIG" 2>/dev/null; then + echo "✓ Management cluster credentials retrieved" + # kubelogin may fail but shouldn't stop the script + if command -v kubelogin &> /dev/null; then + kubelogin convert-kubeconfig -l azurecli --kubeconfig "$MGMT_KUBECONFIG" 2>/dev/null || true + fi +else + echo "✗ Failed to get management cluster credentials" + MGMT_KUBECONFIG="" + CREDENTIAL_ISSUES=$((CREDENTIAL_ISSUES + 1)) +fi + +echo "" + +# Step 3: Analyze deployment logs if provided +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LOG_ANALYSIS_DIR="$TEMP_DIR/log-analysis" + +if [ -n "$DEPLOYMENT_OUTPUT" ] && [ -f "$DEPLOYMENT_OUTPUT" ]; then + echo "Step 3: Analyzing deployment logs..." 
+ echo "" + + # Source the log analysis module + source "$SCRIPT_DIR/analyze-logs.sh" + + # Run log analysis + analyze_deployment_logs "$DEPLOYMENT_OUTPUT" "$LOG_ANALYSIS_DIR" + + echo "" +fi + +# Initialize report +REPORT_FILE="$TEMP_DIR/diagnosis-report.txt" + +cat > "$REPORT_FILE" << EOF +======================================== +Maestro Deployment Diagnostic Report +======================================== +Generated: $(date) + +Clusters Analyzed: + Service: $SVC_RESOURCE_GROUP / $SVC_CLUSTER_NAME + Management: $MGMT_RESOURCE_GROUP / $MGMT_CLUSTER_NAME + +EOF + +# Step 4: Analyze Management Cluster (dynamically based on log analysis) +echo "Step 4: Analyzing Management Cluster..." +echo "" + +if [ -n "$MGMT_KUBECONFIG" ]; then + echo "Management Cluster Analysis" >> "$REPORT_FILE" + echo "==========================" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Get all Helm releases for reference + echo "Helm Releases:" >> "$REPORT_FILE" + helm --kubeconfig "$MGMT_KUBECONFIG" list -A -o json | \ + jq -r '.[] | "\(.name) (\(.namespace)): \(.status) - Chart: \(.chart)"' >> "$REPORT_FILE" 2>/dev/null || \ + echo "Failed to retrieve Helm releases" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Determine what to check based on log analysis + FAILED_RELEASES="" + + if [ -f "$LOG_ANALYSIS_DIR/failed_helm_releases.txt" ]; then + # Use log analysis results to identify failed releases + FAILED_RELEASES=$(cat "$LOG_ANALYSIS_DIR/failed_helm_releases.txt" | tr '\n' ' ') + echo "Failed releases identified from logs: $FAILED_RELEASES" + fi + + # If no log analysis or empty results, fallback to checking cluster state + if [ -z "$FAILED_RELEASES" ]; then + FAILED_RELEASES=$(helm --kubeconfig "$MGMT_KUBECONFIG" list -A -o json | \ + jq -r '.[] | select(.status == "failed") | .name + ":" + .namespace' 2>/dev/null || echo "") + fi + + if [ -n "$FAILED_RELEASES" ]; then + echo "Investigating Failed Components:" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Process 
each failed release + for release_info in $FAILED_RELEASES; do + if [[ "$release_info" == *":"* ]]; then + release=$(echo "$release_info" | cut -d: -f1) + namespace=$(echo "$release_info" | cut -d: -f2) + else + release="$release_info" + # Try to find namespace from Helm + namespace=$(helm --kubeconfig "$MGMT_KUBECONFIG" list -A -o json 2>/dev/null | \ + jq -r ".[] | select(.name == \"$release\") | .namespace" | head -1) + if [ -z "$namespace" ]; then + namespace="unknown" + fi + fi + + echo "Analyzing: $release in namespace $namespace" + echo "[$release] (namespace: $namespace)" >> "$REPORT_FILE" + echo "---" >> "$REPORT_FILE" + + if [ "$namespace" != "unknown" ]; then + # Get pod status + echo "Pods:" >> "$REPORT_FILE" + kubectl --kubeconfig "$MGMT_KUBECONFIG" get pods -n "$namespace" -o wide 2>/dev/null >> "$REPORT_FILE" || \ + echo " No pods found or error retrieving pods" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Check for resource conflicts if indicated in log analysis + if [ -f "$LOG_ANALYSIS_DIR/resource_conflicts.txt" ]; then + while IFS= read -r line; do + # Parse CONFLICT:resource_name:resource_type:manager:fields format + # Use awk to properly split on first 4 colons only + conflict_type=$(echo "$line" | awk -F: '{print $1}') + if [ "$conflict_type" = "CONFLICT" ]; then + resource_name=$(echo "$line" | awk -F: '{print $2}') + # Resource type may contain colons, extract everything between 2nd and 3rd-to-last colon + resource_type=$(echo "$line" | awk -F: '{for(i=3;i> "$REPORT_FILE" + echo " Resource: $resource_name (type: $resource_type)" >> "$REPORT_FILE" + echo " Managed by: $manager" >> "$REPORT_FILE" + if [ -n "$fields" ]; then + echo " Conflicting fields:" >> "$REPORT_FILE" + echo "$fields" | tr '|' '\n' | sed 's/^/ - /' >> "$REPORT_FILE" + fi + echo "" >> "$REPORT_FILE" + fi + done < "$LOG_ANALYSIS_DIR/resource_conflicts.txt" + fi + fi + echo "" >> "$REPORT_FILE" + done + else + echo "No failed Helm releases detected" >> "$REPORT_FILE" + 
echo "✓ No failed Helm releases in management cluster" + fi + echo "" >> "$REPORT_FILE" +fi + +echo "" + +# Step 5: Analyze Service Cluster +echo "Step 5: Analyzing Service Cluster..." +echo "" + +if [ -n "$SVC_KUBECONFIG" ]; then + echo "Service Cluster Analysis" >> "$REPORT_FILE" + echo "========================" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Get Helm releases + echo "Helm Releases:" >> "$REPORT_FILE" + helm --kubeconfig "$SVC_KUBECONFIG" list -A -o json | \ + jq -r '.[] | "\(.name) (\(.namespace)): \(.status) - Chart: \(.chart)"' >> "$REPORT_FILE" 2>/dev/null || \ + echo "Failed to retrieve Helm releases" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Check maestro namespace + echo "Maestro Namespace Status:" >> "$REPORT_FILE" + if kubectl --kubeconfig "$SVC_KUBECONFIG" get namespace maestro &>/dev/null; then + kubectl --kubeconfig "$SVC_KUBECONFIG" get pods -n maestro -o wide 2>/dev/null >> "$REPORT_FILE" || \ + echo "No pods in maestro namespace" >> "$REPORT_FILE" + else + echo "Maestro namespace does not exist" >> "$REPORT_FILE" + fi + echo "" >> "$REPORT_FILE" +fi + +echo "" + +# Step 6: Include Log Analysis Results in Report +if [ -d "$LOG_ANALYSIS_DIR" ]; then + echo "Deployment Log Analysis" >> "$REPORT_FILE" + echo "======================" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Include error patterns + if [ -f "$LOG_ANALYSIS_DIR/error_patterns.txt" ] && [ -s "$LOG_ANALYSIS_DIR/error_patterns.txt" ]; then + echo "Identified Error Patterns:" >> "$REPORT_FILE" + while IFS= read -r line; do + # Split on literal ':::' delimiter + pattern="${line%%:::*}" + context="${line#*:::}" + echo " • Pattern: $pattern" >> "$REPORT_FILE" + echo " Context: $(echo "$context" | head -c 200)..." 
>> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + done < "$LOG_ANALYSIS_DIR/error_patterns.txt" + fi + + # Include deployment timeline + if [ -f "$LOG_ANALYSIS_DIR/timeline.txt" ] && [ -s "$LOG_ANALYSIS_DIR/timeline.txt" ]; then + echo "Deployment Timeline (last 20 events):" >> "$REPORT_FILE" + tail -20 "$LOG_ANALYSIS_DIR/timeline.txt" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + fi +fi + +# Generate diagnosis summary +echo "Diagnosis Summary" >> "$REPORT_FILE" +echo "=================" >> "$REPORT_FILE" +echo "" >> "$REPORT_FILE" + +# Root Cause Analysis (Dynamic based on error patterns) +echo "ROOT CAUSE ANALYSIS:" >> "$REPORT_FILE" +echo "-------------------" >> "$REPORT_FILE" +echo "" >> "$REPORT_FILE" + +# Analyze error patterns from logs +if [ -f "$LOG_ANALYSIS_DIR/error_patterns.txt" ] && [ -s "$LOG_ANALYSIS_DIR/error_patterns.txt" ]; then + # Group errors by pattern type (bash 3.x compatible) + # Count occurrences of each pattern + cut -d':' -f1 < "$LOG_ANALYSIS_DIR/error_patterns.txt" | sort | uniq -c | sort -rn > "$LOG_ANALYSIS_DIR/pattern_counts.txt" + + # Determine primary failure based on most common pattern + primary_pattern="" + if [ -s "$LOG_ANALYSIS_DIR/pattern_counts.txt" ]; then + # Get the first line (highest count) + read -r count pattern_name < "$LOG_ANALYSIS_DIR/pattern_counts.txt" + primary_pattern="$pattern_name" + fi + + if [ -n "$primary_pattern" ]; then + case "$primary_pattern" in + timing_conflict) + echo "Primary Failure Type: Timing/Race Condition" >> "$REPORT_FILE" + echo "Multiple components attempted to manage the same resources simultaneously," >> "$REPORT_FILE" + echo "leading to conflicts. This often occurs when operators start before Helm" >> "$REPORT_FILE" + echo "post-install hooks complete." >> "$REPORT_FILE" + ;; + timeout) + echo "Primary Failure Type: Timeout" >> "$REPORT_FILE" + echo "One or more operations exceeded their time limits. 
This may indicate" >> "$REPORT_FILE"
        echo "slow network, resource constraints, or hung processes." >> "$REPORT_FILE"
        ;;
      authentication)
        echo "Primary Failure Type: Authentication/Authorization" >> "$REPORT_FILE"
        echo "Deployment failed due to insufficient permissions or invalid credentials." >> "$REPORT_FILE"
        ;;
      network)
        echo "Primary Failure Type: Network Connectivity" >> "$REPORT_FILE"
        echo "Network-related errors prevented successful deployment." >> "$REPORT_FILE"
        ;;
      resource_limit)
        echo "Primary Failure Type: Resource Constraints" >> "$REPORT_FILE"
        echo "Insufficient cluster resources (CPU, memory, or storage) to complete deployment." >> "$REPORT_FILE"
        ;;
      *)
        # Unrecognized pattern name: report it verbatim.
        echo "Primary Failure Type: $primary_pattern" >> "$REPORT_FILE"
        echo "Multiple errors of this type detected in deployment logs." >> "$REPORT_FILE"
        ;;
    esac
    echo "" >> "$REPORT_FILE"
  fi
fi

# Analyze resource conflicts reported by the log-analysis step.
if [ -f "$LOG_ANALYSIS_DIR/resource_conflicts.txt" ] && [ -s "$LOG_ANALYSIS_DIR/resource_conflicts.txt" ]; then
  echo "Resource Conflicts Detected:" >> "$REPORT_FILE"
  while IFS= read -r line; do
    # Each record is CONFLICT:resource_name:resource_type:manager:fields
    conflict_type=$(echo "$line" | awk -F: '{print $1}')
    if [ "$conflict_type" = "CONFLICT" ]; then
      resource_name=$(echo "$line" | awk -F: '{print $2}')
      # Resource type may itself contain colons: rejoin fields 3..NF-2.
      # NOTE(review): this span was garbled in the diff (text between "i<"
      # and the next ">" was lost); reconstructed from the record format
      # above, mirroring the parsing in the per-release analysis loop —
      # confirm against the original script.
      resource_type=$(echo "$line" | awk -F: '{t=$3; for(i=4;i<=NF-2;i++) t=t ":" $i; print t}')
      manager=$(echo "$line" | awk -F: '{print $(NF-1)}')
      fields=$(echo "$line" | awk -F: '{print $NF}')
      echo " • $resource_name (type: $resource_type)" >> "$REPORT_FILE"
      echo " Conflicting manager: $manager" >> "$REPORT_FILE"
      # [ -n "$fields" ] already excludes the empty string; the original's
      # extra [ "$fields" != "" ] test was redundant and has been dropped.
      if [ -n "$fields" ]; then
        echo " Fields:" >> "$REPORT_FILE"
        echo "$fields" | tr '|' '\n' | sed 's/^/ - /' >> "$REPORT_FILE"
      fi
      echo "" >> "$REPORT_FILE"
    fi
  done < "$LOG_ANALYSIS_DIR/resource_conflicts.txt"
fi

echo "" >> "$REPORT_FILE"
echo "DETAILED ISSUES:" >> "$REPORT_FILE"
echo "----------------" >> "$REPORT_FILE"
echo "" >> "$REPORT_FILE"

# Dynamically generate issues based on discoveries; counters are consumed by
# the summary section later in the script.
ISSUES_FOUND=0
CRITICAL_ISSUES=0
+# Issue: Credential failures (if any) +if [ "$CREDENTIAL_ISSUES" -gt 0 ]; then + ISSUES_FOUND=$((ISSUES_FOUND + 1)) + CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) + echo "[$ISSUES_FOUND] Failed to Retrieve Cluster Credentials" >> "$REPORT_FILE" + echo " Severity: CRITICAL" >> "$REPORT_FILE" + if [ -z "$SVC_KUBECONFIG" ]; then + echo " Failed: Service cluster ($SVC_CLUSTER_NAME)" >> "$REPORT_FILE" + fi + if [ -z "$MGMT_KUBECONFIG" ]; then + echo " Failed: Management cluster ($MGMT_CLUSTER_NAME)" >> "$REPORT_FILE" + fi + echo " " >> "$REPORT_FILE" + echo " Recommendation:" >> "$REPORT_FILE" + echo " Verify Azure credentials and cluster access permissions" >> "$REPORT_FILE" + echo " Check that resource groups and cluster names are correct" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" +fi + +# Issue: Failed Helm Releases (dynamic) +if [ -n "$FAILED_RELEASES" ]; then + for release_info in $FAILED_RELEASES; do + if [[ "$release_info" == *":"* ]]; then + release=$(echo "$release_info" | cut -d: -f1) + namespace=$(echo "$release_info" | cut -d: -f2) + else + release="$release_info" + namespace=$(helm --kubeconfig "$MGMT_KUBECONFIG" list -A -o json 2>/dev/null | \ + jq -r ".[] | select(.name == \"$release\") | .namespace" | head -1) + [ -z "$namespace" ] && namespace="unknown" + fi + + ISSUES_FOUND=$((ISSUES_FOUND + 1)) + echo "[$ISSUES_FOUND] Helm Release Failed: $release" >> "$REPORT_FILE" + echo " Namespace: $namespace" >> "$REPORT_FILE" + + # Determine severity based on pod status + severity="WARNING" + if [ "$namespace" != "unknown" ] && [ -n "$MGMT_KUBECONFIG" ]; then + running_pods=$(kubectl --kubeconfig "$MGMT_KUBECONFIG" get pods -n "$namespace" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) + total_pods=$(kubectl --kubeconfig "$MGMT_KUBECONFIG" get pods -n "$namespace" --no-headers 2>/dev/null | wc -l) + + if [ "$total_pods" -gt 0 ] && [ "$running_pods" -eq "$total_pods" ]; then + severity="WARNING" + echo " Severity: WARNING (Helm failed 
but pods are running)" >> "$REPORT_FILE" + echo " Actual Status: ✓ All $total_pods pods are Running" >> "$REPORT_FILE" + elif [ "$total_pods" -eq 0 ]; then + severity="CRITICAL" + CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) + echo " Severity: CRITICAL" >> "$REPORT_FILE" + echo " Actual Status: ✗ No pods found in namespace" >> "$REPORT_FILE" + else + # Partial failure: some pods running, some not - this is WARNING not CRITICAL + severity="WARNING" + echo " Severity: WARNING" >> "$REPORT_FILE" + echo " Actual Status: ⚠ $running_pods/$total_pods pods Running" >> "$REPORT_FILE" + fi + else + echo " Severity: $severity" >> "$REPORT_FILE" + fi + + # Add recommendation based on severity + echo " " >> "$REPORT_FILE" + echo " Recommendation:" >> "$REPORT_FILE" + if [ "$severity" = "WARNING" ]; then + echo " Helm failure may be a false-positive. Verify pods are functional." >> "$REPORT_FILE" + else + echo " Investigate pod failures and check Helm release logs." >> "$REPORT_FILE" + fi + echo "" >> "$REPORT_FILE" + done +fi + +# Issue 2: Missing deployments in service cluster +if [ -n "$SVC_KUBECONFIG" ]; then + # Check for maestro namespace + if ! 
kubectl --kubeconfig "$SVC_KUBECONFIG" get namespace maestro &>/dev/null; then + ISSUES_FOUND=$((ISSUES_FOUND + 1)) + CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) + echo "[$ISSUES_FOUND] Missing Deployment: maestro in Service Cluster" >> "$REPORT_FILE" + echo " Severity: CRITICAL" >> "$REPORT_FILE" + echo " Status: Namespace does not exist" >> "$REPORT_FILE" + echo " " >> "$REPORT_FILE" + echo " Likely Cause:" >> "$REPORT_FILE" + echo " Deployment pipeline may have halted before service cluster setup" >> "$REPORT_FILE" + echo " " >> "$REPORT_FILE" + echo " Recommendation:" >> "$REPORT_FILE" + echo " Option 1: Continue deployment to service cluster" >> "$REPORT_FILE" + echo " Option 2: Re-run complete deployment" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + fi +fi + +# Issue 3: Error patterns from logs +if [ -f "$LOG_ANALYSIS_DIR/error_patterns.txt" ] && [ -s "$LOG_ANALYSIS_DIR/error_patterns.txt" ]; then + # Group unique error types (bash 3.x compatible) + # Get unique patterns + cut -d':' -f1 < "$LOG_ANALYSIS_DIR/error_patterns.txt" | sort -u > "$LOG_ANALYSIS_DIR/unique_patterns.txt" + + while IFS= read -r pattern; do + ISSUES_FOUND=$((ISSUES_FOUND + 1)) + + # Get first context for this pattern + context=$(grep "^${pattern}:::" "$LOG_ANALYSIS_DIR/error_patterns.txt" | head -1 | cut -d':' -f4-) + + echo "[$ISSUES_FOUND] Error Pattern Detected: $pattern" >> "$REPORT_FILE" + + case "$pattern" in + timing_conflict|helm_hook_failed) + echo " Severity: WARNING" >> "$REPORT_FILE" + echo " Description: Resource timing conflict detected" >> "$REPORT_FILE" + ;; + timeout) + # Timeouts are warnings unless they prevent critical operations + echo " Severity: WARNING" >> "$REPORT_FILE" + echo " Description: Operation timed out" >> "$REPORT_FILE" + ;; + authentication) + echo " Severity: CRITICAL" >> "$REPORT_FILE" + CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) + echo " Description: Authentication or authorization failure" >> "$REPORT_FILE" + ;; + network) + echo " Severity: 
CRITICAL" >> "$REPORT_FILE" + CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) + echo " Description: Network connectivity issue" >> "$REPORT_FILE" + ;; + *) + echo " Severity: WARNING" >> "$REPORT_FILE" + echo " Description: $pattern error detected" >> "$REPORT_FILE" + ;; + esac + + echo " Context: $(echo "$context" | head -c 150)..." >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + done < "$LOG_ANALYSIS_DIR/unique_patterns.txt" +fi + +if [ $ISSUES_FOUND -eq 0 ]; then + echo "No issues detected." >> "$REPORT_FILE" + echo "✓ All services appear to be running normally." >> "$REPORT_FILE" +else + echo "" >> "$REPORT_FILE" + echo "SUMMARY:" >> "$REPORT_FILE" + echo "--------" >> "$REPORT_FILE" + echo "Total Issues: $ISSUES_FOUND" >> "$REPORT_FILE" + echo "Critical Issues: $CRITICAL_ISSUES" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + if [ $CRITICAL_ISSUES -eq 0 ]; then + echo "Overall Status: Deployment appears successful despite Helm warnings" >> "$REPORT_FILE" + echo "Action Required: None - Services are functional" >> "$REPORT_FILE" + else + echo "Overall Status: Deployment incomplete - requires intervention" >> "$REPORT_FILE" + echo "Action Required: Complete service cluster deployment" >> "$REPORT_FILE" + fi +fi + +echo "" >> "$REPORT_FILE" +echo "End of Diagnostic Report" >> "$REPORT_FILE" +echo "========================================" >> "$REPORT_FILE" + +# Display report +echo "==========================================" +echo "Diagnostic Report Generated" +echo "==========================================" +echo "" +cat "$REPORT_FILE" + +# Save report to current directory +REPORT_OUTPUT="maestro-diagnosis-$(date +%Y%m%d-%H%M%S).txt" +cp "$REPORT_FILE" "$REPORT_OUTPUT" + +echo "" +echo "==========================================" +echo "Report saved to: $REPORT_OUTPUT" +echo "==========================================" +echo "" + +# Send to Slack if webhook is configured +if [ -n "$SLACK_WEBHOOK_URL" ]; then + echo "Sending report to Slack..." 
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ if bash "$SCRIPT_DIR/send-to-slack.sh" "$REPORT_OUTPUT"; then
+ echo "✓ Report sent to Slack"
+ else
+ echo "⚠ Failed to send report to Slack (report still saved locally)"
+ fi
+ echo ""
+fi
+
+# Summary
+if [ $ISSUES_FOUND -gt 0 ]; then
+ echo "Found $ISSUES_FOUND issue(s). See report for details and recommendations."
+ exit 1
+else
+ echo "No critical issues found. Deployment appears successful."
+ exit 0
+fi
diff --git a/.claude/skills/diagnose-maestro-deployment/scripts/send-to-slack.sh b/.claude/skills/diagnose-maestro-deployment/scripts/send-to-slack.sh
new file mode 100755
index 00000000..85350524
--- /dev/null
+++ b/.claude/skills/diagnose-maestro-deployment/scripts/send-to-slack.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+set -e
+
+REPORT_FILE="$1"
+WEBHOOK_URL="${SLACK_WEBHOOK_URL}"
+
+if [ -z "$REPORT_FILE" ] || [ ! -f "$REPORT_FILE" ]; then
+ echo "ERROR: Report file not provided or does not exist"
+ echo "Usage: $0 <report-file>"
+ exit 1
+fi
+
+if [ -z "$WEBHOOK_URL" ]; then
+ echo "ERROR: SLACK_WEBHOOK_URL not set"
+ exit 1
+fi
+
+# Extract key information from report
+TOTAL_ISSUES=$(grep "^Total Issues:" "$REPORT_FILE" | awk '{print $3}')
+CRITICAL_ISSUES=$(grep "^Critical Issues:" "$REPORT_FILE" | awk '{print $3}')
+OVERALL_STATUS=$(grep "^Overall Status:" "$REPORT_FILE" | sed 's/Overall Status: //')
+ACTION_REQUIRED=$(grep "^Action Required:" "$REPORT_FILE" | sed 's/Action Required: //')
+
+# Extract cluster info
+SVC_CLUSTER=$(grep "Service:" "$REPORT_FILE" | head -1 | awk '{print $2"/"$3}')
+MGMT_CLUSTER=$(grep "Management:" "$REPORT_FILE" | head -1 | awk '{print $2"/"$3}')
+
+# Determine color based on critical issues
+if [ "$CRITICAL_ISSUES" = "0" ]; then
+ COLOR="warning"
+ EMOJI="⚠️"
+ STATUS_ICON="⚠️"
+else
+ COLOR="danger"
+ EMOJI="🚨"
+ STATUS_ICON="❌"
+fi
+
+# Extract primary failure reason - just the first line
+PRIMARY_REASON=$(grep "^Primary Failure:" "$REPORT_FILE" | sed 's/Primary Failure: //'
| head -1) + +# Extract conflict fields - clean format +CONFLICT_FIELDS=$(sed -n '/^Conflicting Fields:/,/^$/p' "$REPORT_FILE" 2>/dev/null | grep "•" | sed 's/ • //' | tr '\n' ',' | sed 's/,$//' | sed 's/,/, /g') + +# Extract cascading failure if exists +CASCADING=$(grep "^Cascading Failure:" "$REPORT_FILE" | sed 's/Cascading Failure: //' | head -1) + +# Build issue fields array - each issue gets its own field with detailed info +ISSUE_FIELDS="[]" +issue_num=1 +while IFS= read -r line; do + if [[ "$line" =~ ^\[([0-9]+)\]\ (.+)$ ]]; then + issue_title="${BASH_REMATCH[2]}" + + # Get full issue section + issue_section=$(sed -n "/^\[$issue_num\]/,/^\[/p" "$REPORT_FILE") + severity=$(echo "$issue_section" | grep "Severity:" | awk '{print $2}' | head -1) + + # Determine emoji based on severity + if [ "$severity" = "CRITICAL" ]; then + sev_emoji="🔴" + else + sev_emoji="🟡" + fi + + # Build structured issue description based on issue type + issue_value="" + + if [[ "$issue_title" =~ "Hypershift" ]]; then + # Hypershift issue - show clear cause and effect + issue_value="*Root Cause:*" + issue_value+=$'\n'"• Hypershift release post-install hook attempted to create ClusterSizingConfiguration resource" + issue_value+=$'\n'"• Resource was already created and managed by hypershift-operator-manager" + + # Get specific conflicting fields + specific_conflicts=$(echo "$issue_section" | sed -n '/Specific Conflicting Fields:/,/^ $/p' | grep "•" | sed 's/^ • //') + if [ -n "$specific_conflicts" ]; then + issue_value+=$'\n'"• Leading to field conflicts:" + while IFS= read -r field; do + issue_value+=$'\n'" - $field" + done <<< "$specific_conflicts" + fi + issue_value+=$'\n'"• Helm marked the release as failed due to post-install hook failure" + + # Get actual status + issue_value+=$'\n\n'"*Actual Status:*" + actual_status=$(echo "$issue_section" | sed -n '/Actual Service Status:/,/^ $/p' | grep "✓" | sed 's/^ //') + if [ -n "$actual_status" ]; then + while IFS= read -r line; do + 
issue_value+=$'\n'"$line" + done <<< "$actual_status" + fi + + issue_value+=$'\n\n'"*Conclusion:* Although Helm status is failed, services are actually running normally. This is a Helm hook timing issue." + + elif [[ "$issue_title" =~ "MCE" ]]; then + # MCE issue - similar structure + root_cause=$(echo "$issue_section" | sed -n '/Root Cause:/,/^ $/p' | grep -v "Root Cause:" | grep -v "^ $" | sed 's/^ //' | head -1) + issue_value="*Root Cause:*"$'\n'"• $root_cause" + + # Get actual status + issue_value+=$'\n\n'"*Actual Status:*" + actual_status=$(echo "$issue_section" | sed -n '/Actual Service Status:/,/^ $/p' | grep "✓" | sed 's/^ //') + if [ -n "$actual_status" ]; then + while IFS= read -r line; do + issue_value+=$'\n'"$line" + done <<< "$actual_status" + fi + + issue_value+=$'\n\n'"*Conclusion:* MCE services are running normally, Helm failure can be ignored." + + elif [[ "$issue_title" =~ "Maestro Not Deployed" ]]; then + # Maestro not deployed - show cascading failure + issue_value="*Root Cause (Cascading Failure):*" + what_happened=$(echo "$issue_section" | sed -n '/What Happened:/,/^ $/p' | grep -v "What Happened:" | grep -v "^ $" | sed 's/^ //' | sed 's/^[0-9]\. /• /') + if [ -n "$what_happened" ]; then + while IFS= read -r line; do + issue_value+=$'\n'"$line" + done <<< "$what_happened" + fi + + # Get impact + issue_value+=$'\n\n'"*Impact:*" + impact=$(echo "$issue_section" | sed -n '/Impact:/,/^ $/p' | grep "✗" | sed 's/^ //') + if [ -n "$impact" ]; then + while IFS= read -r line; do + issue_value+=$'\n'"$line" + done <<< "$impact" + fi + + issue_value+=$'\n\n'"*Conclusion:* Service cluster deployment incomplete, manual intervention required." 
+ fi + + # Add to issues array + if command -v jq &> /dev/null; then + issue_field=$(jq -n \ + --arg title "$sev_emoji Issue $issue_num: $issue_title" \ + --arg value "$issue_value" \ + '{ + title: $title, + value: $value, + short: false + }') + ISSUE_FIELDS=$(echo "$ISSUE_FIELDS" | jq --argjson field "$issue_field" '. += [$field]') + fi + + issue_num=$((issue_num + 1)) + fi +done < <(grep -E '^\[[0-9]+\]' "$REPORT_FILE") + +# Build clean, simple message using Slack fields format +if command -v jq &> /dev/null; then + # Build base fields first + BASE_FIELDS=$(jq -n \ + --arg status "$OVERALL_STATUS" \ + --arg total "$TOTAL_ISSUES" \ + --arg critical "$CRITICAL_ISSUES" \ + --arg svc "$SVC_CLUSTER" \ + --arg mgmt "$MGMT_CLUSTER" \ + --arg primary "$PRIMARY_REASON" \ + --arg conflicts "$CONFLICT_FIELDS" \ + --arg cascading "$CASCADING" \ + --arg action "$ACTION_REQUIRED" \ + '[ + { + title: "Status", + value: $status, + short: true + }, + { + title: "Issues", + value: ("Total: " + $total + " | Critical: " + $critical), + short: true + }, + { + title: "Service Cluster", + value: $svc, + short: true + }, + { + title: "Management Cluster", + value: $mgmt, + short: true + }, + { + title: "Primary Failure", + value: $primary, + short: false + }, + (if $conflicts != "" then { + title: "Conflicting Fields", + value: $conflicts, + short: false + } else empty end), + (if $cascading != "" then { + title: "Cascading Impact", + value: $cascading, + short: false + } else empty end) + ]') + + # Combine base fields with issue fields and action + ALL_FIELDS=$(echo "$BASE_FIELDS $ISSUE_FIELDS" | jq -s '.[0] + .[1] + [{title: "Action Required", value: $action, short: false}]' --arg action "$ACTION_REQUIRED") + + # Build final payload + PAYLOAD=$(jq -n \ + --arg color "$COLOR" \ + --arg title "$EMOJI Maestro Deployment Diagnostic" \ + --argjson fields "$ALL_FIELDS" \ + --argjson ts "$(date +%s)" \ + '{ + attachments: [{ + color: $color, + title: $title, + fields: $fields, + footer: 
"Maestro Diagnostic Tool", + ts: $ts, + mrkdwn_in: ["fields"] + }] + }') +elif command -v python3 &> /dev/null; then + # Fallback to simple format if jq not available + MESSAGE="*$EMOJI Maestro Deployment Diagnosis*\n\n" + MESSAGE+="*Status:* $STATUS_ICON $OVERALL_STATUS\n" + MESSAGE+="*Action Required:* $ACTION_REQUIRED\n" + MESSAGE+="*Total Issues:* \`$TOTAL_ISSUES\` | *Critical:* \`$CRITICAL_ISSUES\`\n\n" + MESSAGE+="*Clusters:*\n• Service: \`$SVC_CLUSTER\`\n• Management: \`$MGMT_CLUSTER\`\n\n" + MESSAGE+="See full diagnostic report for details." + + PAYLOAD=$(python3 -c " +import json, sys +payload = { + 'attachments': [{ + 'color': sys.argv[1], + 'text': sys.argv[2], + 'footer': 'Maestro Diagnostic Tool', + 'ts': int(sys.argv[3]), + 'mrkdwn_in': ['text'] + }] +} +print(json.dumps(payload)) +" "$COLOR" "$MESSAGE" "$(date +%s)") +else + echo "ERROR: jq or python3 required for JSON construction" + exit 1 +fi + +# Send to Slack +echo "Sending diagnostic report to Slack..." +if curl -X POST -H 'Content-type: application/json' \ + --data "$PAYLOAD" \ + "$WEBHOOK_URL" \ + --silent --show-error --fail; then + echo "" + echo "✓ Diagnostic report sent to Slack successfully" + exit 0 +else + echo "" + echo "✗ Failed to send diagnostic report to Slack" + exit 1 +fi diff --git a/.claude/skills/run-e2e-tests/SKILL.md b/.claude/skills/run-e2e-tests/SKILL.md new file mode 100644 index 00000000..95b8e413 --- /dev/null +++ b/.claude/skills/run-e2e-tests/SKILL.md @@ -0,0 +1,47 @@ +--- +name: run-e2e-tests +description: Runs end-to-end or upgrade tests on existing long-running Maestro clusters deployed in Azure AKS +category: Testing +tags: [azure, aks, maestro, e2e, testing, upgrade, kubernetes] +--- + +# Run E2E Tests on Long-Running Cluster + +Runs end-to-end or upgrade tests on existing long-running Maestro clusters deployed in Azure AKS. 
+ +**Prerequisites:** +- Azure CLI, kubectl, kubelogin, jq must be installed +- Logged into Azure with cluster access +- Long-running clusters must be already deployed +- Required environment variables: + - `SVC_RESOURCE_GROUP`: Resource group for service cluster + - `SVC_CLUSTER_NAME`: Name of service cluster + - `MGMT_RESOURCE_GROUP`: Resource group for management cluster + - `MGMT_CLUSTER_NAME`: Name of management cluster + +**Usage:** +```bash +/run-e2e-tests [test-type] +``` + +Where `test-type` can be: +- `upgrade`: Run upgrade tests (default) +- `e2e`: Run standard E2E tests with Istio +- `all`: Run both upgrade and e2e tests + +**Example:** +```bash +export SVC_RESOURCE_GROUP="hcp-underlay--svc" +export SVC_CLUSTER_NAME="-svc" +export MGMT_RESOURCE_GROUP="hcp-underlay--mgmt-1" +export MGMT_CLUSTER_NAME="-mgmt-1" + +/run-e2e-tests upgrade +``` + +```bash +#!/bin/bash +# Execute the E2E test script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +exec "$SCRIPT_DIR/scripts/run-tests.sh" "$@" +``` diff --git a/.claude/skills/run-e2e-tests/scripts/run-tests.sh b/.claude/skills/run-e2e-tests/scripts/run-tests.sh new file mode 100755 index 00000000..184456df --- /dev/null +++ b/.claude/skills/run-e2e-tests/scripts/run-tests.sh @@ -0,0 +1,316 @@ +#!/bin/bash +set -e + +echo "Starting E2E tests on long-running Maestro clusters..." + +# Parse test type argument +TEST_TYPE="${1:-upgrade}" + +# Step 1: Verify prerequisites +echo "Step 1: Verifying prerequisites..." + +# Check Azure CLI +if ! command -v az &> /dev/null; then + echo "ERROR: Azure CLI is not installed." + echo "Please install it from: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli" + exit 1 +fi +echo "✓ Azure CLI is installed" + +# Check kubectl +if ! command -v kubectl &> /dev/null; then + echo "ERROR: kubectl is not installed." + exit 1 +fi +echo "✓ kubectl is installed" + +# Check kubelogin +if ! command -v kubelogin &> /dev/null; then + echo "Installing kubelogin..." 
+ az aks install-cli
+fi
+echo "✓ kubelogin is installed ($(kubelogin --version))"
+
+# Check jq
+if ! command -v jq &> /dev/null; then
+ echo "ERROR: jq is not installed."
+ echo "Please install it: brew install jq (macOS) or sudo apt install jq (Linux)"
+ exit 1
+fi
+echo "✓ jq is installed"
+
+# Verify Azure account login
+if ! az account show &> /dev/null; then
+ echo "ERROR: Not logged into Azure."
+ echo "Please run: az login"
+ exit 1
+fi
+
+ACCOUNT_NAME=$(az account show --query "name" -o tsv)
+echo "✓ Logged into Azure account: $ACCOUNT_NAME"
+
+# Check required environment variables
+if [ -z "$SVC_RESOURCE_GROUP" ] || [ -z "$SVC_CLUSTER_NAME" ] || \
+ [ -z "$MGMT_RESOURCE_GROUP" ] || [ -z "$MGMT_CLUSTER_NAME" ]; then
+ echo "ERROR: Required environment variables are not set."
+ echo "Please set:"
+ echo " export SVC_RESOURCE_GROUP=<service-cluster-resource-group>"
+ echo " export SVC_CLUSTER_NAME=<service-cluster-name>"
+ echo " export MGMT_RESOURCE_GROUP=<management-cluster-resource-group>"
+ echo " export MGMT_CLUSTER_NAME=<management-cluster-name>"
+ exit 1
+fi
+
+echo "Using clusters:"
+echo " Service: $SVC_RESOURCE_GROUP/$SVC_CLUSTER_NAME"
+echo " Management: $MGMT_RESOURCE_GROUP/$MGMT_CLUSTER_NAME"
+echo ""
+
+# Step 2: Get AKS credentials
+echo "Step 2: Getting AKS credentials..."
+
+az aks get-credentials \
+ --resource-group "$SVC_RESOURCE_GROUP" \
+ --name "$SVC_CLUSTER_NAME" \
+ --overwrite-existing \
+ -f ./svc-cluster.kubeconfig
+
+az aks get-credentials \
+ --resource-group "$MGMT_RESOURCE_GROUP" \
+ --name "$MGMT_CLUSTER_NAME" \
+ --overwrite-existing \
+ -f ./mgmt-cluster.kubeconfig
+
+echo "✓ Credentials downloaded"
+
+# Step 3: Convert kubeconfig for non-interactive login
+echo "Step 3: Converting kubeconfig for azurecli..."
+
+kubelogin convert-kubeconfig -l azurecli --kubeconfig ./svc-cluster.kubeconfig
+kubelogin convert-kubeconfig -l azurecli --kubeconfig ./mgmt-cluster.kubeconfig
+
+echo "✓ Kubeconfig converted"
+
+# Verify cluster access
+echo "Verifying cluster access..."
+kubectl --kubeconfig ./svc-cluster.kubeconfig get pods -A -l app=maestro +kubectl --kubeconfig ./mgmt-cluster.kubeconfig get pods -A -l app=maestro-agent + +echo "✓ Cluster access verified" +echo "" + +# Step 4: Generate in-cluster kubeconfig +echo "Step 4: Generating in-cluster kubeconfig..." + +generate_in_cluster_kube() { + local kubeconfig=$1 + local type=$2 + + echo " Generating for $type cluster..." + + # Create service account + kubectl --kubeconfig "$kubeconfig" -n default create serviceaccount e2e-test-admin 2>/dev/null || true + + # Create cluster role binding + cat << EOF | kubectl --kubeconfig "$kubeconfig" apply -f - +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: e2e-test-admin +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-admin +subjects: +- kind: ServiceAccount + name: e2e-test-admin + namespace: default +EOF + + # Create token + local token + token=$(kubectl --kubeconfig "$kubeconfig" create token e2e-test-admin --namespace default --duration=8h) + + # Get cluster info + local api_server + local ca_cert + api_server=$(kubectl --kubeconfig "$kubeconfig" config view -o jsonpath='{.clusters[0].cluster.server}') + ca_cert=$(kubectl --kubeconfig "$kubeconfig" config view --raw -o jsonpath='{.clusters[0].cluster.certificate-authority-data}') + + # Create in-cluster kubeconfig + cat > "${type}-incluster.kubeconfig" << EOF +apiVersion: v1 +kind: Config +clusters: +- name: my-cluster + cluster: + server: "$api_server" + certificate-authority-data: "$ca_cert" +users: +- name: e2e-test-admin + user: + token: "$token" +contexts: +- name: e2e-test-admin-context + context: + cluster: my-cluster + user: e2e-test-admin + namespace: default +current-context: e2e-test-admin-context +EOF +} + +generate_in_cluster_kube "$(pwd)/svc-cluster.kubeconfig" "svc" +generate_in_cluster_kube "$(pwd)/mgmt-cluster.kubeconfig" "mgmt" + +echo "✓ In-cluster kubeconfig files generated" +echo "" + +# Step 5: 
Extract deployment information +echo "Step 5: Extracting deployment information..." + +# Get pod template hash for active replicaset +pod_template_hash=$(kubectl --kubeconfig "$(pwd)/svc-cluster.kubeconfig" get rs -l app=maestro -n maestro -o jsonpath='{range .items[?(@.spec.replicas>0)]}{.metadata.labels.pod-template-hash}{"\n"}{end}' | head -1) +if [ -z "$pod_template_hash" ]; then + echo "ERROR: No active replicaset found" + exit 1 +fi +echo " Pod template hash: $pod_template_hash" + +# Get pod name +pod_name=$(kubectl --kubeconfig "$(pwd)/svc-cluster.kubeconfig" get pods -n maestro -l pod-template-hash="$pod_template_hash" -o jsonpath='{.items[0].metadata.name}') +if [ -z "$pod_name" ]; then + echo "ERROR: No pod found for replicaset hash $pod_template_hash" + exit 1 +fi +echo " Pod name: $pod_name" + +# Extract commit SHA +commit_sha=$(kubectl --kubeconfig "$(pwd)/svc-cluster.kubeconfig" logs -n maestro "$pod_name" | grep -i "Git Commit" | grep -oE '[a-f0-9]{40}') +if [ -z "$commit_sha" ]; then + echo "ERROR: Could not extract commit SHA from pod logs" + exit 1 +fi +echo " Commit SHA: $commit_sha" + +# Get consumer name +consumer_name=$(kubectl --kubeconfig "$(pwd)/mgmt-cluster.kubeconfig" get deployment maestro-agent -n maestro -o yaml | grep -E "^\s+- --consumer-name=" | sed 's/.*--consumer-name=//' | head -1) +if [ -z "$consumer_name" ]; then + echo "ERROR: Could not extract consumer name from agent deployment" + exit 1 +fi +echo " Consumer name: $consumer_name" +echo "" + +# Step 6: Run tests +echo "Step 6: Running $TEST_TYPE tests..." +echo "==========================================" +echo "" + +TEST_FAILED=0 + +run_upgrade_test() { + echo "Running upgrade tests..." 
+ + if IMAGE="quay.io/redhat-user-workloads/maestro-rhtap-tenant/maestro-e2e:$commit_sha" \ + CONSUMER_NAME="$consumer_name" \ + SERVER_KUBECONFIG="$(pwd)/svc-cluster.kubeconfig" \ + AGENT_IN_CLUSTER_KUBECONFIG="$(pwd)/mgmt-incluster.kubeconfig" \ + SERVICE_ACCOUNT_NAME=clusters-service \ + ENABLE_AUTHORIZATION_POLICY=true \ + bash -x test/upgrade/script/run.sh; then + echo "✓ Upgrade test passed" + return 0 + else + echo "✗ Upgrade test failed" + return 1 + fi +} + +run_e2e_test() { + echo "Running E2E tests with istio..." + + if AGENT_NAMESPACE=maestro \ + IMAGE="quay.io/redhat-user-workloads/maestro-rhtap-tenant/maestro-e2e:$commit_sha" \ + CONSUMER_NAME="$consumer_name" \ + SERVER_KUBECONFIG="$(pwd)/svc-cluster.kubeconfig" \ + AGENT_KUBECONFIG="$(pwd)/mgmt-cluster.kubeconfig" \ + SERVER_IN_CLUSTER_KUBECONFIG="$(pwd)/svc-incluster.kubeconfig" \ + AGENT_IN_CLUSTER_KUBECONFIG="$(pwd)/mgmt-incluster.kubeconfig" \ + SERVICE_ACCOUNT_NAME=clusters-service \ + bash -x test/e2e/istio/test.sh; then + echo "✓ E2E test passed" + return 0 + else + echo "✗ E2E test failed" + return 1 + fi +} + +case "$TEST_TYPE" in + upgrade) + run_upgrade_test || TEST_FAILED=1 + ;; + e2e) + run_e2e_test || TEST_FAILED=1 + ;; + all) + run_upgrade_test || TEST_FAILED=1 + run_e2e_test || TEST_FAILED=1 + ;; + *) + echo "ERROR: Invalid test type: $TEST_TYPE" + echo "Valid options: upgrade, e2e, all" + exit 1 + ;; +esac + +echo "" +echo "==========================================" + +# Step 7: Summarize results +echo "Step 7: Test Summary" +echo "==========================================" + +if [ $TEST_FAILED -eq 0 ]; then + echo "✓ All tests PASSED" + echo "" + echo "Test configuration:" + echo " Image: quay.io/redhat-user-workloads/maestro-rhtap-tenant/maestro-e2e:$commit_sha" + echo " Consumer: $consumer_name" + echo " Test type: $TEST_TYPE" +else + echo "✗ Tests FAILED" + echo "" + echo "Check the test output above for failure details." 
+ echo "Common failure locations:" + echo " - test/upgrade/script/run.sh output" + echo " - test/e2e/istio/test.sh output" + echo " - Pod logs: kubectl --kubeconfig ./svc-cluster.kubeconfig logs -n maestro -l app=maestro" +fi + +echo "" + +# Step 8: Cleanup +echo "Step 8: Cleaning up test resources..." + +kubectl --kubeconfig "$(pwd)/svc-cluster.kubeconfig" delete serviceaccount e2e-test-admin -n default 2>/dev/null || true +kubectl --kubeconfig "$(pwd)/svc-cluster.kubeconfig" delete clusterrolebinding e2e-test-admin 2>/dev/null || true +kubectl --kubeconfig "$(pwd)/mgmt-cluster.kubeconfig" delete serviceaccount e2e-test-admin -n default 2>/dev/null || true +kubectl --kubeconfig "$(pwd)/mgmt-cluster.kubeconfig" delete clusterrolebinding e2e-test-admin 2>/dev/null || true + +# Remove kubeconfig files containing sensitive credentials +echo "Removing temporary kubeconfig files..." +rm -f "$(pwd)/svc-cluster.kubeconfig" "$(pwd)/mgmt-cluster.kubeconfig" +rm -f "$(pwd)/svc-incluster.kubeconfig" "$(pwd)/mgmt-incluster.kubeconfig" + +echo "✓ Cleanup complete" +echo "" + +if [ $TEST_FAILED -eq 0 ]; then + echo "E2E testing completed successfully!" + exit 0 +else + echo "E2E testing completed with failures." + exit 1 +fi diff --git a/.claude/skills/setup-maestro-cluster/SKILL.md b/.claude/skills/setup-maestro-cluster/SKILL.md new file mode 100644 index 00000000..357e7ecf --- /dev/null +++ b/.claude/skills/setup-maestro-cluster/SKILL.md @@ -0,0 +1,30 @@ +--- +name: setup-maestro-cluster +description: Sets up a long-running Maestro cluster environment using Azure ARO-HCP infrastructure with both service and management clusters +category: Infrastructure +tags: [azure, aks, maestro, deployment, cluster, aro-hcp] +--- + +# Setup Maestro Long-Running Cluster + +Sets up a long-running Maestro cluster environment using Azure ARO-HCP infrastructure. This will deploy both service and management clusters. 
+ +**Prerequisites:** +- Azure CLI installed and logged in +- Access to "ARO Hosted Control Planes" Azure subscription +- Internet connectivity + +**Environment variables set:** +- `USER=oasis` (if not already set) +- `PERSIST=true` +- `GITHUB_ACTIONS=true` +- `GOTOOLCHAIN=go1.24.4` + +**Note:** This deployment typically takes 25-30 minutes to complete. + +```bash +#!/bin/bash +# Execute the cluster setup script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +exec "$SCRIPT_DIR/scripts/setup.sh" "$@" +``` diff --git a/.claude/skills/setup-maestro-cluster/scripts/setup.sh b/.claude/skills/setup-maestro-cluster/scripts/setup.sh new file mode 100755 index 00000000..8aa712b8 --- /dev/null +++ b/.claude/skills/setup-maestro-cluster/scripts/setup.sh @@ -0,0 +1,104 @@ +#!/bin/bash +set -e + +echo "Starting Maestro long-running cluster setup..." + + +# Cleanup function +cleanup() { + if [ -n "$TEMP_DIR" ] && [ -d "$TEMP_DIR" ]; then + echo "Cleaning up temporary directory: $TEMP_DIR" + rm -rf "$TEMP_DIR" + fi +} + +# Register cleanup on exit +trap cleanup EXIT INT TERM + +# Step 1: Check if Azure CLI is installed +if ! command -v az &> /dev/null; then + echo "ERROR: Azure CLI is not installed." + echo "Please install it from: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli" + exit 1 +fi + +echo "✓ Azure CLI is installed" + +# Step 2: Verify Azure account login +if ! az account show &> /dev/null; then + echo "ERROR: Not logged into Azure." + echo "Please run: az login" + exit 1 +fi + +ACCOUNT_NAME=$(az account show --query "name" -o tsv) +echo "Current Azure account: $ACCOUNT_NAME" + +if [[ ! "$ACCOUNT_NAME" =~ "ARO Hosted Control Planes" ]]; then + echo "ERROR: Not logged into the correct Azure account." 
+    echo "Expected account containing 'ARO Hosted Control Planes', but got: $ACCOUNT_NAME"
+    echo "Please login to the correct account using: az login"
+    exit 1
+fi
+
+echo "✓ Logged into correct Azure account"
+
+# Step 3: Clone ARO-HCP repository
+TEMP_DIR=$(mktemp -d)
+
+# Validate mktemp succeeded
+if [ -z "$TEMP_DIR" ] || [ ! -d "$TEMP_DIR" ]; then
+    echo "ERROR: Failed to create temporary directory"
+    exit 1
+fi
+
+echo "Cloning ARO-HCP repository to: $TEMP_DIR"
+
+if ! timeout 300 git clone https://github.com/Azure/ARO-HCP "$TEMP_DIR/ARO-HCP"; then
+    echo "ERROR: Failed to clone ARO-HCP repository (timeout: 300s)"
+    rm -rf "$TEMP_DIR"
+    exit 1
+fi
+
+echo "✓ Repository cloned successfully"
+
+# Step 4 & 5: Configure environment and deploy
+pushd "$TEMP_DIR/ARO-HCP" > /dev/null
+
+echo "Setting environment variables..."
+# Set USER to oasis if not already set (required by ARO-HCP)
+export USER="${USER:-oasis}"
+# PERSIST controls whether deployed resources persist after testing (default: true, i.e. no automatic cleanup)
+export PERSIST="${PERSIST:-true}"
+export GITHUB_ACTIONS=true
+export GOTOOLCHAIN=go1.24.4
+
+echo "USER=$USER"
+echo "PERSIST=$PERSIST"
+echo "GITHUB_ACTIONS=$GITHUB_ACTIONS"
+echo "GOTOOLCHAIN=$GOTOOLCHAIN"
+
+echo ""
+echo "Starting personal-dev-env deployment..."
+echo "This may take several minutes..."
+echo ""
+
+if timeout 3600 make personal-dev-env; then
+    echo ""
+    echo "✓ Deployment completed successfully!"
+    echo "ARO-HCP repository location: $TEMP_DIR/ARO-HCP"
+else
+    echo ""
+    echo "ERROR: Deployment failed or timed out (timeout: 3600s)!"
+    popd > /dev/null
+    exit 1
+fi
+
+popd > /dev/null
+
+# Cleanup temporary directory
+echo "Cleaning up temporary clone..."
+rm -rf "$TEMP_DIR"
+
+echo ""
+echo "Setup complete!"
diff --git a/.gitignore b/.gitignore
index 38fa783b..21974f8a 100755
--- a/.gitignore
+++ b/.gitignore
@@ -69,3 +69,9 @@ unit-test-results.json
 *integration-test-results.json
 test/e2e/setup/aro/aro-hcp
 test/upgrade/report/*
+
+# Ignore ARO-HCP kubeconfig files
+*.kubeconfig
+
+# Ignore diagnostic reports
+maestro-diagnosis-*.txt