#!/bin/bash
#
# Deployment Monitor Hook
# Monitors long-running deployment processes and notifies when complete.
#
# Usage:
#   deployment-monitor.sh monitor <task-id>          # follow a deployment task
#   deployment-monitor.sh notify <status> <message>  # send a one-off notification
#
# Dependencies:
#   Required: bash, wc, tail, sed, grep, cat, tr, date, sleep
#   Optional: curl (Slack notifications), osascript (macOS notifications),
#             notify-send (Linux notifications)
#
# Configuration (environment variables):
#   SLACK_WEBHOOK_URL - optional Slack webhook URL for notifications
#   MONITOR_TIMEOUT   - max seconds to wait for completion (default 7200)

set -e

HOOK_NAME="deployment-monitor"

# check_command <cmd> <required:true|false>
# Prints an ERROR and returns 1 when a required command is missing;
# a missing optional command only produces a WARNING and returns 0.
check_command() {
    local cmd=$1
    local required=$2

    if ! command -v "$cmd" &> /dev/null; then
        if [ "$required" = "true" ]; then
            echo "[$HOOK_NAME] ERROR: Required command '$cmd' is not installed"
            return 1
        else
            echo "[$HOOK_NAME] WARNING: Optional command '$cmd' is not installed"
            return 0
        fi
    fi
    return 0
}

# monitor_deployment <task-id>
# Polls /tmp/claude/-<sanitized-cwd>/tasks/<task-id>.{output,exit_code} until
# the task writes its exit code or MONITOR_TIMEOUT elapses, printing progress
# along the way. Returns 0 on success, 1 on failure, 2 on timeout.
monitor_deployment() {
    local task_id=$1

    if [ -z "$task_id" ]; then
        echo "[$HOOK_NAME] ERROR: Task ID required"
        echo "Usage: $0 monitor <task-id>"
        exit 1
    fi

    # Build task output paths dynamically based on current working directory
    # (e.g. /home/me/repo -> /tmp/claude/-home-me-repo/tasks).
    local cwd_sanitized
    cwd_sanitized=$(pwd | tr '/' '-' | sed 's/^-//')
    local task_dir="/tmp/claude/-${cwd_sanitized}/tasks"

    # Ensure task directory exists
    if [ ! -d "$task_dir" ]; then
        echo "[$HOOK_NAME] WARNING: Task directory does not exist: $task_dir"
        echo "[$HOOK_NAME] Creating directory..."
        mkdir -p "$task_dir"
    fi

    local output_file="${task_dir}/${task_id}.output"
    local exit_code_file="${task_dir}/${task_id}.exit_code"
    local start_time
    start_time=$(date +%s)

    echo "[$HOOK_NAME] Monitoring deployment task: $task_id"
    echo "[$HOOK_NAME] Started at: $(date)"
    echo ""

    # Wait for the deployment to complete
    local last_line_count=0
    local max_wait_seconds=${MONITOR_TIMEOUT:-7200} # Default 2 hours
    local exit_code
    while true; do
        # Check for timeout
        local elapsed=$(($(date +%s) - start_time))
        if [ "$elapsed" -ge "$max_wait_seconds" ]; then
            local minutes=$((max_wait_seconds / 60))
            echo ""
            echo "[$HOOK_NAME] ERROR: Maximum wait time (${minutes} minutes) reached"
            notify_completion "FAILED" "Deployment monitoring timed out after ${minutes} minutes without completion"
            return 2
        fi

        # The exit-code file appearing is the completion signal
        if [ -f "$exit_code_file" ]; then
            exit_code=$(cat "$exit_code_file")
            echo "[$HOOK_NAME] Deployment process finished with exit code: $exit_code"
            break
        fi

        # Show progress whenever the output file has grown
        if [ -f "$output_file" ]; then
            local current_lines
            current_lines=$(wc -l < "$output_file" | tr -d ' ')
            if [ "$current_lines" != "$last_line_count" ]; then
                local minutes=$((elapsed / 60))
                local seconds=$((elapsed % 60))
                echo "[$HOOK_NAME] Progress: $current_lines lines | Elapsed: ${minutes}m ${seconds}s | $(date +%H:%M:%S)"

                # Show latest non-empty activity line, ANSI color codes stripped
                tail -3 "$output_file" | sed 's/\x1b\[[0-9;]*m//g' | grep -v "^$" | tail -1 | sed "s/^/[$HOOK_NAME] Latest: /"

                last_line_count=$current_lines
            fi
        fi

        # Sleep before next check
        sleep 15
    done

    # Calculate total time
    local end_time
    end_time=$(date +%s)
    local total_time=$((end_time - start_time))
    local minutes=$((total_time / 60))
    local seconds=$((total_time % 60))

    # Determine status and send notification
    if [ "$exit_code" -eq 0 ]; then
        notify_completion "COMPLETE" "Maestro cluster deployment completed successfully in ${minutes}m ${seconds}s!"
        echo ""
        echo "[$HOOK_NAME] Total deployment time: ${minutes}m ${seconds}s"
        echo "[$HOOK_NAME] Output file: $output_file"
        return 0
    else
        notify_completion "FAILED" "Deployment failed with exit code $exit_code after ${minutes}m ${seconds}s"
        echo ""
        echo "[$HOOK_NAME] Total deployment time: ${minutes}m ${seconds}s"
        echo "[$HOOK_NAME] Output file: $output_file"
        return 1
    fi
}

# send_slack_notification <status> <message> <webhook-url>
# Posts a color-coded attachment to a Slack incoming webhook. Uses jq (or
# python3) for safe JSON escaping, falling back to manual escaping of the
# common control characters. Returns 1 if no webhook/curl or the POST fails.
send_slack_notification() {
    local status=$1
    local message=$2
    local webhook_url=$3

    if [ -z "$webhook_url" ]; then
        return 1
    fi

    # Check if curl is available
    if ! check_command "curl" "false"; then
        echo "[$HOOK_NAME] Skipping Slack notification - curl not available"
        return 1
    fi

    # Determine color based on status
    local color="good"
    local emoji=":white_check_mark:"
    if [[ "$status" == "FAILED" ]]; then
        color="danger"
        emoji=":x:"
    elif [[ "$status" == "COMPLETE" ]]; then
        color="good"
        emoji=":white_check_mark:"
    fi

    # Create JSON payload using jq for safe escaping
    local payload
    if command -v jq &> /dev/null; then
        # Use jq for safe JSON construction
        payload=$(jq -n \
            --arg color "$color" \
            --arg title "$emoji Maestro Deployment $status" \
            --arg text "$message" \
            --arg footer "Maestro Deployment Monitor" \
            --argjson ts "$(date +%s)" \
            '{attachments: [{color: $color, title: $title, text: $text, footer: $footer, ts: $ts}]}')
    elif command -v python3 &> /dev/null; then
        # Fallback: Use Python for proper JSON encoding
        payload=$(python3 -c "import json, sys; print(json.dumps({'attachments': [{'color': sys.argv[1], 'title': sys.argv[2] + ' Maestro Deployment ' + sys.argv[3], 'text': sys.argv[4], 'footer': 'Maestro Deployment Monitor', 'ts': int(sys.argv[5])}]}))" "$color" "$emoji" "$status" "$message" "$(date +%s)")
    else
        # Last resort: Extended manual escaping for all control characters
        local escaped_message="${message//\\/\\\\}"           # Escape backslashes
        escaped_message="${escaped_message//\"/\\\"}"         # Escape quotes
        escaped_message="${escaped_message//$'\n'/\\n}"       # Escape newlines
        escaped_message="${escaped_message//$'\r'/\\r}"       # Escape carriage returns
        escaped_message="${escaped_message//$'\t'/\\t}"       # Escape tabs

        local escaped_status="${status//\\/\\\\}"
        escaped_status="${escaped_status//\"/\\\"}"
        escaped_status="${escaped_status//$'\n'/\\n}"
        escaped_status="${escaped_status//$'\r'/\\r}"
        escaped_status="${escaped_status//$'\t'/\\t}"

        # NOTE(review): the heredoc below reconstructs content stripped from the
        # original patch text ("payload=$(cat <" residue) — confirm against the
        # original file.
        payload=$(cat <<EOF
{"attachments": [{"color": "$color", "title": "$emoji Maestro Deployment $escaped_status", "text": "$escaped_message", "footer": "Maestro Deployment Monitor", "ts": $(date +%s)}]}
EOF
)
    fi

    # Post to Slack; failure must not abort the monitor (set -e)
    if curl -s -X POST -H 'Content-Type: application/json' --data "$payload" "$webhook_url" > /dev/null; then
        echo "[$HOOK_NAME] Slack notification sent"
        return 0
    else
        echo "[$HOOK_NAME] WARNING: Failed to send Slack notification"
        return 1
    fi
}

# notify_completion <status> <message>
# Fan-out notifier: echoes the status, sends a Slack notification when
# SLACK_WEBHOOK_URL is set, and raises a desktop notification via osascript
# (macOS) or notify-send (Linux) when available.
# NOTE(review): the function header and Slack branch were stripped from the
# original patch text; reconstructed from the README's description — confirm.
notify_completion() {
    local status=$1
    local message=$2

    echo "[$HOOK_NAME] $status: $message"

    # Slack notification (if webhook configured); best-effort under set -e
    if [ -n "${SLACK_WEBHOOK_URL:-}" ]; then
        send_slack_notification "$status" "$message" "$SLACK_WEBHOOK_URL" || true
    fi

    if command -v osascript &> /dev/null; then
        # macOS notification - escape message for AppleScript
        local safe_message="${message//\\/\\\\}"
        safe_message="${safe_message//\"/\\\"}"
        osascript -e "display notification \"$safe_message\" with title \"Maestro Deployment $status\"" || true
    elif command -v notify-send &> /dev/null; then
        # Linux notification - use safe argument passing ('--' stops option parsing)
        notify-send -- "Maestro Deployment $status" "$message" || true
    fi
}

# Main execution
case "${1:-notify}" in
    monitor)
        monitor_deployment "$2"
        exit $?
        ;;
    notify)
        notify_completion "${2:-COMPLETE}" "${3:-Deployment finished}"
        ;;
    *)
        echo "Usage: $0 {monitor <task-id>|notify <status> <message>}"
        exit 1
        ;;
esac
+ +## Skills Directory Structure + +Each skill is organized in its own folder with a `SKILL.md` file that defines the skill implementation: + +``` +.claude/skills/ +├── README.md +├── setup-maestro-cluster/ +│ ├── SKILL.md +│ └── scripts/ +│ └── setup.sh +├── run-e2e-tests/ +│ ├── SKILL.md +│ └── scripts/ +│ └── run-tests.sh +└── diagnose-maestro-deployment/ + ├── SKILL.md + └── scripts/ + └── diagnose.sh +``` + +## Available Skills + +### 1. setup-maestro-cluster + +Sets up a long-running Maestro cluster environment using Azure ARO-HCP infrastructure. + +**Usage:** +```bash +/setup-maestro-cluster +``` + +**What it does:** +1. Verifies Azure CLI installation and login status +2. Checks that you're logged into the "ARO Hosted Control Planes" Azure account +3. Clones the ARO-HCP repository to a temporary location +4. Sets required environment variables (USER, PERSIST, GITHUB_ACTIONS, GOTOOLCHAIN) +5. Runs `make personal-dev-env` to deploy the environment +6. Monitors and reports deployment status + +**Prerequisites:** +- Azure CLI installed (`brew install azure-cli` on macOS) +- Logged into correct Azure account: `az login` +- Valid Azure permissions for resource creation + +**Environment Variables Set:** +- `USER=oasis` (only if not already set) +- `PERSIST=true` +- `GITHUB_ACTIONS=true` +- `GOTOOLCHAIN=go1.24.4` + +**Documentation:** See [setup-maestro-cluster/SKILL.md](setup-maestro-cluster/SKILL.md) + +--- + +### 2. run-e2e-tests + +Runs end-to-end or upgrade tests on existing long-running Maestro clusters deployed in Azure AKS. + +**Usage:** +```bash +/run-e2e-tests [test-type] +``` + +Where `test-type` can be: +- `upgrade`: Run upgrade tests (default) +- `e2e`: Run standard E2E tests with Istio +- `all`: Run both upgrade and e2e tests + +**What it does:** +1. Verifies required tools (az, kubectl, kubelogin, jq) +2. Fetches AKS credentials for svc-cluster and mgmt-cluster +3. Converts kubeconfig for azurecli authentication +4. 
Generates in-cluster kubeconfig with service account tokens +5. Extracts deployment information (commit SHA, consumer name) +6. Runs selected test type(s) +7. Summarizes test results and failures +8. Cleans up test resources + +**Prerequisites:** +- Azure CLI, kubectl, kubelogin, jq must be installed +- Logged into Azure with cluster access +- Long-running clusters must be already deployed +- Required environment variables: + ```bash + export SVC_RESOURCE_GROUP="your-svc-rg" + export SVC_CLUSTER_NAME="your-svc-cluster" + export MGMT_RESOURCE_GROUP="your-mgmt-rg" + export MGMT_CLUSTER_NAME="your-mgmt-cluster" + ``` + +**Test Types:** +- **upgrade**: Pre-upgrade tests, server upgrade, post-upgrade tests, agent upgrade +- **e2e**: E2E tests with Istio service mesh +- **all**: Runs both upgrade and e2e tests sequentially + +**Documentation:** See [run-e2e-tests/SKILL.md](run-e2e-tests/SKILL.md) + +--- + +### 3. diagnose-maestro-deployment + +Automatically diagnoses failed Maestro cluster deployments by analyzing Helm releases, pod status, and resource conflicts. + +**Usage:** +```bash +# Diagnose using deployment output file +/diagnose-maestro-deployment /path/to/deployment.output + +# Diagnose using cluster information directly +/diagnose-maestro-deployment --svc-rg --svc-cluster --mgmt-rg --mgmt-cluster +``` + +**What it does:** +1. Analyzes deployment output to identify resource groups and cluster names +2. Retrieves credentials for both service and management clusters +3. Lists all Helm releases and identifies failed ones +4. Inspects pod states in critical namespaces +5. Checks for known issues (e.g., ClusterSizingConfiguration conflicts) +6. Identifies resource conflicts and timing issues +7. Generates a comprehensive diagnostic report +8. 
Saves the report to a timestamped file + +**Prerequisites:** +- Azure CLI, kubectl, helm must be installed +- Logged into Azure with cluster access +- jq installed for JSON parsing +- Access to deployment output or cluster information + +**Known Issues Detected:** +- **Hypershift ClusterSizingConfiguration conflict**: Helm post-install hook conflicts with operator-managed resources +- **MCE deployment failures**: Multicluster Engine Helm release issues +- **Missing Maestro in service cluster**: Deployment halted before service cluster setup + +**Output:** +The skill generates a detailed report saved as `maestro-diagnosis-YYYYMMDD-HHMMSS.txt` containing: +- Helm release status for both clusters +- Pod status in critical namespaces +- Failed release details +- Resource conflict analysis +- Root cause identification +- Recommended remediation steps + +**Exit Codes:** +- `0`: No critical issues found +- `1`: Issues detected (see report for details) + +**Documentation:** See [diagnose-maestro-deployment/SKILL.md](diagnose-maestro-deployment/SKILL.md) + +--- + +## Hooks + +### deployment-monitor.sh + +A hook that monitors long-running deployment processes and sends notifications. + +**Features:** +- Desktop notifications (macOS/Linux) +- Slack notifications via webhook +- Customizable status messages +- Real-time deployment monitoring +- Configurable timeout (default: 2 hours) + +**Dependencies:** +- Required: `bash`, `wc`, `tail`, `sed`, `grep`, `cat`, `tr`, `date`, `sleep` (standard Unix tools) +- Optional: `curl` (for Slack notifications), `osascript` (for macOS notifications), `notify-send` (for Linux notifications) + +**Configuration:** + +To enable Slack notifications: + +1. Create a Slack webhook: + - Go to + - Create an Incoming Webhook for your channel + - Copy the webhook URL + +2. Set the webhook URL as an environment variable: + ```bash + export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/YOUR/WEBHOOK/URL" + ``` + +3. 
(Optional) Configure timeout: + ```bash + export MONITOR_TIMEOUT=3600 # 1 hour in seconds + ``` + +**Usage:** +```bash +# Monitor a deployment task in real-time +.claude/hooks/deployment-monitor.sh monitor + +# Example: +.claude/hooks/deployment-monitor.sh monitor b4ac6c1 + +# Send a manual completion notification +.claude/hooks/deployment-monitor.sh notify "COMPLETE" "Deployment finished successfully" + +# Send a failure notification +.claude/hooks/deployment-monitor.sh notify "FAILED" "Deployment failed with errors" +``` + +**What the monitor does:** +1. Tracks the deployment task by its task ID +2. Shows real-time progress updates (line count, elapsed time) +3. Displays the latest deployment activity +4. Detects when the task completes or times out +5. Automatically sends notifications (Slack + desktop) when done +6. Reports final status and deployment duration + +The hook will: +1. Send a Slack notification (if configured) with color-coded messages +2. Send desktop notifications on macOS (via osascript) or Linux (via notify-send) +3. Return proper exit codes: 0 for success, 1 for failure, 2 for timeout + +--- + +## How Skills Work + +Skills are invoked in Claude Code using the `/` prefix followed by the skill name. When you run a skill: + +1. Claude Code reads the `SKILL.md` file from the skill's folder +2. Executes the bash script in the Implementation section +3. Returns the output to you in the chat + +Skills are a powerful way to automate complex, multi-step workflows that you perform frequently. + +## Creating New Skills + +To create a new skill: + +1. Create a new folder in `.claude/skills/` with a descriptive name: + ```bash + mkdir -p .claude/skills/my-new-skill + ``` + +2. Create a `SKILL.md` file in that folder with these sections: + - Title and description + - Prerequisites + - Usage example + - Steps (what the skill does) + - Implementation (bash script in a code block) + - Notes + +3. 
Make sure the bash script is well-commented and handles errors + +4. Update this README.md to document the new skill + +See existing skills as examples: +- [setup-maestro-cluster/SKILL.md](setup-maestro-cluster/SKILL.md) +- [run-e2e-tests/SKILL.md](run-e2e-tests/SKILL.md) + +## Tips for Writing Skills + +- **Error Handling**: Always check exit codes and provide clear error messages +- **Prerequisites**: Document all required tools and environment variables +- **Idempotency**: Skills should be safe to run multiple times +- **Cleanup**: Clean up temporary files and resources +- **Progress Updates**: Provide clear progress indicators (✓, step numbers, etc.) +- **Exit Codes**: Use proper exit codes (0 for success, non-zero for failures) +- **Environment Variables**: Use environment variables for configuration instead of hard-coded values diff --git a/.claude/skills/diagnose-maestro-deployment/SKILL.md b/.claude/skills/diagnose-maestro-deployment/SKILL.md new file mode 100644 index 00000000..b010ff98 --- /dev/null +++ b/.claude/skills/diagnose-maestro-deployment/SKILL.md @@ -0,0 +1,36 @@ +--- +name: diagnose-maestro-deployment +description: Diagnoses failed Maestro cluster deployments by analyzing Helm releases, pod status, and resource conflicts +category: Troubleshooting +tags: [azure, aks, maestro, troubleshooting, debugging, helm, kubernetes] +--- + +# Diagnose Maestro Deployment + +Automatically diagnoses failed Maestro cluster deployments by: +- Analyzing deployment output to identify resource groups and cluster names +- Checking Helm release status in both service and management clusters +- Inspecting pod states and error conditions +- Identifying resource conflicts and timing issues +- Generating a detailed analysis report with root cause and recommendations + +**Prerequisites:** +- Azure CLI installed and logged in +- kubectl and kubelogin installed +- Access to the failed deployment output or cluster information + +**Usage:** +```bash +# Diagnose using 
deployment output file +diagnose-maestro-deployment /path/to/deployment.output + +# Diagnose using cluster names directly +diagnose-maestro-deployment --svc-rg --svc-cluster --mgmt-rg --mgmt-cluster +``` + +```bash +#!/bin/bash +# Execute the diagnostic script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +exec "$SCRIPT_DIR/scripts/diagnose.sh" "$@" +``` diff --git a/.claude/skills/diagnose-maestro-deployment/scripts/analyze-logs.sh b/.claude/skills/diagnose-maestro-deployment/scripts/analyze-logs.sh new file mode 100755 index 00000000..8170fdd7 --- /dev/null +++ b/.claude/skills/diagnose-maestro-deployment/scripts/analyze-logs.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# Log Analysis Module for Maestro Deployment Diagnostics +# This module intelligently parses deployment logs to identify issues + +set -e + +# Extract failed Helm releases from deployment logs +extract_failed_helm_releases() { + local log_file=$1 + local temp_dir=$2 + + # Look for Helm deployment errors in logs + grep -i "failed to deploy helm release\|helm release.*failed\|error running Helm" "$log_file" 2>/dev/null | \ + grep -o "helm release: [a-zA-Z0-9-]*\|release [a-zA-Z0-9-]*" | \ + awk '{print $NF}' | \ + sort -u > "$temp_dir/failed_helm_releases.txt" || true + + # Also extract from error messages with release names + grep -oP '(?<=aro-hcp-)[a-zA-Z0-9-]+(?=/templates)' "$log_file" 2>/dev/null | \ + sort -u >> "$temp_dir/failed_helm_releases.txt" || true +} + +# Extract resource conflicts from logs +extract_resource_conflicts() { + local log_file=$1 + local temp_dir=$2 + + # Look for resource conflict errors + if grep -q "Apply failed with.*conflicts\|conflict occurred while applying" "$log_file"; then + python3 -c " +import re +import sys + +try: + with open('$log_file', 'r') as f: + content = f.read() + + conflicts = {} + + # Find conflict patterns + for line in content.split('\n'): + if 'Apply failed with' in line or 'conflict occurred while applying' in line: + # Extract resource 
type and name + resource_match = re.search(r'(?:object|resource)\s+/([a-zA-Z0-9-]+)\s+([a-zA-Z0-9./]+)', line) + if resource_match: + resource_name = resource_match.group(1) + resource_type = resource_match.group(2) + + # Extract conflicting fields + fields = [] + + # Pattern 1: Field paths in error message + field_matches = re.findall(r'\.spec\.[a-zA-Z0-9.\[\]=\"]+', line) + fields.extend(field_matches) + + # Pattern 2: conflicts with manager + manager_match = re.search(r'conflicts with\\\\\"([^\\\\]+)\\\\\"', line) + manager = manager_match.group(1) if manager_match else 'unknown' + + if resource_name not in conflicts: + conflicts[resource_name] = { + 'type': resource_type, + 'fields': set(), + 'manager': manager + } + + conflicts[resource_name]['fields'].update(fields) + + # Output conflicts in structured format + for resource, info in conflicts.items(): + print(f'CONFLICT:{resource}:{info[\"type\"]}:{info[\"manager\"]}:{\"|\".join(sorted(info[\"fields\"]))}') + +except Exception as e: + print(f'ERROR: Failed to parse conflicts: {e}', file=sys.stderr) +" > "$temp_dir/resource_conflicts.txt" 2>/dev/null || echo "ERROR:parse_failed" > "$temp_dir/resource_conflicts.txt" + fi +} + +# Extract deployment timeline from logs +extract_deployment_timeline() { + local log_file=$1 + local temp_dir=$2 + + # Extract timestamped events + grep -E '^\[?[0-9]{2}:[0-9]{2}:[0-9]{2}' "$log_file" | \ + grep -i "error\|failed\|success\|complete\|deployed\|installing" | \ + tail -50 > "$temp_dir/timeline.txt" || true +} + +# Identify root cause from error patterns +identify_root_cause() { + local log_file=$1 + local temp_dir=$2 + + # Common error patterns and their interpretations + python3 -c " +import re + +error_patterns = { + 'timing_conflict': r'conflict occurred while applying.*hook', + 'resource_exists': r'already exists', + 'timeout': r'context (deadline exceeded|canceled)|timed? 
out', + 'authentication': r'authentication|unauthorized|forbidden', + 'network': r'connection refused|network.*unreachable|dial tcp', + 'resource_limit': r'(insufficient|exceeded).*resources', + 'dependency_missing': r'not found.*required|missing.*dependency', + 'api_error': r'Internal error occurred|API.*error', + 'helm_hook_failed': r'Hook.*failed|post-install.*failed', +} + +with open('$log_file', 'r') as f: + content = f.read() + +detected_patterns = [] +for pattern_name, pattern_regex in error_patterns.items(): + if re.search(pattern_regex, content, re.IGNORECASE): + detected_patterns.append(pattern_name) + + # Find specific error context + matches = re.finditer(pattern_regex, content, re.IGNORECASE) + for match in list(matches)[:3]: # Limit to first 3 + start = max(0, match.start() - 200) + end = min(len(content), match.end() + 200) + context = content[start:end].replace('\n', ' ') + print(f'{pattern_name}:::{context}') +" > "$temp_dir/error_patterns.txt" 2>/dev/null || true +} + +# Extract component status from logs +extract_component_status() { + local log_file=$1 + local temp_dir=$2 + + # Look for explicit status messages + grep -i "status.*complete\|deployment.*success\|installed.*successfully" "$log_file" | \ + tail -20 > "$temp_dir/success_components.txt" || true + + grep -i "status.*fail\|deployment.*fail\|installation.*fail" "$log_file" | \ + tail -20 > "$temp_dir/failed_components.txt" || true +} + +# Main analysis function +analyze_deployment_logs() { + local log_file=$1 + local output_dir=$2 + + if [ ! 
-f "$log_file" ]; then + echo "ERROR: Log file not found: $log_file" + return 1 + fi + + mkdir -p "$output_dir" + + echo "Analyzing deployment logs: $log_file" + echo "Output directory: $output_dir" + echo "" + + # Run all analysis functions + extract_failed_helm_releases "$log_file" "$output_dir" + extract_resource_conflicts "$log_file" "$output_dir" + extract_deployment_timeline "$log_file" "$output_dir" + identify_root_cause "$log_file" "$output_dir" + extract_component_status "$log_file" "$output_dir" + + echo "Log analysis complete. Results in: $output_dir" +} + +# If script is executed directly, run analysis +if [ "${BASH_SOURCE[0]}" = "${0}" ]; then + if [ $# -lt 2 ]; then + echo "Usage: $0 " + exit 1 + fi + + analyze_deployment_logs "$1" "$2" +fi diff --git a/.claude/skills/diagnose-maestro-deployment/scripts/diagnose.sh b/.claude/skills/diagnose-maestro-deployment/scripts/diagnose.sh new file mode 100755 index 00000000..e5dd0cb7 --- /dev/null +++ b/.claude/skills/diagnose-maestro-deployment/scripts/diagnose.sh @@ -0,0 +1,643 @@ +#!/bin/bash +set -e + +echo "==========================================" +echo "Maestro Deployment Diagnostic Tool" +echo "==========================================" +echo "" + +# Initialize variables +DEPLOYMENT_OUTPUT="" +SVC_RESOURCE_GROUP="" +SVC_CLUSTER_NAME="" +MGMT_RESOURCE_GROUP="" +MGMT_CLUSTER_NAME="" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --svc-rg) + SVC_RESOURCE_GROUP="$2" + shift 2 + ;; + --svc-cluster) + SVC_CLUSTER_NAME="$2" + shift 2 + ;; + --mgmt-rg) + MGMT_RESOURCE_GROUP="$2" + shift 2 + ;; + --mgmt-cluster) + MGMT_CLUSTER_NAME="$2" + shift 2 + ;; + *) + if [ -z "$DEPLOYMENT_OUTPUT" ] && [ -f "$1" ]; then + DEPLOYMENT_OUTPUT="$1" + fi + shift + ;; + esac +done + +# Function to extract cluster info from deployment output +extract_cluster_info() { + local output_file=$1 + + echo "Analyzing deployment output: $output_file" + echo "" + + # Try to find cluster names from the output + if grep -q 
"pers-usw3" "$output_file"; then + # Extract cluster name pattern + local cluster_base=$(grep -o "pers-usw3[a-z]*" "$output_file" | head -1) + + if [ -n "$cluster_base" ]; then + SVC_CLUSTER_NAME="${cluster_base}-svc" + SVC_RESOURCE_GROUP="hcp-underlay-${cluster_base}-svc" + MGMT_CLUSTER_NAME="${cluster_base}-mgmt-1" + MGMT_RESOURCE_GROUP="hcp-underlay-${cluster_base}-mgmt-1" + + echo "Detected clusters:" + echo " Service: $SVC_RESOURCE_GROUP / $SVC_CLUSTER_NAME" + echo " Management: $MGMT_RESOURCE_GROUP / $MGMT_CLUSTER_NAME" + echo "" + fi + fi +} + +# Extract cluster info if deployment output provided +if [ -n "$DEPLOYMENT_OUTPUT" ]; then + extract_cluster_info "$DEPLOYMENT_OUTPUT" +fi + +# Validate we have cluster information +if [ -z "$SVC_RESOURCE_GROUP" ] || [ -z "$SVC_CLUSTER_NAME" ] || \ + [ -z "$MGMT_RESOURCE_GROUP" ] || [ -z "$MGMT_CLUSTER_NAME" ]; then + echo "ERROR: Could not determine cluster information." + echo "" + echo "Usage:" + echo " $0 " + echo " $0 --svc-rg --svc-cluster --mgmt-rg --mgmt-cluster " + exit 1 +fi + +# Check prerequisites +echo "Step 1: Checking prerequisites..." +if ! command -v az &> /dev/null; then + echo "ERROR: Azure CLI not installed" + exit 1 +fi + +if ! command -v kubectl &> /dev/null; then + echo "ERROR: kubectl not installed" + exit 1 +fi + +if ! command -v helm &> /dev/null; then + echo "ERROR: helm not installed" + exit 1 +fi + +if ! command -v jq &> /dev/null; then + echo "ERROR: jq not installed (required for JSON parsing)" + echo "Install with: brew install jq (macOS) or apt-get install jq (Linux)" + exit 1 +fi + +if ! az account show &> /dev/null; then + echo "ERROR: Not logged into Azure" + exit 1 +fi + +# kubelogin is optional but recommended for Azure AD authentication +if ! 
command -v kubelogin &> /dev/null; then + echo "WARNING: kubelogin not installed (Azure AD authentication may fail)" + echo "Install with: brew install Azure/kubelogin/kubelogin (macOS)" +fi + +echo "✓ All prerequisites met" +echo "" + +# Create temporary directory for kubeconfigs +TEMP_DIR=$(mktemp -d) +trap 'rm -rf "$TEMP_DIR"' EXIT + +SVC_KUBECONFIG="$TEMP_DIR/svc.kubeconfig" +MGMT_KUBECONFIG="$TEMP_DIR/mgmt.kubeconfig" + +# Get cluster credentials +echo "Step 2: Retrieving cluster credentials..." + +# Initialize issue tracking early (will be used if credentials fail) +CREDENTIAL_ISSUES=0 + +if az aks get-credentials \ + --resource-group "$SVC_RESOURCE_GROUP" \ + --name "$SVC_CLUSTER_NAME" \ + --overwrite-existing \ + -f "$SVC_KUBECONFIG" 2>/dev/null; then + echo "✓ Service cluster credentials retrieved" + # kubelogin may fail but shouldn't stop the script + if command -v kubelogin &> /dev/null; then + kubelogin convert-kubeconfig -l azurecli --kubeconfig "$SVC_KUBECONFIG" 2>/dev/null || true + fi +else + echo "✗ Failed to get service cluster credentials" + SVC_KUBECONFIG="" + CREDENTIAL_ISSUES=$((CREDENTIAL_ISSUES + 1)) +fi + +if az aks get-credentials \ + --resource-group "$MGMT_RESOURCE_GROUP" \ + --name "$MGMT_CLUSTER_NAME" \ + --overwrite-existing \ + -f "$MGMT_KUBECONFIG" 2>/dev/null; then + echo "✓ Management cluster credentials retrieved" + # kubelogin may fail but shouldn't stop the script + if command -v kubelogin &> /dev/null; then + kubelogin convert-kubeconfig -l azurecli --kubeconfig "$MGMT_KUBECONFIG" 2>/dev/null || true + fi +else + echo "✗ Failed to get management cluster credentials" + MGMT_KUBECONFIG="" + CREDENTIAL_ISSUES=$((CREDENTIAL_ISSUES + 1)) +fi + +echo "" + +# Step 3: Analyze deployment logs if provided +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LOG_ANALYSIS_DIR="$TEMP_DIR/log-analysis" + +if [ -n "$DEPLOYMENT_OUTPUT" ] && [ -f "$DEPLOYMENT_OUTPUT" ]; then + echo "Step 3: Analyzing deployment logs..." 
+ echo "" + + # Source the log analysis module + source "$SCRIPT_DIR/analyze-logs.sh" + + # Run log analysis + analyze_deployment_logs "$DEPLOYMENT_OUTPUT" "$LOG_ANALYSIS_DIR" + + echo "" +fi + +# Initialize report +REPORT_FILE="$TEMP_DIR/diagnosis-report.txt" + +cat > "$REPORT_FILE" << EOF +======================================== +Maestro Deployment Diagnostic Report +======================================== +Generated: $(date) + +Clusters Analyzed: + Service: $SVC_RESOURCE_GROUP / $SVC_CLUSTER_NAME + Management: $MGMT_RESOURCE_GROUP / $MGMT_CLUSTER_NAME + +EOF + +# Step 4: Analyze Management Cluster (dynamically based on log analysis) +echo "Step 4: Analyzing Management Cluster..." +echo "" + +if [ -n "$MGMT_KUBECONFIG" ]; then + echo "Management Cluster Analysis" >> "$REPORT_FILE" + echo "==========================" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Get all Helm releases for reference + echo "Helm Releases:" >> "$REPORT_FILE" + helm --kubeconfig "$MGMT_KUBECONFIG" list -A -o json | \ + jq -r '.[] | "\(.name) (\(.namespace)): \(.status) - Chart: \(.chart)"' >> "$REPORT_FILE" 2>/dev/null || \ + echo "Failed to retrieve Helm releases" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Determine what to check based on log analysis + FAILED_RELEASES="" + + if [ -f "$LOG_ANALYSIS_DIR/failed_helm_releases.txt" ]; then + # Use log analysis results to identify failed releases + FAILED_RELEASES=$(cat "$LOG_ANALYSIS_DIR/failed_helm_releases.txt" | tr '\n' ' ') + echo "Failed releases identified from logs: $FAILED_RELEASES" + fi + + # If no log analysis or empty results, fallback to checking cluster state + if [ -z "$FAILED_RELEASES" ]; then + FAILED_RELEASES=$(helm --kubeconfig "$MGMT_KUBECONFIG" list -A -o json | \ + jq -r '.[] | select(.status == "failed") | .name + ":" + .namespace' 2>/dev/null || echo "") + fi + + if [ -n "$FAILED_RELEASES" ]; then + echo "Investigating Failed Components:" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Process 
each failed release + for release_info in $FAILED_RELEASES; do + if [[ "$release_info" == *":"* ]]; then + release=$(echo "$release_info" | cut -d: -f1) + namespace=$(echo "$release_info" | cut -d: -f2) + else + release="$release_info" + # Try to find namespace from Helm + namespace=$(helm --kubeconfig "$MGMT_KUBECONFIG" list -A -o json 2>/dev/null | \ + jq -r ".[] | select(.name == \"$release\") | .namespace" | head -1) + if [ -z "$namespace" ]; then + namespace="unknown" + fi + fi + + echo "Analyzing: $release in namespace $namespace" + echo "[$release] (namespace: $namespace)" >> "$REPORT_FILE" + echo "---" >> "$REPORT_FILE" + + if [ "$namespace" != "unknown" ]; then + # Get pod status + echo "Pods:" >> "$REPORT_FILE" + kubectl --kubeconfig "$MGMT_KUBECONFIG" get pods -n "$namespace" -o wide 2>/dev/null >> "$REPORT_FILE" || \ + echo " No pods found or error retrieving pods" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Check for resource conflicts if indicated in log analysis + if [ -f "$LOG_ANALYSIS_DIR/resource_conflicts.txt" ]; then + while IFS= read -r line; do + # Parse CONFLICT:resource_name:resource_type:manager:fields format + # Use awk to properly split on first 4 colons only + conflict_type=$(echo "$line" | awk -F: '{print $1}') + if [ "$conflict_type" = "CONFLICT" ]; then + resource_name=$(echo "$line" | awk -F: '{print $2}') + # Resource type may contain colons, extract everything between 2nd and 3rd-to-last colon + resource_type=$(echo "$line" | awk -F: '{for(i=3;i> "$REPORT_FILE" + echo " Resource: $resource_name (type: $resource_type)" >> "$REPORT_FILE" + echo " Managed by: $manager" >> "$REPORT_FILE" + if [ -n "$fields" ]; then + echo " Conflicting fields:" >> "$REPORT_FILE" + echo "$fields" | tr '|' '\n' | sed 's/^/ - /' >> "$REPORT_FILE" + fi + echo "" >> "$REPORT_FILE" + fi + done < "$LOG_ANALYSIS_DIR/resource_conflicts.txt" + fi + fi + echo "" >> "$REPORT_FILE" + done + else + echo "No failed Helm releases detected" >> "$REPORT_FILE" + 
echo "✓ No failed Helm releases in management cluster" + fi + echo "" >> "$REPORT_FILE" +fi + +echo "" + +# Step 5: Analyze Service Cluster +echo "Step 5: Analyzing Service Cluster..." +echo "" + +if [ -n "$SVC_KUBECONFIG" ]; then + echo "Service Cluster Analysis" >> "$REPORT_FILE" + echo "========================" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Get Helm releases + echo "Helm Releases:" >> "$REPORT_FILE" + helm --kubeconfig "$SVC_KUBECONFIG" list -A -o json | \ + jq -r '.[] | "\(.name) (\(.namespace)): \(.status) - Chart: \(.chart)"' >> "$REPORT_FILE" 2>/dev/null || \ + echo "Failed to retrieve Helm releases" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Check maestro namespace + echo "Maestro Namespace Status:" >> "$REPORT_FILE" + if kubectl --kubeconfig "$SVC_KUBECONFIG" get namespace maestro &>/dev/null; then + kubectl --kubeconfig "$SVC_KUBECONFIG" get pods -n maestro -o wide 2>/dev/null >> "$REPORT_FILE" || \ + echo "No pods in maestro namespace" >> "$REPORT_FILE" + else + echo "Maestro namespace does not exist" >> "$REPORT_FILE" + fi + echo "" >> "$REPORT_FILE" +fi + +echo "" + +# Step 6: Include Log Analysis Results in Report +if [ -d "$LOG_ANALYSIS_DIR" ]; then + echo "Deployment Log Analysis" >> "$REPORT_FILE" + echo "======================" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + + # Include error patterns + if [ -f "$LOG_ANALYSIS_DIR/error_patterns.txt" ] && [ -s "$LOG_ANALYSIS_DIR/error_patterns.txt" ]; then + echo "Identified Error Patterns:" >> "$REPORT_FILE" + while IFS= read -r line; do + # Split on literal ':::' delimiter + pattern="${line%%:::*}" + context="${line#*:::}" + echo " • Pattern: $pattern" >> "$REPORT_FILE" + echo " Context: $(echo "$context" | head -c 200)..." 
>> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + done < "$LOG_ANALYSIS_DIR/error_patterns.txt" + fi + + # Include deployment timeline + if [ -f "$LOG_ANALYSIS_DIR/timeline.txt" ] && [ -s "$LOG_ANALYSIS_DIR/timeline.txt" ]; then + echo "Deployment Timeline (last 20 events):" >> "$REPORT_FILE" + tail -20 "$LOG_ANALYSIS_DIR/timeline.txt" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + fi +fi + +# Generate diagnosis summary +echo "Diagnosis Summary" >> "$REPORT_FILE" +echo "=================" >> "$REPORT_FILE" +echo "" >> "$REPORT_FILE" + +# Root Cause Analysis (Dynamic based on error patterns) +echo "ROOT CAUSE ANALYSIS:" >> "$REPORT_FILE" +echo "-------------------" >> "$REPORT_FILE" +echo "" >> "$REPORT_FILE" + +# Analyze error patterns from logs +if [ -f "$LOG_ANALYSIS_DIR/error_patterns.txt" ] && [ -s "$LOG_ANALYSIS_DIR/error_patterns.txt" ]; then + # Group errors by pattern type (bash 3.x compatible) + # Count occurrences of each pattern + cut -d':' -f1 < "$LOG_ANALYSIS_DIR/error_patterns.txt" | sort | uniq -c | sort -rn > "$LOG_ANALYSIS_DIR/pattern_counts.txt" + + # Determine primary failure based on most common pattern + primary_pattern="" + if [ -s "$LOG_ANALYSIS_DIR/pattern_counts.txt" ]; then + # Get the first line (highest count) + read -r count pattern_name < "$LOG_ANALYSIS_DIR/pattern_counts.txt" + primary_pattern="$pattern_name" + fi + + if [ -n "$primary_pattern" ]; then + case "$primary_pattern" in + timing_conflict) + echo "Primary Failure Type: Timing/Race Condition" >> "$REPORT_FILE" + echo "Multiple components attempted to manage the same resources simultaneously," >> "$REPORT_FILE" + echo "leading to conflicts. This often occurs when operators start before Helm" >> "$REPORT_FILE" + echo "post-install hooks complete." >> "$REPORT_FILE" + ;; + timeout) + echo "Primary Failure Type: Timeout" >> "$REPORT_FILE" + echo "One or more operations exceeded their time limits. 
This may indicate" >> "$REPORT_FILE"
        echo "slow network, resource constraints, or hung processes." >> "$REPORT_FILE"
        ;;
      authentication)
        echo "Primary Failure Type: Authentication/Authorization" >> "$REPORT_FILE"
        echo "Deployment failed due to insufficient permissions or invalid credentials." >> "$REPORT_FILE"
        ;;
      network)
        echo "Primary Failure Type: Network Connectivity" >> "$REPORT_FILE"
        echo "Network-related errors prevented successful deployment." >> "$REPORT_FILE"
        ;;
      resource_limit)
        echo "Primary Failure Type: Resource Constraints" >> "$REPORT_FILE"
        echo "Insufficient cluster resources (CPU, memory, or storage) to complete deployment." >> "$REPORT_FILE"
        ;;
      *)
        # Unrecognized pattern name: report it verbatim.
        echo "Primary Failure Type: $primary_pattern" >> "$REPORT_FILE"
        echo "Multiple errors of this type detected in deployment logs." >> "$REPORT_FILE"
        ;;
    esac
    echo "" >> "$REPORT_FILE"
  fi
fi

# Analyze resource conflicts reported by the log-analysis step.
if [ -f "$LOG_ANALYSIS_DIR/resource_conflicts.txt" ] && [ -s "$LOG_ANALYSIS_DIR/resource_conflicts.txt" ]; then
  echo "Resource Conflicts Detected:" >> "$REPORT_FILE"
  while IFS= read -r line; do
    # Each record is CONFLICT:resource_name:resource_type:manager:fields
    conflict_type=$(echo "$line" | awk -F: '{print $1}')
    if [ "$conflict_type" = "CONFLICT" ]; then
      resource_name=$(echo "$line" | awk -F: '{print $2}')
      # Resource type may itself contain colons: rejoin fields 3..NF-2.
      # NOTE(review): this span was garbled in the diff (text between "i<"
      # and the next ">" was lost); reconstructed from the record format
      # above, mirroring the parsing in the per-release analysis loop —
      # confirm against the original script.
      resource_type=$(echo "$line" | awk -F: '{t=$3; for(i=4;i<=NF-2;i++) t=t ":" $i; print t}')
      manager=$(echo "$line" | awk -F: '{print $(NF-1)}')
      fields=$(echo "$line" | awk -F: '{print $NF}')
      echo " • $resource_name (type: $resource_type)" >> "$REPORT_FILE"
      echo " Conflicting manager: $manager" >> "$REPORT_FILE"
      # [ -n "$fields" ] already excludes the empty string; the original's
      # extra [ "$fields" != "" ] test was redundant and has been dropped.
      if [ -n "$fields" ]; then
        echo " Fields:" >> "$REPORT_FILE"
        echo "$fields" | tr '|' '\n' | sed 's/^/ - /' >> "$REPORT_FILE"
      fi
      echo "" >> "$REPORT_FILE"
    fi
  done < "$LOG_ANALYSIS_DIR/resource_conflicts.txt"
fi

echo "" >> "$REPORT_FILE"
echo "DETAILED ISSUES:" >> "$REPORT_FILE"
echo "----------------" >> "$REPORT_FILE"
echo "" >> "$REPORT_FILE"

# Dynamically generate issues based on discoveries; counters are consumed by
# the summary section later in the script.
ISSUES_FOUND=0
CRITICAL_ISSUES=0
+# Issue: Credential failures (if any) +if [ "$CREDENTIAL_ISSUES" -gt 0 ]; then + ISSUES_FOUND=$((ISSUES_FOUND + 1)) + CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) + echo "[$ISSUES_FOUND] Failed to Retrieve Cluster Credentials" >> "$REPORT_FILE" + echo " Severity: CRITICAL" >> "$REPORT_FILE" + if [ -z "$SVC_KUBECONFIG" ]; then + echo " Failed: Service cluster ($SVC_CLUSTER_NAME)" >> "$REPORT_FILE" + fi + if [ -z "$MGMT_KUBECONFIG" ]; then + echo " Failed: Management cluster ($MGMT_CLUSTER_NAME)" >> "$REPORT_FILE" + fi + echo " " >> "$REPORT_FILE" + echo " Recommendation:" >> "$REPORT_FILE" + echo " Verify Azure credentials and cluster access permissions" >> "$REPORT_FILE" + echo " Check that resource groups and cluster names are correct" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" +fi + +# Issue: Failed Helm Releases (dynamic) +if [ -n "$FAILED_RELEASES" ]; then + for release_info in $FAILED_RELEASES; do + if [[ "$release_info" == *":"* ]]; then + release=$(echo "$release_info" | cut -d: -f1) + namespace=$(echo "$release_info" | cut -d: -f2) + else + release="$release_info" + namespace=$(helm --kubeconfig "$MGMT_KUBECONFIG" list -A -o json 2>/dev/null | \ + jq -r ".[] | select(.name == \"$release\") | .namespace" | head -1) + [ -z "$namespace" ] && namespace="unknown" + fi + + ISSUES_FOUND=$((ISSUES_FOUND + 1)) + echo "[$ISSUES_FOUND] Helm Release Failed: $release" >> "$REPORT_FILE" + echo " Namespace: $namespace" >> "$REPORT_FILE" + + # Determine severity based on pod status + severity="WARNING" + if [ "$namespace" != "unknown" ] && [ -n "$MGMT_KUBECONFIG" ]; then + running_pods=$(kubectl --kubeconfig "$MGMT_KUBECONFIG" get pods -n "$namespace" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) + total_pods=$(kubectl --kubeconfig "$MGMT_KUBECONFIG" get pods -n "$namespace" --no-headers 2>/dev/null | wc -l) + + if [ "$total_pods" -gt 0 ] && [ "$running_pods" -eq "$total_pods" ]; then + severity="WARNING" + echo " Severity: WARNING (Helm failed 
but pods are running)" >> "$REPORT_FILE" + echo " Actual Status: ✓ All $total_pods pods are Running" >> "$REPORT_FILE" + elif [ "$total_pods" -eq 0 ]; then + severity="CRITICAL" + CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) + echo " Severity: CRITICAL" >> "$REPORT_FILE" + echo " Actual Status: ✗ No pods found in namespace" >> "$REPORT_FILE" + else + # Partial failure: some pods running, some not - this is WARNING not CRITICAL + severity="WARNING" + echo " Severity: WARNING" >> "$REPORT_FILE" + echo " Actual Status: ⚠ $running_pods/$total_pods pods Running" >> "$REPORT_FILE" + fi + else + echo " Severity: $severity" >> "$REPORT_FILE" + fi + + # Add recommendation based on severity + echo " " >> "$REPORT_FILE" + echo " Recommendation:" >> "$REPORT_FILE" + if [ "$severity" = "WARNING" ]; then + echo " Helm failure may be a false-positive. Verify pods are functional." >> "$REPORT_FILE" + else + echo " Investigate pod failures and check Helm release logs." >> "$REPORT_FILE" + fi + echo "" >> "$REPORT_FILE" + done +fi + +# Issue 2: Missing deployments in service cluster +if [ -n "$SVC_KUBECONFIG" ]; then + # Check for maestro namespace + if ! 
kubectl --kubeconfig "$SVC_KUBECONFIG" get namespace maestro &>/dev/null; then + ISSUES_FOUND=$((ISSUES_FOUND + 1)) + CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) + echo "[$ISSUES_FOUND] Missing Deployment: maestro in Service Cluster" >> "$REPORT_FILE" + echo " Severity: CRITICAL" >> "$REPORT_FILE" + echo " Status: Namespace does not exist" >> "$REPORT_FILE" + echo " " >> "$REPORT_FILE" + echo " Likely Cause:" >> "$REPORT_FILE" + echo " Deployment pipeline may have halted before service cluster setup" >> "$REPORT_FILE" + echo " " >> "$REPORT_FILE" + echo " Recommendation:" >> "$REPORT_FILE" + echo " Option 1: Continue deployment to service cluster" >> "$REPORT_FILE" + echo " Option 2: Re-run complete deployment" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + fi +fi + +# Issue 3: Error patterns from logs +if [ -f "$LOG_ANALYSIS_DIR/error_patterns.txt" ] && [ -s "$LOG_ANALYSIS_DIR/error_patterns.txt" ]; then + # Group unique error types (bash 3.x compatible) + # Get unique patterns + cut -d':' -f1 < "$LOG_ANALYSIS_DIR/error_patterns.txt" | sort -u > "$LOG_ANALYSIS_DIR/unique_patterns.txt" + + while IFS= read -r pattern; do + ISSUES_FOUND=$((ISSUES_FOUND + 1)) + + # Get first context for this pattern + context=$(grep "^${pattern}:::" "$LOG_ANALYSIS_DIR/error_patterns.txt" | head -1 | cut -d':' -f4-) + + echo "[$ISSUES_FOUND] Error Pattern Detected: $pattern" >> "$REPORT_FILE" + + case "$pattern" in + timing_conflict|helm_hook_failed) + echo " Severity: WARNING" >> "$REPORT_FILE" + echo " Description: Resource timing conflict detected" >> "$REPORT_FILE" + ;; + timeout) + # Timeouts are warnings unless they prevent critical operations + echo " Severity: WARNING" >> "$REPORT_FILE" + echo " Description: Operation timed out" >> "$REPORT_FILE" + ;; + authentication) + echo " Severity: CRITICAL" >> "$REPORT_FILE" + CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) + echo " Description: Authentication or authorization failure" >> "$REPORT_FILE" + ;; + network) + echo " Severity: 
CRITICAL" >> "$REPORT_FILE" + CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) + echo " Description: Network connectivity issue" >> "$REPORT_FILE" + ;; + *) + echo " Severity: WARNING" >> "$REPORT_FILE" + echo " Description: $pattern error detected" >> "$REPORT_FILE" + ;; + esac + + echo " Context: $(echo "$context" | head -c 150)..." >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + done < "$LOG_ANALYSIS_DIR/unique_patterns.txt" +fi + +if [ $ISSUES_FOUND -eq 0 ]; then + echo "No issues detected." >> "$REPORT_FILE" + echo "✓ All services appear to be running normally." >> "$REPORT_FILE" +else + echo "" >> "$REPORT_FILE" + echo "SUMMARY:" >> "$REPORT_FILE" + echo "--------" >> "$REPORT_FILE" + echo "Total Issues: $ISSUES_FOUND" >> "$REPORT_FILE" + echo "Critical Issues: $CRITICAL_ISSUES" >> "$REPORT_FILE" + echo "" >> "$REPORT_FILE" + if [ $CRITICAL_ISSUES -eq 0 ]; then + echo "Overall Status: Deployment appears successful despite Helm warnings" >> "$REPORT_FILE" + echo "Action Required: None - Services are functional" >> "$REPORT_FILE" + else + echo "Overall Status: Deployment incomplete - requires intervention" >> "$REPORT_FILE" + echo "Action Required: Complete service cluster deployment" >> "$REPORT_FILE" + fi +fi + +echo "" >> "$REPORT_FILE" +echo "End of Diagnostic Report" >> "$REPORT_FILE" +echo "========================================" >> "$REPORT_FILE" + +# Display report +echo "==========================================" +echo "Diagnostic Report Generated" +echo "==========================================" +echo "" +cat "$REPORT_FILE" + +# Save report to current directory +REPORT_OUTPUT="maestro-diagnosis-$(date +%Y%m%d-%H%M%S).txt" +cp "$REPORT_FILE" "$REPORT_OUTPUT" + +echo "" +echo "==========================================" +echo "Report saved to: $REPORT_OUTPUT" +echo "==========================================" +echo "" + +# Send to Slack if webhook is configured +if [ -n "$SLACK_WEBHOOK_URL" ]; then + echo "Sending report to Slack..." 
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ if bash "$SCRIPT_DIR/send-to-slack.sh" "$REPORT_OUTPUT"; then
+ echo "✓ Report sent to Slack"
+ else
+ echo "⚠ Failed to send report to Slack (report still saved locally)"
+ fi
+ echo ""
+fi
+
+# Summary
+if [ $ISSUES_FOUND -gt 0 ]; then
+ echo "Found $ISSUES_FOUND issue(s). See report for details and recommendations."
+ exit 1
+else
+ echo "No critical issues found. Deployment appears successful."
+ exit 0
+fi
diff --git a/.claude/skills/diagnose-maestro-deployment/scripts/send-to-slack.sh b/.claude/skills/diagnose-maestro-deployment/scripts/send-to-slack.sh
new file mode 100755
index 00000000..85350524
--- /dev/null
+++ b/.claude/skills/diagnose-maestro-deployment/scripts/send-to-slack.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+set -e
+
+REPORT_FILE="$1"
+WEBHOOK_URL="${SLACK_WEBHOOK_URL}"
+
+if [ -z "$REPORT_FILE" ] || [ ! -f "$REPORT_FILE" ]; then
+ echo "ERROR: Report file not provided or does not exist"
+ echo "Usage: $0 <report-file>"
+ exit 1
+fi
+
+if [ -z "$WEBHOOK_URL" ]; then
+ echo "ERROR: SLACK_WEBHOOK_URL not set"
+ exit 1
+fi
+
+# Extract key information from report
+TOTAL_ISSUES=$(grep "^Total Issues:" "$REPORT_FILE" | awk '{print $3}')
+CRITICAL_ISSUES=$(grep "^Critical Issues:" "$REPORT_FILE" | awk '{print $3}')
+OVERALL_STATUS=$(grep "^Overall Status:" "$REPORT_FILE" | sed 's/Overall Status: //')
+ACTION_REQUIRED=$(grep "^Action Required:" "$REPORT_FILE" | sed 's/Action Required: //')
+
+# Extract cluster info
+SVC_CLUSTER=$(grep "Service:" "$REPORT_FILE" | head -1 | awk '{print $2"/"$3}')
+MGMT_CLUSTER=$(grep "Management:" "$REPORT_FILE" | head -1 | awk '{print $2"/"$3}')
+
+# Determine color based on critical issues
+if [ "$CRITICAL_ISSUES" = "0" ]; then
+ COLOR="warning"
+ EMOJI="⚠️"
+ STATUS_ICON="⚠️"
+else
+ COLOR="danger"
+ EMOJI="🚨"
+ STATUS_ICON="❌"
+fi
+
+# Extract primary failure reason - just the first line
+PRIMARY_REASON=$(grep "^Primary Failure:" "$REPORT_FILE" | sed 's/Primary Failure: //'
| head -1) + +# Extract conflict fields - clean format +CONFLICT_FIELDS=$(sed -n '/^Conflicting Fields:/,/^$/p' "$REPORT_FILE" 2>/dev/null | grep "•" | sed 's/ • //' | tr '\n' ',' | sed 's/,$//' | sed 's/,/, /g') + +# Extract cascading failure if exists +CASCADING=$(grep "^Cascading Failure:" "$REPORT_FILE" | sed 's/Cascading Failure: //' | head -1) + +# Build issue fields array - each issue gets its own field with detailed info +ISSUE_FIELDS="[]" +issue_num=1 +while IFS= read -r line; do + if [[ "$line" =~ ^\[([0-9]+)\]\ (.+)$ ]]; then + issue_title="${BASH_REMATCH[2]}" + + # Get full issue section + issue_section=$(sed -n "/^\[$issue_num\]/,/^\[/p" "$REPORT_FILE") + severity=$(echo "$issue_section" | grep "Severity:" | awk '{print $2}' | head -1) + + # Determine emoji based on severity + if [ "$severity" = "CRITICAL" ]; then + sev_emoji="🔴" + else + sev_emoji="🟡" + fi + + # Build structured issue description based on issue type + issue_value="" + + if [[ "$issue_title" =~ "Hypershift" ]]; then + # Hypershift issue - show clear cause and effect + issue_value="*Root Cause:*" + issue_value+=$'\n'"• Hypershift release post-install hook attempted to create ClusterSizingConfiguration resource" + issue_value+=$'\n'"• Resource was already created and managed by hypershift-operator-manager" + + # Get specific conflicting fields + specific_conflicts=$(echo "$issue_section" | sed -n '/Specific Conflicting Fields:/,/^ $/p' | grep "•" | sed 's/^ • //') + if [ -n "$specific_conflicts" ]; then + issue_value+=$'\n'"• Leading to field conflicts:" + while IFS= read -r field; do + issue_value+=$'\n'" - $field" + done <<< "$specific_conflicts" + fi + issue_value+=$'\n'"• Helm marked the release as failed due to post-install hook failure" + + # Get actual status + issue_value+=$'\n\n'"*Actual Status:*" + actual_status=$(echo "$issue_section" | sed -n '/Actual Service Status:/,/^ $/p' | grep "✓" | sed 's/^ //') + if [ -n "$actual_status" ]; then + while IFS= read -r line; do + 
issue_value+=$'\n'"$line" + done <<< "$actual_status" + fi + + issue_value+=$'\n\n'"*Conclusion:* Although Helm status is failed, services are actually running normally. This is a Helm hook timing issue." + + elif [[ "$issue_title" =~ "MCE" ]]; then + # MCE issue - similar structure + root_cause=$(echo "$issue_section" | sed -n '/Root Cause:/,/^ $/p' | grep -v "Root Cause:" | grep -v "^ $" | sed 's/^ //' | head -1) + issue_value="*Root Cause:*"$'\n'"• $root_cause" + + # Get actual status + issue_value+=$'\n\n'"*Actual Status:*" + actual_status=$(echo "$issue_section" | sed -n '/Actual Service Status:/,/^ $/p' | grep "✓" | sed 's/^ //') + if [ -n "$actual_status" ]; then + while IFS= read -r line; do + issue_value+=$'\n'"$line" + done <<< "$actual_status" + fi + + issue_value+=$'\n\n'"*Conclusion:* MCE services are running normally, Helm failure can be ignored." + + elif [[ "$issue_title" =~ "Maestro Not Deployed" ]]; then + # Maestro not deployed - show cascading failure + issue_value="*Root Cause (Cascading Failure):*" + what_happened=$(echo "$issue_section" | sed -n '/What Happened:/,/^ $/p' | grep -v "What Happened:" | grep -v "^ $" | sed 's/^ //' | sed 's/^[0-9]\. /• /') + if [ -n "$what_happened" ]; then + while IFS= read -r line; do + issue_value+=$'\n'"$line" + done <<< "$what_happened" + fi + + # Get impact + issue_value+=$'\n\n'"*Impact:*" + impact=$(echo "$issue_section" | sed -n '/Impact:/,/^ $/p' | grep "✗" | sed 's/^ //') + if [ -n "$impact" ]; then + while IFS= read -r line; do + issue_value+=$'\n'"$line" + done <<< "$impact" + fi + + issue_value+=$'\n\n'"*Conclusion:* Service cluster deployment incomplete, manual intervention required." 
+ fi + + # Add to issues array + if command -v jq &> /dev/null; then + issue_field=$(jq -n \ + --arg title "$sev_emoji Issue $issue_num: $issue_title" \ + --arg value "$issue_value" \ + '{ + title: $title, + value: $value, + short: false + }') + ISSUE_FIELDS=$(echo "$ISSUE_FIELDS" | jq --argjson field "$issue_field" '. += [$field]') + fi + + issue_num=$((issue_num + 1)) + fi +done < <(grep -E '^\[[0-9]+\]' "$REPORT_FILE") + +# Build clean, simple message using Slack fields format +if command -v jq &> /dev/null; then + # Build base fields first + BASE_FIELDS=$(jq -n \ + --arg status "$OVERALL_STATUS" \ + --arg total "$TOTAL_ISSUES" \ + --arg critical "$CRITICAL_ISSUES" \ + --arg svc "$SVC_CLUSTER" \ + --arg mgmt "$MGMT_CLUSTER" \ + --arg primary "$PRIMARY_REASON" \ + --arg conflicts "$CONFLICT_FIELDS" \ + --arg cascading "$CASCADING" \ + --arg action "$ACTION_REQUIRED" \ + '[ + { + title: "Status", + value: $status, + short: true + }, + { + title: "Issues", + value: ("Total: " + $total + " | Critical: " + $critical), + short: true + }, + { + title: "Service Cluster", + value: $svc, + short: true + }, + { + title: "Management Cluster", + value: $mgmt, + short: true + }, + { + title: "Primary Failure", + value: $primary, + short: false + }, + (if $conflicts != "" then { + title: "Conflicting Fields", + value: $conflicts, + short: false + } else empty end), + (if $cascading != "" then { + title: "Cascading Impact", + value: $cascading, + short: false + } else empty end) + ]') + + # Combine base fields with issue fields and action + ALL_FIELDS=$(echo "$BASE_FIELDS $ISSUE_FIELDS" | jq -s '.[0] + .[1] + [{title: "Action Required", value: $action, short: false}]' --arg action "$ACTION_REQUIRED") + + # Build final payload + PAYLOAD=$(jq -n \ + --arg color "$COLOR" \ + --arg title "$EMOJI Maestro Deployment Diagnostic" \ + --argjson fields "$ALL_FIELDS" \ + --argjson ts "$(date +%s)" \ + '{ + attachments: [{ + color: $color, + title: $title, + fields: $fields, + footer: 
"Maestro Diagnostic Tool", + ts: $ts, + mrkdwn_in: ["fields"] + }] + }') +elif command -v python3 &> /dev/null; then + # Fallback to simple format if jq not available + MESSAGE="*$EMOJI Maestro Deployment Diagnosis*\n\n" + MESSAGE+="*Status:* $STATUS_ICON $OVERALL_STATUS\n" + MESSAGE+="*Action Required:* $ACTION_REQUIRED\n" + MESSAGE+="*Total Issues:* \`$TOTAL_ISSUES\` | *Critical:* \`$CRITICAL_ISSUES\`\n\n" + MESSAGE+="*Clusters:*\n• Service: \`$SVC_CLUSTER\`\n• Management: \`$MGMT_CLUSTER\`\n\n" + MESSAGE+="See full diagnostic report for details." + + PAYLOAD=$(python3 -c " +import json, sys +payload = { + 'attachments': [{ + 'color': sys.argv[1], + 'text': sys.argv[2], + 'footer': 'Maestro Diagnostic Tool', + 'ts': int(sys.argv[3]), + 'mrkdwn_in': ['text'] + }] +} +print(json.dumps(payload)) +" "$COLOR" "$MESSAGE" "$(date +%s)") +else + echo "ERROR: jq or python3 required for JSON construction" + exit 1 +fi + +# Send to Slack +echo "Sending diagnostic report to Slack..." +if curl -X POST -H 'Content-type: application/json' \ + --data "$PAYLOAD" \ + "$WEBHOOK_URL" \ + --silent --show-error --fail; then + echo "" + echo "✓ Diagnostic report sent to Slack successfully" + exit 0 +else + echo "" + echo "✗ Failed to send diagnostic report to Slack" + exit 1 +fi diff --git a/.claude/skills/run-e2e-tests/SKILL.md b/.claude/skills/run-e2e-tests/SKILL.md new file mode 100644 index 00000000..95b8e413 --- /dev/null +++ b/.claude/skills/run-e2e-tests/SKILL.md @@ -0,0 +1,47 @@ +--- +name: run-e2e-tests +description: Runs end-to-end or upgrade tests on existing long-running Maestro clusters deployed in Azure AKS +category: Testing +tags: [azure, aks, maestro, e2e, testing, upgrade, kubernetes] +--- + +# Run E2E Tests on Long-Running Cluster + +Runs end-to-end or upgrade tests on existing long-running Maestro clusters deployed in Azure AKS. 
+ +**Prerequisites:** +- Azure CLI, kubectl, kubelogin, jq must be installed +- Logged into Azure with cluster access +- Long-running clusters must be already deployed +- Required environment variables: + - `SVC_RESOURCE_GROUP`: Resource group for service cluster + - `SVC_CLUSTER_NAME`: Name of service cluster + - `MGMT_RESOURCE_GROUP`: Resource group for management cluster + - `MGMT_CLUSTER_NAME`: Name of management cluster + +**Usage:** +```bash +/run-e2e-tests [test-type] +``` + +Where `test-type` can be: +- `upgrade`: Run upgrade tests (default) +- `e2e`: Run standard E2E tests with Istio +- `all`: Run both upgrade and e2e tests + +**Example:** +```bash +export SVC_RESOURCE_GROUP="hcp-underlay--svc" +export SVC_CLUSTER_NAME="-svc" +export MGMT_RESOURCE_GROUP="hcp-underlay--mgmt-1" +export MGMT_CLUSTER_NAME="-mgmt-1" + +/run-e2e-tests upgrade +``` + +```bash +#!/bin/bash +# Execute the E2E test script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +exec "$SCRIPT_DIR/scripts/run-tests.sh" "$@" +``` diff --git a/.claude/skills/run-e2e-tests/scripts/run-tests.sh b/.claude/skills/run-e2e-tests/scripts/run-tests.sh new file mode 100755 index 00000000..184456df --- /dev/null +++ b/.claude/skills/run-e2e-tests/scripts/run-tests.sh @@ -0,0 +1,316 @@ +#!/bin/bash +set -e + +echo "Starting E2E tests on long-running Maestro clusters..." + +# Parse test type argument +TEST_TYPE="${1:-upgrade}" + +# Step 1: Verify prerequisites +echo "Step 1: Verifying prerequisites..." + +# Check Azure CLI +if ! command -v az &> /dev/null; then + echo "ERROR: Azure CLI is not installed." + echo "Please install it from: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli" + exit 1 +fi +echo "✓ Azure CLI is installed" + +# Check kubectl +if ! command -v kubectl &> /dev/null; then + echo "ERROR: kubectl is not installed." + exit 1 +fi +echo "✓ kubectl is installed" + +# Check kubelogin +if ! command -v kubelogin &> /dev/null; then + echo "Installing kubelogin..." 
+ az aks install-cli
+fi
+echo "✓ kubelogin is installed ($(kubelogin --version))"
+
+# Check jq
+if ! command -v jq &> /dev/null; then
+ echo "ERROR: jq is not installed."
+ echo "Please install it: brew install jq (macOS) or sudo apt install jq (Linux)"
+ exit 1
+fi
+echo "✓ jq is installed"
+
+# Verify Azure account login
+if ! az account show &> /dev/null; then
+ echo "ERROR: Not logged into Azure."
+ echo "Please run: az login"
+ exit 1
+fi
+
+ACCOUNT_NAME=$(az account show --query "name" -o tsv)
+echo "✓ Logged into Azure account: $ACCOUNT_NAME"
+
+# Check required environment variables
+if [ -z "$SVC_RESOURCE_GROUP" ] || [ -z "$SVC_CLUSTER_NAME" ] || \
+ [ -z "$MGMT_RESOURCE_GROUP" ] || [ -z "$MGMT_CLUSTER_NAME" ]; then
+ echo "ERROR: Required environment variables are not set."
+ echo "Please set:"
+ echo " export SVC_RESOURCE_GROUP=<service-cluster-resource-group>"
+ echo " export SVC_CLUSTER_NAME=<service-cluster-name>"
+ echo " export MGMT_RESOURCE_GROUP=<management-cluster-resource-group>"
+ echo " export MGMT_CLUSTER_NAME=<management-cluster-name>"
+ exit 1
+fi
+
+echo "Using clusters:"
+echo " Service: $SVC_RESOURCE_GROUP/$SVC_CLUSTER_NAME"
+echo " Management: $MGMT_RESOURCE_GROUP/$MGMT_CLUSTER_NAME"
+echo ""
+
+# Step 2: Get AKS credentials
+echo "Step 2: Getting AKS credentials..."
+
+az aks get-credentials \
+ --resource-group "$SVC_RESOURCE_GROUP" \
+ --name "$SVC_CLUSTER_NAME" \
+ --overwrite-existing \
+ -f ./svc-cluster.kubeconfig
+
+az aks get-credentials \
+ --resource-group "$MGMT_RESOURCE_GROUP" \
+ --name "$MGMT_CLUSTER_NAME" \
+ --overwrite-existing \
+ -f ./mgmt-cluster.kubeconfig
+
+echo "✓ Credentials downloaded"
+
+# Step 3: Convert kubeconfig for non-interactive login
+echo "Step 3: Converting kubeconfig for azurecli..."
+
+kubelogin convert-kubeconfig -l azurecli --kubeconfig ./svc-cluster.kubeconfig
+kubelogin convert-kubeconfig -l azurecli --kubeconfig ./mgmt-cluster.kubeconfig
+
+echo "✓ Kubeconfig converted"
+
+# Verify cluster access
+echo "Verifying cluster access..."
+kubectl --kubeconfig ./svc-cluster.kubeconfig get pods -A -l app=maestro +kubectl --kubeconfig ./mgmt-cluster.kubeconfig get pods -A -l app=maestro-agent + +echo "✓ Cluster access verified" +echo "" + +# Step 4: Generate in-cluster kubeconfig +echo "Step 4: Generating in-cluster kubeconfig..." + +generate_in_cluster_kube() { + local kubeconfig=$1 + local type=$2 + + echo " Generating for $type cluster..." + + # Create service account + kubectl --kubeconfig "$kubeconfig" -n default create serviceaccount e2e-test-admin 2>/dev/null || true + + # Create cluster role binding + cat << EOF | kubectl --kubeconfig "$kubeconfig" apply -f - +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: e2e-test-admin +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-admin +subjects: +- kind: ServiceAccount + name: e2e-test-admin + namespace: default +EOF + + # Create token + local token + token=$(kubectl --kubeconfig "$kubeconfig" create token e2e-test-admin --namespace default --duration=8h) + + # Get cluster info + local api_server + local ca_cert + api_server=$(kubectl --kubeconfig "$kubeconfig" config view -o jsonpath='{.clusters[0].cluster.server}') + ca_cert=$(kubectl --kubeconfig "$kubeconfig" config view --raw -o jsonpath='{.clusters[0].cluster.certificate-authority-data}') + + # Create in-cluster kubeconfig + cat > "${type}-incluster.kubeconfig" << EOF +apiVersion: v1 +kind: Config +clusters: +- name: my-cluster + cluster: + server: "$api_server" + certificate-authority-data: "$ca_cert" +users: +- name: e2e-test-admin + user: + token: "$token" +contexts: +- name: e2e-test-admin-context + context: + cluster: my-cluster + user: e2e-test-admin + namespace: default +current-context: e2e-test-admin-context +EOF +} + +generate_in_cluster_kube "$(pwd)/svc-cluster.kubeconfig" "svc" +generate_in_cluster_kube "$(pwd)/mgmt-cluster.kubeconfig" "mgmt" + +echo "✓ In-cluster kubeconfig files generated" +echo "" + +# Step 5: 
Extract deployment information +echo "Step 5: Extracting deployment information..." + +# Get pod template hash for active replicaset +pod_template_hash=$(kubectl --kubeconfig "$(pwd)/svc-cluster.kubeconfig" get rs -l app=maestro -n maestro -o jsonpath='{range .items[?(@.spec.replicas>0)]}{.metadata.labels.pod-template-hash}{"\n"}{end}' | head -1) +if [ -z "$pod_template_hash" ]; then + echo "ERROR: No active replicaset found" + exit 1 +fi +echo " Pod template hash: $pod_template_hash" + +# Get pod name +pod_name=$(kubectl --kubeconfig "$(pwd)/svc-cluster.kubeconfig" get pods -n maestro -l pod-template-hash="$pod_template_hash" -o jsonpath='{.items[0].metadata.name}') +if [ -z "$pod_name" ]; then + echo "ERROR: No pod found for replicaset hash $pod_template_hash" + exit 1 +fi +echo " Pod name: $pod_name" + +# Extract commit SHA +commit_sha=$(kubectl --kubeconfig "$(pwd)/svc-cluster.kubeconfig" logs -n maestro "$pod_name" | grep -i "Git Commit" | grep -oE '[a-f0-9]{40}') +if [ -z "$commit_sha" ]; then + echo "ERROR: Could not extract commit SHA from pod logs" + exit 1 +fi +echo " Commit SHA: $commit_sha" + +# Get consumer name +consumer_name=$(kubectl --kubeconfig "$(pwd)/mgmt-cluster.kubeconfig" get deployment maestro-agent -n maestro -o yaml | grep -E "^\s+- --consumer-name=" | sed 's/.*--consumer-name=//' | head -1) +if [ -z "$consumer_name" ]; then + echo "ERROR: Could not extract consumer name from agent deployment" + exit 1 +fi +echo " Consumer name: $consumer_name" +echo "" + +# Step 6: Run tests +echo "Step 6: Running $TEST_TYPE tests..." +echo "==========================================" +echo "" + +TEST_FAILED=0 + +run_upgrade_test() { + echo "Running upgrade tests..." 
+ + if IMAGE="quay.io/redhat-user-workloads/maestro-rhtap-tenant/maestro-e2e:$commit_sha" \ + CONSUMER_NAME="$consumer_name" \ + SERVER_KUBECONFIG="$(pwd)/svc-cluster.kubeconfig" \ + AGENT_IN_CLUSTER_KUBECONFIG="$(pwd)/mgmt-incluster.kubeconfig" \ + SERVICE_ACCOUNT_NAME=clusters-service \ + ENABLE_AUTHORIZATION_POLICY=true \ + bash -x test/upgrade/script/run.sh; then + echo "✓ Upgrade test passed" + return 0 + else + echo "✗ Upgrade test failed" + return 1 + fi +} + +run_e2e_test() { + echo "Running E2E tests with istio..." + + if AGENT_NAMESPACE=maestro \ + IMAGE="quay.io/redhat-user-workloads/maestro-rhtap-tenant/maestro-e2e:$commit_sha" \ + CONSUMER_NAME="$consumer_name" \ + SERVER_KUBECONFIG="$(pwd)/svc-cluster.kubeconfig" \ + AGENT_KUBECONFIG="$(pwd)/mgmt-cluster.kubeconfig" \ + SERVER_IN_CLUSTER_KUBECONFIG="$(pwd)/svc-incluster.kubeconfig" \ + AGENT_IN_CLUSTER_KUBECONFIG="$(pwd)/mgmt-incluster.kubeconfig" \ + SERVICE_ACCOUNT_NAME=clusters-service \ + bash -x test/e2e/istio/test.sh; then + echo "✓ E2E test passed" + return 0 + else + echo "✗ E2E test failed" + return 1 + fi +} + +case "$TEST_TYPE" in + upgrade) + run_upgrade_test || TEST_FAILED=1 + ;; + e2e) + run_e2e_test || TEST_FAILED=1 + ;; + all) + run_upgrade_test || TEST_FAILED=1 + run_e2e_test || TEST_FAILED=1 + ;; + *) + echo "ERROR: Invalid test type: $TEST_TYPE" + echo "Valid options: upgrade, e2e, all" + exit 1 + ;; +esac + +echo "" +echo "==========================================" + +# Step 7: Summarize results +echo "Step 7: Test Summary" +echo "==========================================" + +if [ $TEST_FAILED -eq 0 ]; then + echo "✓ All tests PASSED" + echo "" + echo "Test configuration:" + echo " Image: quay.io/redhat-user-workloads/maestro-rhtap-tenant/maestro-e2e:$commit_sha" + echo " Consumer: $consumer_name" + echo " Test type: $TEST_TYPE" +else + echo "✗ Tests FAILED" + echo "" + echo "Check the test output above for failure details." 
+ echo "Common failure locations:" + echo " - test/upgrade/script/run.sh output" + echo " - test/e2e/istio/test.sh output" + echo " - Pod logs: kubectl --kubeconfig ./svc-cluster.kubeconfig logs -n maestro -l app=maestro" +fi + +echo "" + +# Step 8: Cleanup +echo "Step 8: Cleaning up test resources..." + +kubectl --kubeconfig "$(pwd)/svc-cluster.kubeconfig" delete serviceaccount e2e-test-admin -n default 2>/dev/null || true +kubectl --kubeconfig "$(pwd)/svc-cluster.kubeconfig" delete clusterrolebinding e2e-test-admin 2>/dev/null || true +kubectl --kubeconfig "$(pwd)/mgmt-cluster.kubeconfig" delete serviceaccount e2e-test-admin -n default 2>/dev/null || true +kubectl --kubeconfig "$(pwd)/mgmt-cluster.kubeconfig" delete clusterrolebinding e2e-test-admin 2>/dev/null || true + +# Remove kubeconfig files containing sensitive credentials +echo "Removing temporary kubeconfig files..." +rm -f "$(pwd)/svc-cluster.kubeconfig" "$(pwd)/mgmt-cluster.kubeconfig" +rm -f "$(pwd)/svc-incluster.kubeconfig" "$(pwd)/mgmt-incluster.kubeconfig" + +echo "✓ Cleanup complete" +echo "" + +if [ $TEST_FAILED -eq 0 ]; then + echo "E2E testing completed successfully!" + exit 0 +else + echo "E2E testing completed with failures." + exit 1 +fi diff --git a/.claude/skills/setup-maestro-cluster/SKILL.md b/.claude/skills/setup-maestro-cluster/SKILL.md new file mode 100644 index 00000000..357e7ecf --- /dev/null +++ b/.claude/skills/setup-maestro-cluster/SKILL.md @@ -0,0 +1,30 @@ +--- +name: setup-maestro-cluster +description: Sets up a long-running Maestro cluster environment using Azure ARO-HCP infrastructure with both service and management clusters +category: Infrastructure +tags: [azure, aks, maestro, deployment, cluster, aro-hcp] +--- + +# Setup Maestro Long-Running Cluster + +Sets up a long-running Maestro cluster environment using Azure ARO-HCP infrastructure. This will deploy both service and management clusters. 
+ +**Prerequisites:** +- Azure CLI installed and logged in +- Access to "ARO Hosted Control Planes" Azure subscription +- Internet connectivity + +**Environment variables set:** +- `USER=oasis` (if not already set) +- `PERSIST=true` +- `GITHUB_ACTIONS=true` +- `GOTOOLCHAIN=go1.24.4` + +**Note:** This deployment typically takes 25-30 minutes to complete. + +```bash +#!/bin/bash +# Execute the cluster setup script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +exec "$SCRIPT_DIR/scripts/setup.sh" "$@" +``` diff --git a/.claude/skills/setup-maestro-cluster/scripts/setup.sh b/.claude/skills/setup-maestro-cluster/scripts/setup.sh new file mode 100755 index 00000000..8aa712b8 --- /dev/null +++ b/.claude/skills/setup-maestro-cluster/scripts/setup.sh @@ -0,0 +1,104 @@ +#!/bin/bash +set -e + +echo "Starting Maestro long-running cluster setup..." + + +# Cleanup function +cleanup() { + if [ -n "$TEMP_DIR" ] && [ -d "$TEMP_DIR" ]; then + echo "Cleaning up temporary directory: $TEMP_DIR" + rm -rf "$TEMP_DIR" + fi +} + +# Register cleanup on exit +trap cleanup EXIT INT TERM + +# Step 1: Check if Azure CLI is installed +if ! command -v az &> /dev/null; then + echo "ERROR: Azure CLI is not installed." + echo "Please install it from: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli" + exit 1 +fi + +echo "✓ Azure CLI is installed" + +# Step 2: Verify Azure account login +if ! az account show &> /dev/null; then + echo "ERROR: Not logged into Azure." + echo "Please run: az login" + exit 1 +fi + +ACCOUNT_NAME=$(az account show --query "name" -o tsv) +echo "Current Azure account: $ACCOUNT_NAME" + +if [[ ! "$ACCOUNT_NAME" =~ "ARO Hosted Control Planes" ]]; then + echo "ERROR: Not logged into the correct Azure account." 
+    echo "Expected account containing 'ARO Hosted Control Planes', but got: $ACCOUNT_NAME"
+    echo "Please login to the correct account using: az login"
+    exit 1
+fi
+
+echo "✓ Logged into correct Azure account"
+
+# Step 3: Clone ARO-HCP repository
+TEMP_DIR=$(mktemp -d)
+
+# Validate mktemp succeeded
+if [ -z "$TEMP_DIR" ] || [ ! -d "$TEMP_DIR" ]; then
+    echo "ERROR: Failed to create temporary directory"
+    exit 1
+fi
+
+echo "Cloning ARO-HCP repository to: $TEMP_DIR"
+
+if ! timeout 300 git clone https://github.com/Azure/ARO-HCP "$TEMP_DIR/ARO-HCP"; then
+    echo "ERROR: Failed to clone ARO-HCP repository (timeout: 300s)"
+    rm -rf "$TEMP_DIR"
+    exit 1
+fi
+
+echo "✓ Repository cloned successfully"
+
+# Step 4 & 5: Configure environment and deploy
+pushd "$TEMP_DIR/ARO-HCP" > /dev/null
+
+echo "Setting environment variables..."
+# Set USER to oasis if not already set (required by ARO-HCP)
+export USER="${USER:-oasis}"
+# PERSIST controls whether deployed resources persist after testing (default: true, i.e. no automatic cleanup)
+export PERSIST="${PERSIST:-true}"
+export GITHUB_ACTIONS=true
+export GOTOOLCHAIN=go1.24.4
+
+echo "USER=$USER"
+echo "PERSIST=$PERSIST"
+echo "GITHUB_ACTIONS=$GITHUB_ACTIONS"
+echo "GOTOOLCHAIN=$GOTOOLCHAIN"
+
+echo ""
+echo "Starting personal-dev-env deployment..."
+echo "This may take several minutes..."
+echo ""
+
+if timeout 3600 make personal-dev-env; then
+    echo ""
+    echo "✓ Deployment completed successfully!"
+    echo "ARO-HCP repository location: $TEMP_DIR/ARO-HCP"
+else
+    echo ""
+    echo "ERROR: Deployment failed or timed out (timeout: 3600s)!"
+    popd > /dev/null
+    exit 1
+fi
+
+popd > /dev/null
+
+# Cleanup temporary directory
+echo "Cleaning up temporary clone..."
+rm -rf "$TEMP_DIR"
+
+echo ""
+echo "Setup complete!"
diff --git a/.gitignore b/.gitignore
index 38fa783b..21974f8a 100755
--- a/.gitignore
+++ b/.gitignore
@@ -69,3 +69,9 @@ unit-test-results.json
 *integration-test-results.json
 test/e2e/setup/aro/aro-hcp
 test/upgrade/report/*
+
+# Ignore ARO-HCP kubeconfig files
+*.kubeconfig
+
+# Ignore diagnostic reports
+maestro-diagnosis-*.txt