From 69604af167a0d55ab3be6e53e67fb02db09459c2 Mon Sep 17 00:00:00 2001 From: weiliu2 Date: Mon, 2 Feb 2026 21:37:32 +1300 Subject: [PATCH 01/11] Private AKS cluster: Add flexible node support - Add private-join command to join Private AKS cluster via Gateway - Add private-leave command with --mode=local|full cleanup options - Add private-install.sh and private-uninstall.sh scripts - Add pkg/privatecluster package with embedded scripts - Add documentation for creating and configuring Private AKS cluster --- commands.go | 102 +++ main.go | 10 +- pkg/privatecluster/README.md | 59 ++ pkg/privatecluster/create_private_cluster.md | 165 ++++ pkg/privatecluster/private-install.sh | 796 +++++++++++++++++++ pkg/privatecluster/private-uninstall.sh | 421 ++++++++++ pkg/privatecluster/scripts.go | 109 +++ 7 files changed, 1659 insertions(+), 3 deletions(-) create mode 100644 pkg/privatecluster/README.md create mode 100644 pkg/privatecluster/create_private_cluster.md create mode 100755 pkg/privatecluster/private-install.sh create mode 100755 pkg/privatecluster/private-uninstall.sh create mode 100644 pkg/privatecluster/scripts.go diff --git a/commands.go b/commands.go index f76dda7..c0d8fe4 100644 --- a/commands.go +++ b/commands.go @@ -14,6 +14,7 @@ import ( "go.goms.io/aks/AKSFlexNode/pkg/bootstrapper" "go.goms.io/aks/AKSFlexNode/pkg/config" "go.goms.io/aks/AKSFlexNode/pkg/logger" + "go.goms.io/aks/AKSFlexNode/pkg/privatecluster" "go.goms.io/aks/AKSFlexNode/pkg/status" ) @@ -118,6 +119,107 @@ func runVersion() { fmt.Printf("Build Time: %s\n", BuildTime) } +// Private cluster command variables +var ( + aksResourceID string + cleanupModeFlag string +) + +// NewPrivateJoinCommand creates a new private-join command +func NewPrivateJoinCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "private-join", + Short: "Join a Private AKS cluster (requires sudo)", + Long: `Join a Private AKS cluster. + +Prerequisites: + 1. A Private AKS cluster must exist with AAD and Azure RBAC enabled + See: pkg/privatecluster/create_private_cluster.md + + 2. Current user must have the following roles on the cluster: + - Azure Kubernetes Service Cluster Admin Role + - Azure Kubernetes Service RBAC Cluster Admin + + 3. Current user must be logged in via 'sudo az login' + +The full resource ID of the Private AKS cluster is required as the --aks-resource-id parameter. +This same resource ID can be used later with the private-leave command.`, + RunE: func(cmd *cobra.Command, args []string) error { + return runPrivateJoin(cmd.Context()) + }, + } + + cmd.Flags().StringVar(&aksResourceID, "aks-resource-id", "", "AKS cluster resource ID (required)") + cmd.MarkFlagRequired("aks-resource-id") + + return cmd +} + +// NewPrivateLeaveCommand creates a new private-leave command +func NewPrivateLeaveCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "private-leave", + Short: "Leave a Private AKS cluster (--mode=local|full, requires sudo)", + Long: `Remove this edge node from a Private AKS cluster. 
+ +Cleanup modes: + --local Local cleanup only (default): + - Remove node from AKS cluster + - Run aks-flex-node unbootstrap + - Remove Arc Agent + - Stop VPN and remove client config + - Keep Gateway for other nodes + + --full Full cleanup (requires --aks-resource-id): + - All local cleanup steps + - Delete Gateway VM + - Delete Gateway subnet, NSG, Public IP + - Delete SSH keys + +This command requires the current user to be logged in via 'sudo az login'.`, + RunE: func(cmd *cobra.Command, args []string) error { + return runPrivateLeave(cmd.Context()) + }, + } + + cmd.Flags().StringVar(&cleanupModeFlag, "mode", "local", "Cleanup mode: 'local' (keep Gateway) or 'full' (remove all Azure resources)") + cmd.Flags().StringVar(&aksResourceID, "aks-resource-id", "", "AKS cluster resource ID (required for --mode=full)") + + return cmd +} + +// runPrivateJoin executes the private cluster join process +func runPrivateJoin(ctx context.Context) error { + if os.Getuid() != 0 { + return fmt.Errorf("this command requires root privileges, please run with 'sudo'") + } + runner := privatecluster.NewScriptRunner("") + return runner.RunPrivateInstall(ctx, aksResourceID) +} + +// runPrivateLeave executes the private cluster leave process +func runPrivateLeave(ctx context.Context) error { + if os.Getuid() != 0 { + return fmt.Errorf("this command requires root privileges, please run with 'sudo'") + } + // Validate cleanup mode + var mode privatecluster.CleanupMode + switch cleanupModeFlag { + case "local": + mode = privatecluster.CleanupModeLocal + case "full": + mode = privatecluster.CleanupModeFull + if aksResourceID == "" { + return fmt.Errorf("--aks-resource-id is required for full cleanup mode") + } + default: + return fmt.Errorf("invalid cleanup mode: %s (use 'local' or 'full')", cleanupModeFlag) + } + + runner := privatecluster.NewScriptRunner("") + return runner.RunPrivateUninstall(ctx, mode, aksResourceID) +} + // runDaemonLoop runs the periodic status collection and bootstrap monitoring daemon func runDaemonLoop(ctx context.Context, cfg *config.Config) error { logger := logger.GetLoggerFromContext(ctx) diff --git a/main.go b/main.go index 7edddd7..a5ad585 100644 --- a/main.go +++ b/main.go @@ -25,13 +25,16 @@ func main() { } // Add global flags for configuration - rootCmd.PersistentFlags().StringVar(&configPath, "config", "", "Path to configuration JSON file (required)") + rootCmd.PersistentFlags().StringVar(&configPath, "config", "", "Path to configuration JSON file (required for agent/unbootstrap)") + rootCmd.PersistentFlags().MarkHidden("config") // Hide from global help, shown in agent/unbootstrap help // Don't mark as required globally - we'll check in PersistentPreRunE for commands that need it // Add commands rootCmd.AddCommand(NewAgentCommand()) rootCmd.AddCommand(NewUnbootstrapCommand()) rootCmd.AddCommand(NewVersionCommand()) + rootCmd.AddCommand(NewPrivateJoinCommand()) + rootCmd.AddCommand(NewPrivateLeaveCommand()) // Set up context with signal handling ctx, cancel := context.WithCancel(context.Background()) @@ -49,8 +52,9 @@ func main() { // Set up persistent pre-run to initialize config and logger rootCmd.PersistentPreRunE = func(cmd *cobra.Command, args []string) error { - // Skip config loading for version command - if cmd.Name() == "version" { + // Skip config loading for commands that don't need it + switch cmd.Name() { + case "version", "private-join", "private-leave": return nil } diff --git a/pkg/privatecluster/README.md b/pkg/privatecluster/README.md new file mode 100644 index 
0000000..285d2eb --- /dev/null +++ b/pkg/privatecluster/README.md @@ -0,0 +1,59 @@ +# Private AKS Cluster - Edge Node Join/Leave + +## Prerequisites + +### 1. Login to Azure CLI as root + +```bash +sudo az login +``` + +### 2. Create a Private AKS Cluster + +Create a Private AKS cluster with AAD and Azure RBAC enabled, and assign the required roles to your user. + +See: [create_private_cluster.md](create_private_cluster.md) + +## Join Private AKS Cluster + +### 1. Build the project + +```bash +go build -o aks-flex-node . +``` + +### 2. Join the cluster + +```bash +sudo ./aks-flex-node private-join --aks-resource-id "" +``` + +Example: +```bash +sudo ./aks-flex-node private-join \ + --aks-resource-id "/subscriptions/xxx/resourcegroups/my-rg/providers/Microsoft.ContainerService/managedClusters/my-private-aks" +``` + +### 3. Verify + +```bash +sudo kubectl get nodes +``` + +## Leave Private AKS Cluster + +```bash +sudo ./aks-flex-node private-leave --mode= [--aks-resource-id ""] +``` + +### Mode Comparison + +| Mode | Command | Description | +|------|---------|-------------| +| `local` | `sudo ./aks-flex-node private-leave --mode=local` | Remove node and local components, **keep Gateway** for other nodes | +| `full` | `sudo ./aks-flex-node private-leave --mode=full --aks-resource-id "..."` | Remove all components **including Gateway and Azure resources** | + +### When to use each mode + +- **`--mode=local`**: Other nodes are still using the Gateway, or you plan to rejoin later +- **`--mode=full`**: Last node leaving, clean up all Azure resources (Gateway VM, subnet, NSG, public IP) diff --git a/pkg/privatecluster/create_private_cluster.md b/pkg/privatecluster/create_private_cluster.md new file mode 100644 index 0000000..bfbac97 --- /dev/null +++ b/pkg/privatecluster/create_private_cluster.md @@ -0,0 +1,165 @@ +# Create Private AKS Cluster + +This guide shows how to create a Private AKS Cluster with AAD and Azure RBAC enabled for edge node testing. + +## Prerequisites + +### 1. Login to Azure CLI as root + +```bash +sudo az login +``` + +### 2. Set variables + +```bash +# Required +CLUSTER_NAME="my-private-aks" +RESOURCE_GROUP="my-rg" +LOCATION="eastus2" + +# Optional (defaults) +VNET_NAME="${CLUSTER_NAME}-vnet" +VNET_CIDR="10.224.0.0/12" +SUBNET_NAME="aks-subnet" +SUBNET_CIDR="10.224.0.0/16" +NODE_COUNT=1 +NODE_VM_SIZE="Standard_D2s_v3" +``` + +## Step 1: Create Resource Group + +```bash +az group create \ + --name "$RESOURCE_GROUP" \ + --location "$LOCATION" +``` + +## Step 2: Create VNet and Subnet + +```bash +# Create VNet +az network vnet create \ + --resource-group "$RESOURCE_GROUP" \ + --name "$VNET_NAME" \ + --address-prefix "$VNET_CIDR" + +# Create Subnet +az network vnet subnet create \ + --resource-group "$RESOURCE_GROUP" \ + --vnet-name "$VNET_NAME" \ + --name "$SUBNET_NAME" \ + --address-prefix "$SUBNET_CIDR" +``` + +## Step 3: Create Private AKS Cluster + +```bash +# Get Subnet ID +SUBNET_ID=$(az network vnet subnet show \ + --resource-group "$RESOURCE_GROUP" \ + --vnet-name "$VNET_NAME" \ + --name "$SUBNET_NAME" \ + --query id -o tsv) + +# Create Private AKS Cluster +az aks create \ + --resource-group "$RESOURCE_GROUP" \ + --name "$CLUSTER_NAME" \ + --location "$LOCATION" \ + --node-count "$NODE_COUNT" \ + --node-vm-size "$NODE_VM_SIZE" \ + --network-plugin azure \ + --vnet-subnet-id "$SUBNET_ID" \ + --enable-private-cluster \ + --enable-aad \ + --enable-azure-rbac \ + --generate-ssh-keys +``` + +> **Note:** This may take 5-10 minutes. 
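+
+A quick optional check that the private, AAD, and Azure RBAC settings took effect (these are the same `aadProfile` properties the join flow verifies later); all three values should read `true`:
+
+```bash
+az aks show \
+  --resource-group "$RESOURCE_GROUP" \
+  --name "$CLUSTER_NAME" \
+  --query "{private:apiServerAccessProfile.enablePrivateCluster, aad:aadProfile.managed, azureRbac:aadProfile.enableAzureRbac}" \
+  -o table
+```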
+ +## Step 4: Assign RBAC Roles to Current User + +The current user needs two roles to manage the cluster: + +| Role | Purpose | +|------|---------| +| Azure Kubernetes Service Cluster Admin Role | Get kubectl credentials | +| Azure Kubernetes Service RBAC Cluster Admin | Perform cluster operations | + +```bash +# Get current user's Object ID +USER_OBJECT_ID=$(az ad signed-in-user show --query id -o tsv) + +# Get AKS Resource ID +AKS_RESOURCE_ID=$(az aks show \ + --resource-group "$RESOURCE_GROUP" \ + --name "$CLUSTER_NAME" \ + --query id -o tsv) + +# Assign Role 1: Azure Kubernetes Service Cluster Admin Role +az role assignment create \ + --assignee "$USER_OBJECT_ID" \ + --role "Azure Kubernetes Service Cluster Admin Role" \ + --scope "$AKS_RESOURCE_ID" + +# Assign Role 2: Azure Kubernetes Service RBAC Cluster Admin +az role assignment create \ + --assignee "$USER_OBJECT_ID" \ + --role "Azure Kubernetes Service RBAC Cluster Admin" \ + --scope "$AKS_RESOURCE_ID" +``` + +## Step 5: Get Kubectl Credentials + +```bash +# Create kubeconfig directory +sudo mkdir -p /root/.kube + +# Get credentials +sudo az aks get-credentials \ + --resource-group "$RESOURCE_GROUP" \ + --name "$CLUSTER_NAME" \ + --overwrite-existing \ + --file /root/.kube/config + +# Convert kubeconfig for Azure CLI auth +sudo kubelogin convert-kubeconfig -l azurecli --kubeconfig /root/.kube/config +``` + +## Step 6: Get Cluster Resource ID + +Save this for use with `private-join` and `private-leave` commands: + +```bash +az aks show \ + --resource-group "$RESOURCE_GROUP" \ + --name "$CLUSTER_NAME" \ + --query id -o tsv +``` + +Example output: +``` +/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourcegroups/my-rg/providers/Microsoft.ContainerService/managedClusters/my-private-aks +``` + +## Next Steps + +### Join an edge node to the private cluster + +```bash +sudo ./aks-flex-node private-join \ + --aks-resource-id "/subscriptions/.../resourcegroups/.../providers/Microsoft.ContainerService/managedClusters/my-private-aks" +``` + +### Leave the private cluster + +```bash +# Local cleanup (keep Gateway for other nodes) +sudo ./aks-flex-node private-leave --mode=local + +# Full cleanup (remove Gateway and all Azure resources) +sudo ./aks-flex-node private-leave --mode=full \ + --aks-resource-id "/subscriptions/.../resourcegroups/.../providers/Microsoft.ContainerService/managedClusters/my-private-aks" +``` diff --git a/pkg/privatecluster/private-install.sh b/pkg/privatecluster/private-install.sh new file mode 100755 index 0000000..ac0569e --- /dev/null +++ b/pkg/privatecluster/private-install.sh @@ -0,0 +1,796 @@ +#!/bin/bash +# private-install.sh - Called by: aks-flex-node private-join +# Join local node to Private AKS Cluster via Gateway +# +# Usage: +# sudo ./aks-flex-node private-join --aks-resource-id "/subscriptions/.../managedClusters/xxx" + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' # No Color + +# Configuration +GATEWAY_NAME="wg-gateway" +GATEWAY_SUBNET_NAME="wg-subnet" +GATEWAY_SUBNET_PREFIX="10.0.100.0/24" +GATEWAY_VPN_NETWORK="172.16.0.0/24" +GATEWAY_VPN_IP="172.16.0.1" +GATEWAY_VM_SIZE="Standard_D2s_v3" +GATEWAY_PORT="51820" +NETWORK_INTERFACE="wg-aks" +# Handle sudo: use original user's home directory +if [[ -n "${SUDO_USER:-}" ]]; then + REAL_HOME=$(getent passwd "$SUDO_USER" | cut -d: -f6) +else + REAL_HOME="$HOME" +fi +SSH_KEY_PATH="${REAL_HOME}/.ssh/id_rsa_wg_gateway" +VERBOSE=false + +# Cleanup 
function for Ctrl+C +cleanup_on_exit() { + echo "" + log_warning "Interrupted! Cleaning up..." + sudo pkill -f "aks-flex-node agent" 2>/dev/null || true + exit 1 +} + +# Trap Ctrl+C and other termination signals +trap cleanup_on_exit SIGINT SIGTERM + +# Functions +log_info() { + echo -e "${BLUE}INFO:${NC} $1" +} + +log_success() { + echo -e "${GREEN}SUCCESS:${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}WARNING:${NC} $1" +} + +log_error() { + echo -e "${RED}ERROR:${NC} $1" +} + +log_verbose() { + if [[ "$VERBOSE" == "true" ]]; then + echo -e "${BLUE}VERBOSE:${NC} $1" + fi +} + +parse_args() { + while [[ $# -gt 0 ]]; do + case $1 in + --aks-resource-id) + AKS_RESOURCE_ID="$2" + shift 2 + ;; + --gateway-name) + GATEWAY_NAME="$2" + shift 2 + ;; + --gateway-subnet) + GATEWAY_SUBNET_PREFIX="$2" + shift 2 + ;; + --gateway-vm-size) + GATEWAY_VM_SIZE="$2" + shift 2 + ;; + --verbose) + VERBOSE=true + shift + ;; + *) + log_error "Unknown argument: $1" + exit 1 + ;; + esac + done + + # Validate required arguments + if [[ -z "${AKS_RESOURCE_ID:-}" ]]; then + log_error "Missing required argument: --aks-resource-id" + exit 1 + fi + + # Parse AKS Resource ID + parse_aks_resource_id +} + +parse_aks_resource_id() { + # Format: /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.ContainerService/managedClusters/{name} + # Normalize: Azure CLI sometimes returns lowercase 'resourcegroups', but Go code expects 'resourceGroups' + AKS_RESOURCE_ID=$(echo "$AKS_RESOURCE_ID" | sed 's|/resourcegroups/|/resourceGroups/|g') + + SUBSCRIPTION_ID=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f3) + RESOURCE_GROUP=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f5) + AKS_CLUSTER_NAME=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f9) + + if [[ -z "$SUBSCRIPTION_ID" || -z "$RESOURCE_GROUP" || -z "$AKS_CLUSTER_NAME" ]]; then + log_error "Invalid AKS Resource ID format" + exit 1 + fi + + log_verbose "Subscription ID: $SUBSCRIPTION_ID" + log_verbose "Resource Group: $RESOURCE_GROUP" + log_verbose "AKS Cluster Name: $AKS_CLUSTER_NAME" +} + +# Phase 1: Environment Check +phase1_environment_check() { + # Clean up old kube cache to avoid stale tokens + log_info "Cleaning up old kube cache..." + rm -rf /root/.kube/cache 2>/dev/null || true + rm -rf "${REAL_HOME}/.kube/cache" 2>/dev/null || true + log_success "Kube cache cleaned" + + # Check Azure CLI is installed + log_info "Checking Azure CLI..." + if ! command -v az &>/dev/null; then + log_error "Azure CLI not installed. Please install: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash" + exit 1 + fi + log_success "Azure CLI installed" + + # Check Azure CLI login status + log_info "Checking Azure CLI login status..." + if ! az account show &>/dev/null; then + log_error "Azure CLI not logged in, please run 'az login' first" + exit 1 + fi + log_success "Azure CLI logged in" + + # Check if token is valid + if ! az account get-access-token --only-show-errors &>/dev/null; then + log_warning "Azure token expired or invalid, re-authenticating..." + az login + fi + + # Set subscription + log_info "Setting subscription: $SUBSCRIPTION_ID" + az account set --subscription "$SUBSCRIPTION_ID" + log_success "Subscription set successfully" + + # Get Tenant ID + TENANT_ID=$(az account show --query tenantId -o tsv) + log_verbose "Tenant ID: $TENANT_ID" + + # Verify AKS cluster exists + log_info "Verifying AKS cluster: $AKS_CLUSTER_NAME" + if ! 
az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" &>/dev/null; then + log_error "AKS cluster '$AKS_CLUSTER_NAME' not found" + exit 1 + fi + + # Check AAD and RBAC + log_info "Checking AKS cluster AAD and RBAC configuration..." + AAD_ENABLED=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ + --query "aadProfile.managed" -o tsv 2>/dev/null || echo "false") + RBAC_ENABLED=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ + --query "aadProfile.enableAzureRbac" -o tsv 2>/dev/null || echo "false") + + if [[ "$AAD_ENABLED" != "true" ]]; then + log_error "AKS cluster AAD not enabled, please enable: az aks update --enable-aad" + exit 1 + fi + if [[ "$RBAC_ENABLED" != "true" ]]; then + log_error "AKS cluster Azure RBAC not enabled, please enable: az aks update --enable-azure-rbac" + exit 1 + fi + log_success "AKS cluster AAD and RBAC enabled" + + + # Get AKS VNet info + log_info "Getting AKS VNet info..." + AKS_NODE_RG=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ + --query "nodeResourceGroup" -o tsv) + + # Get VNet info from VMSS + VMSS_NAME=$(az vmss list --resource-group "$AKS_NODE_RG" --query "[0].name" -o tsv) + if [[ -z "$VMSS_NAME" ]]; then + log_error "Cannot find AKS node VMSS" + exit 1 + fi + + VNET_SUBNET_ID=$(az vmss show --resource-group "$AKS_NODE_RG" --name "$VMSS_NAME" \ + --query "virtualMachineProfile.networkProfile.networkInterfaceConfigurations[0].ipConfigurations[0].subnet.id" -o tsv) + + VNET_NAME=$(echo "$VNET_SUBNET_ID" | cut -d'/' -f9) + VNET_RG=$(echo "$VNET_SUBNET_ID" | cut -d'/' -f5) + + log_success "VNet: $VNET_NAME (Resource Group: $VNET_RG)" + + # Get Location + LOCATION=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ + --query "location" -o tsv) + log_verbose "Location: $LOCATION" + + # Check local dependencies + log_info "Checking local dependencies..." + + if ! command -v wg &>/dev/null; then + log_info "Installing VPN tools..." + sudo apt-get update && sudo apt-get install -y wireguard-tools + fi + log_success "VPN tools installed" + + if ! command -v jq &>/dev/null; then + log_info "Installing jq..." + sudo apt-get install -y jq + fi + log_success "jq installed" + + # Install kubectl and kubelogin + if ! command -v kubectl &>/dev/null || ! command -v kubelogin &>/dev/null; then + log_info "Installing kubectl and kubelogin..." + az aks install-cli --install-location /usr/local/bin/kubectl --kubelogin-install-location /usr/local/bin/kubelogin + chmod +x /usr/local/bin/kubectl /usr/local/bin/kubelogin + fi + # Verify installation + if ! command -v kubectl &>/dev/null; then + log_error "kubectl installation failed" + exit 1 + fi + if ! command -v kubelogin &>/dev/null; then + log_error "kubelogin installation failed" + exit 1 + fi + log_success "kubectl and kubelogin installed" + + # Install Azure CLI connectedmachine extension + if ! az extension show --name connectedmachine &>/dev/null; then + log_info "Installing Azure CLI connectedmachine extension..." + az config set extension.dynamic_install_allow_preview=true --only-show-errors 2>/dev/null || true + az extension add --name connectedmachine --allow-preview true --only-show-errors + fi + log_success "Azure CLI extensions ready" +} + +# Phase 2: Gateway Setup +phase2_gateway_setup() { + # Check if Gateway exists + log_info "Checking if Gateway exists..." 
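+    # The Gateway VM is shared between edge nodes: if "$GATEWAY_NAME" already exists
+    # in the cluster resource group it is reused and this node is only added as a new
+    # peer; otherwise the subnet, NSG, public IP, and VM are created from scratch below.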
+ if az vm show --resource-group "$RESOURCE_GROUP" --name "$GATEWAY_NAME" &>/dev/null; then + log_info "Gateway exists, reusing" + GATEWAY_EXISTS=true + + # Get Public IP + WG_PUBLIC_IP=$(az vm list-ip-addresses --resource-group "$RESOURCE_GROUP" --name "$GATEWAY_NAME" \ + --query "[0].virtualMachine.network.publicIpAddresses[0].ipAddress" -o tsv) + log_success "Gateway Public IP: $WG_PUBLIC_IP" + else + log_info "Gateway not found, creating new one" + GATEWAY_EXISTS=false + create_gateway_infrastructure + fi + + # Ensure SSH key exists + ensure_ssh_key + + # Add SSH key to Gateway (idempotent, works for both new and existing Gateway) + log_info "Adding SSH key to Gateway..." + az vm user update \ + --resource-group "$RESOURCE_GROUP" \ + --name "$GATEWAY_NAME" \ + --username azureuser \ + --ssh-key-value "$(cat ${SSH_KEY_PATH}.pub)" \ + --output none + log_success "SSH key added to Gateway" + + # Wait for VM ready and get server info + wait_for_vm_ready + get_server_info +} + +create_gateway_infrastructure() { + # Create Gateway Subnet + log_info "Checking/creating Gateway subnet..." + if ! az network vnet subnet show --resource-group "$VNET_RG" --vnet-name "$VNET_NAME" \ + --name "$GATEWAY_SUBNET_NAME" &>/dev/null; then + az network vnet subnet create \ + --resource-group "$VNET_RG" \ + --vnet-name "$VNET_NAME" \ + --name "$GATEWAY_SUBNET_NAME" \ + --address-prefixes "$GATEWAY_SUBNET_PREFIX" + log_success "Subnet $GATEWAY_SUBNET_NAME created" + else + log_info "Subnet $GATEWAY_SUBNET_NAME already exists" + fi + + # Create NSG + log_info "Checking/creating NSG..." + NSG_NAME="${GATEWAY_NAME}-nsg" + if ! az network nsg show --resource-group "$RESOURCE_GROUP" --name "$NSG_NAME" &>/dev/null; then + az network nsg create --resource-group "$RESOURCE_GROUP" --name "$NSG_NAME" + + # Add SSH rule (priority 100 to override NRMS-Rule-106 which denies SSH from Internet at priority 106) + az network nsg rule create \ + --resource-group "$RESOURCE_GROUP" \ + --nsg-name "$NSG_NAME" \ + --name allow-ssh \ + --priority 100 \ + --destination-port-ranges 22 \ + --protocol Tcp \ + --access Allow + + # Add VPN rule + az network nsg rule create \ + --resource-group "$RESOURCE_GROUP" \ + --nsg-name "$NSG_NAME" \ + --name allow-wireguard \ + --priority 200 \ + --destination-port-ranges "$GATEWAY_PORT" \ + --protocol Udp \ + --access Allow + + log_success "NSG $NSG_NAME created" + else + log_info "NSG $NSG_NAME already exists" + fi + + # Create Public IP + log_info "Checking/creating Public IP..." + PIP_NAME="${GATEWAY_NAME}-pip" + if ! az network public-ip show --resource-group "$RESOURCE_GROUP" --name "$PIP_NAME" &>/dev/null; then + az network public-ip create \ + --resource-group "$RESOURCE_GROUP" \ + --name "$PIP_NAME" \ + --sku Standard \ + --allocation-method Static + log_success "Public IP $PIP_NAME created" + else + log_info "Public IP $PIP_NAME already exists" + fi + + # Generate SSH key + ensure_ssh_key + + # Create VM + log_info "Creating Gateway..." 
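+    # The VM is placed in the AKS VNet ($VNET_NAME) so it can reach the private API
+    # server, and is reachable from outside via the public IP and the NSG rules
+    # created above (SSH on 22/tcp, VPN on $GATEWAY_PORT/udp).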
+ az vm create \ + --resource-group "$RESOURCE_GROUP" \ + --name "$GATEWAY_NAME" \ + --image Ubuntu2204 \ + --size "$GATEWAY_VM_SIZE" \ + --vnet-name "$VNET_NAME" \ + --subnet "$GATEWAY_SUBNET_NAME" \ + --nsg "$NSG_NAME" \ + --public-ip-address "$PIP_NAME" \ + --admin-username azureuser \ + --ssh-key-values "${SSH_KEY_PATH}.pub" \ + --zone 1 + + # Get Public IP + WG_PUBLIC_IP=$(az network public-ip show --resource-group "$RESOURCE_GROUP" --name "$PIP_NAME" \ + --query ipAddress -o tsv) + log_success "Gateway created, Public IP: $WG_PUBLIC_IP" + + # Wait for new VM to boot up + log_info "Waiting 120 seconds for VM to boot up..." + sleep 120 +} + +ensure_ssh_key() { + if [[ ! -f "$SSH_KEY_PATH" ]]; then + log_info "Generating SSH key..." + ssh-keygen -t rsa -b 4096 -f "$SSH_KEY_PATH" -N "" + # Fix ownership if running with sudo (so user can SSH without sudo) + if [[ -n "${SUDO_USER:-}" ]]; then + chown "$SUDO_USER:$SUDO_USER" "$SSH_KEY_PATH" "${SSH_KEY_PATH}.pub" + fi + log_success "SSH key generated: $SSH_KEY_PATH" + else + log_info "SSH key already exists: $SSH_KEY_PATH" + fi +} + +wait_for_vm_ready() { + log_info "Checking VM SSH connectivity..." + + # First quick check + if ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10 -i "$SSH_KEY_PATH" \ + azureuser@"$WG_PUBLIC_IP" "echo ready" &>/dev/null; then + log_success "VM SSH connection ready" + return 0 + fi + + # SSH failed, restart VM if it's an existing VM + if [[ "$GATEWAY_EXISTS" == "true" ]]; then + log_warning "SSH connection failed, restarting VM..." + az vm restart --resource-group "$RESOURCE_GROUP" --name "$GATEWAY_NAME" --no-wait + log_info "Waiting 120 seconds for VM to restart..." + sleep 120 + fi + + # Wait for SSH with retries + log_info "Waiting for VM to be ready..." + local max_attempts=18 + local attempt=0 + + while [[ $attempt -lt $max_attempts ]]; do + if ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5 -i "$SSH_KEY_PATH" \ + azureuser@"$WG_PUBLIC_IP" "echo ready" &>/dev/null; then + log_success "VM SSH connection ready" + return 0 + fi + attempt=$((attempt + 1)) + log_verbose "Waiting for SSH... ($attempt/$max_attempts)" + sleep 10 + done + + log_error "VM SSH connection timeout" + exit 1 +} + +get_server_info() { + log_info "Getting/configuring Gateway server..." + + # Check if networking is already installed + if ! ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" "command -v wg" &>/dev/null; then + log_info "Installing and configuring networking on Gateway..." + install_wireguard_server + else + log_info "Networking already installed" + fi + + # Get server public key + SERVER_PUBLIC_KEY=$(ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" \ + "sudo cat /etc/wireguard/server_public.key 2>/dev/null || echo ''") + + if [[ -z "$SERVER_PUBLIC_KEY" ]]; then + log_info "Server key not found, reconfiguring..." 
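+        # wg being installed without a server key means the server config never
+        # completed; rerun the setup so a public key exists for the client peer
+        # added in phase 3.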
+ install_wireguard_server + SERVER_PUBLIC_KEY=$(ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" \ + "sudo cat /etc/wireguard/server_public.key") + fi + + log_success "Server public key retrieved" + + # Get existing peer count + EXISTING_PEERS=$(ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" \ + "sudo wg show wg0 peers 2>/dev/null | wc -l || echo 0") + log_verbose "Existing peer count: $EXISTING_PEERS" + + # Calculate client IP + CLIENT_IP_SUFFIX=$((EXISTING_PEERS + 2)) + CLIENT_VPN_IP="172.16.0.${CLIENT_IP_SUFFIX}" + log_success "Assigned client VPN IP: $CLIENT_VPN_IP" +} + +install_wireguard_server() { + ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" << 'REMOTE_SCRIPT' +set -e + +# Install networking +sudo apt-get update +sudo apt-get install -y wireguard + +# Generate key pair +sudo wg genkey | sudo tee /etc/wireguard/server_private.key | sudo wg pubkey | sudo tee /etc/wireguard/server_public.key +sudo chmod 600 /etc/wireguard/server_private.key + +SERVER_PRIVATE_KEY=$(sudo cat /etc/wireguard/server_private.key) + +# Create configuration +sudo tee /etc/wireguard/wg0.conf << EOF +[Interface] +PrivateKey = ${SERVER_PRIVATE_KEY} +Address = 172.16.0.1/24 +ListenPort = 51820 +PostUp = iptables -A FORWARD -i wg0 -j ACCEPT; iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE +PostDown = iptables -D FORWARD -i wg0 -j ACCEPT; iptables -t nat -D POSTROUTING -o eth0 -j MASQUERADE +EOF + +# Enable IP forwarding +echo 'net.ipv4.ip_forward=1' | sudo tee -a /etc/sysctl.conf +sudo sysctl -p + +# Start networking +sudo systemctl enable wg-quick@wg0 +sudo systemctl start wg-quick@wg0 || sudo systemctl restart wg-quick@wg0 + +echo "Gateway server configuration complete" +REMOTE_SCRIPT +} + +# Phase 3: Client Configuration +phase3_client_setup() { + # Generate client key pair + log_info "Generating client key pair..." + CLIENT_PRIVATE_KEY=$(wg genkey) + CLIENT_PUBLIC_KEY=$(echo "$CLIENT_PRIVATE_KEY" | wg pubkey) + log_success "Client key pair generated" + + # Create Gateway client configuration + log_info "Creating Gateway client configuration..." + sudo tee /etc/wireguard/${NETWORK_INTERFACE}.conf > /dev/null << EOF +[Interface] +PrivateKey = ${CLIENT_PRIVATE_KEY} +Address = ${CLIENT_VPN_IP}/24 + +[Peer] +PublicKey = ${SERVER_PUBLIC_KEY} +Endpoint = ${WG_PUBLIC_IP}:${GATEWAY_PORT} +AllowedIPs = 10.0.0.0/8, 172.16.0.0/24 +PersistentKeepalive = 25 +EOF + sudo chmod 600 /etc/wireguard/${NETWORK_INTERFACE}.conf + log_success "Client configuration created" + + # Add client peer to server + log_info "Adding client peer to server..." + ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" \ + "sudo wg set wg0 peer '${CLIENT_PUBLIC_KEY}' allowed-ips ${CLIENT_VPN_IP}/32" + + # Persist configuration + ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" "sudo wg-quick save wg0" + log_success "Client peer added" + + # Start networking connection + log_info "Starting networking connection..." 
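+    # Bring the interface down first (ignoring errors) so a re-run of the join is
+    # idempotent, then bring it up with the freshly written config.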
+ sudo wg-quick down "$NETWORK_INTERFACE" 2>/dev/null || true + sudo wg-quick up "$NETWORK_INTERFACE" + + # Verify connection + sleep 3 + if ping -c 1 -W 3 "$GATEWAY_VPN_IP" &>/dev/null; then + log_success "Networking connected, can ping Gateway ($GATEWAY_VPN_IP)" + else + log_error "Networking connection failed, cannot ping Gateway" + exit 1 + fi +} + +# Phase 4: Node Join +phase4_node_join() { + # Get API Server private FQDN + log_info "Getting AKS API Server address..." + API_SERVER_FQDN=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ + --query "privateFqdn" -o tsv) + log_verbose "API Server FQDN: $API_SERVER_FQDN" + + # Resolve private DNS through Gateway + log_info "Resolving API Server private IP..." + API_SERVER_IP=$(ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" \ + "nslookup $API_SERVER_FQDN | grep -A1 'Name:' | grep 'Address:' | awk '{print \$2}'" 2>/dev/null || echo "") + + if [[ -z "$API_SERVER_IP" ]]; then + log_error "Cannot resolve API Server private IP" + exit 1 + fi + log_success "API Server IP: $API_SERVER_IP" + + # Add hosts entry + log_info "Adding hosts entry..." + if ! grep -q "$API_SERVER_FQDN" /etc/hosts; then + echo "$API_SERVER_IP $API_SERVER_FQDN" | sudo tee -a /etc/hosts + log_success "Hosts entry added" + else + log_info "Hosts entry already exists" + fi + + # Disable swap + log_info "Disabling swap..." + sudo swapoff -a + log_success "Swap disabled" + + # Install Azure Arc agent (required for aks-flex-node) + log_info "Checking Azure Arc agent..." + if ! command -v azcmagent &>/dev/null; then + log_info "Installing Azure Arc agent..." + # Clean up any existing package state to avoid conflicts + sudo dpkg --purge azcmagent 2>/dev/null || true + + local temp_dir + temp_dir=$(mktemp -d) + + curl -L -o "$temp_dir/install_linux_azcmagent.sh" https://gbl.his.arc.azure.com/azcmagent-linux + chmod +x "$temp_dir/install_linux_azcmagent.sh" + sudo bash "$temp_dir/install_linux_azcmagent.sh" + rm -rf "$temp_dir" + + log_success "Azure Arc agent installed" + else + log_info "Azure Arc agent already installed" + fi + + # Get AKS credentials (save to root's kubeconfig for consistency with sudo az login) + log_info "Getting AKS credentials..." + mkdir -p /root/.kube + az aks get-credentials --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ + --overwrite-existing --file /root/.kube/config + + # Convert kubeconfig to use Azure CLI auth (for AAD + Azure RBAC) + log_info "Converting kubeconfig for Azure CLI auth..." + kubelogin convert-kubeconfig -l azurecli --kubeconfig /root/.kube/config + log_success "Kubeconfig ready (saved to /root/.kube/config)" + + # Generate config.json + log_info "Generating aks-flex-node configuration..." + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + PROJECT_ROOT="${SCRIPT_DIR}/../.." 
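+    # config.json is written to the project root so the agent started below and a
+    # later unbootstrap/private-leave read the same file.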
+ CONFIG_FILE="${PROJECT_ROOT}/config.json" + + cat > "$CONFIG_FILE" << EOF +{ + "azure": { + "subscriptionId": "${SUBSCRIPTION_ID}", + "tenantId": "${TENANT_ID}", + "targetCluster": { + "resourceId": "${AKS_RESOURCE_ID}", + "location": "${LOCATION}" + }, + "arc": { + "resourceGroup": "${RESOURCE_GROUP}", + "location": "${LOCATION}" + } + }, + "network": { + "mode": "wireguard", + "wireguard": { + "serverEndpoint": "${WG_PUBLIC_IP}:${GATEWAY_PORT}", + "serverPublicKey": "${SERVER_PUBLIC_KEY}", + "clientAddress": "${CLIENT_VPN_IP}/24", + "allowedIPs": ["10.0.0.0/8", "172.16.0.0/24"], + "persistentKeepalive": 25, + "testEndpoint": "${API_SERVER_IP}:443" + } + }, + "kubernetes": { + "version": "1.29.0" + }, + "containerd": { + "version": "1.7.11", + "pauseImage": "mcr.microsoft.com/oss/kubernetes/pause:3.6" + }, + "agent": { + "logLevel": "info", + "logDir": "/var/log/aks-flex-node" + } +} +EOF + log_success "Config file generated: $CONFIG_FILE" + + # Run aks-flex-node + log_info "Running aks-flex-node agent..." + cd "${PROJECT_ROOT}" + + # Build if needed + if [[ ! -f "./aks-flex-node" ]]; then + log_info "Building aks-flex-node..." + go build -o aks-flex-node . + fi + + # Kill any existing aks-flex-node agent process + log_info "Stopping any existing aks-flex-node agent..." + sudo pkill -f "aks-flex-node agent" 2>/dev/null || true + sleep 2 + + # Create log directory + sudo mkdir -p /var/log/aks-flex-node + + # Run agent in background + LOG_FILE="/var/log/aks-flex-node/agent.log" + sudo bash -c "./aks-flex-node agent --config '$CONFIG_FILE' > '$LOG_FILE' 2>&1" & + AGENT_PID=$! + log_info "Agent started in background (PID: $AGENT_PID)" + # Wait for bootstrap to complete (check log file, minimal output) + log_info "Waiting for bootstrap to complete (may take 2-3 minutes)..." + log_info "View details: sudo tail -f $LOG_FILE" + + local max_wait=300 + local waited=0 + local bootstrap_success=false + local bootstrap_failed=false + + # Simple progress indicator + printf " " + while [[ $waited -lt $max_wait ]]; do + # Check success/failure + if sudo grep -q "bootstrap completed successfully" "$LOG_FILE" /dev/null; then + bootstrap_success=true + break + fi + if sudo grep -q "Bootstrap failed\|bootstrap failed" "$LOG_FILE" /dev/null; then + bootstrap_failed=true + break + fi + printf "." + sleep 5 + waited=$((waited + 5)) + done + echo "" + + if [[ "$bootstrap_failed" == "true" ]]; then + log_error "Bootstrap failed. Check: sudo tail -50 $LOG_FILE" + exit 1 + fi + + if [[ "$bootstrap_success" == "true" ]]; then + log_success "Bootstrap completed" + else + log_error "Timeout. Check: sudo tail -50 $LOG_FILE" + exit 1 + fi + + # Wait for RBAC propagation (simple dots) + printf " " + for i in {1..3}; do + printf "." + sleep 5 + done + echo "" + log_success "Node join completed" +} + +# Phase 5: Verification +phase5_verification() { + NODE_NAME=$(hostname | tr '[:upper:]' '[:lower:]') + + # Check node status (simple dots) + log_info "Waiting for node ready..." + printf " " + local max_attempts=30 + local attempt=0 + + while [[ $attempt -lt $max_attempts ]]; do + NODE_STATUS=$(kubectl --kubeconfig /root/.kube/config get node "$NODE_NAME" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "") + if [[ "$NODE_STATUS" == "True" ]]; then + break + fi + attempt=$((attempt + 1)) + printf "." 
+ sleep 5 + done + echo "" + + if [[ "$NODE_STATUS" != "True" ]]; then + log_error "Node not ready, timeout" + exit 1 + fi + + log_success "Node $NODE_NAME is Ready" + echo "" + printf "${GREEN}========================================${NC}\n" + printf "${GREEN} Success! Edge Node joined Private AKS Cluster${NC}\n" + printf "${GREEN}========================================${NC}\n" + printf "\n" + printf "Node info:\n" + printf " - Node name: %s\n" "$NODE_NAME" + printf " - VPN IP: %s\n" "$CLIENT_VPN_IP" + printf " - AKS cluster: %s\n" "$AKS_CLUSTER_NAME" + printf "\n" + printf "Cluster nodes:\n" + kubectl --kubeconfig /root/.kube/config get nodes -o wide 2>&1 + printf "\n" + printf "${YELLOW}Tips:${NC}\n" + printf " - Please try: sudo kubectl get nodes\n" + printf "\n" +} + +main() { + echo -e "${GREEN}========================================${NC}" + echo -e "${GREEN} Add Edge Node to Private AKS Cluster${NC}" + echo -e "${GREEN}========================================${NC}" + echo "" + + parse_args "$@" + phase1_environment_check + phase2_gateway_setup + phase3_client_setup + phase4_node_join + phase5_verification +} + +# Run main +main "$@" diff --git a/pkg/privatecluster/private-uninstall.sh b/pkg/privatecluster/private-uninstall.sh new file mode 100755 index 0000000..25bbcf2 --- /dev/null +++ b/pkg/privatecluster/private-uninstall.sh @@ -0,0 +1,421 @@ +#!/bin/bash +# private-uninstall.sh - Called by: aks-flex-node private-leave +# Cleanup Private AKS Cluster Edge Node configuration +# +# Usage: +# sudo ./aks-flex-node private-leave --mode=local # Keep Gateway +# sudo ./aks-flex-node private-leave --mode=full --aks-resource-id "..." # Full cleanup + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Configuration +GATEWAY_NAME="wg-gateway" +GATEWAY_SUBNET_NAME="wg-subnet" +NETWORK_INTERFACE="wg-aks" +CLEANUP_MODE="" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="${SCRIPT_DIR}/../.." 
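+# PROJECT_ROOT must match private-install.sh: the config.json it generated and the
+# aks-flex-node binary are looked up relative to this path.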
+ +# Handle sudo: use original user's home directory for SSH keys +if [[ -n "${SUDO_USER:-}" ]]; then + REAL_HOME=$(getent passwd "$SUDO_USER" | cut -d: -f6) +else + REAL_HOME="$HOME" +fi +SSH_KEY_PATH="${REAL_HOME}/.ssh/id_rsa_wg_gateway" + +# Functions +log_info() { + echo -e "${BLUE}INFO:${NC} $1" +} + +log_success() { + echo -e "${GREEN}SUCCESS:${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}WARNING:${NC} $1" +} + +log_error() { + echo -e "${RED}ERROR:${NC} $1" +} + +parse_args() { + while [[ $# -gt 0 ]]; do + case $1 in + --local) + CLEANUP_MODE="local" + shift + ;; + --full) + CLEANUP_MODE="full" + shift + ;; + --aks-resource-id) + AKS_RESOURCE_ID="$2" + shift 2 + ;; + *) + log_error "Unknown argument: $1" + exit 1 + ;; + esac + done + + if [[ -z "$CLEANUP_MODE" ]]; then + log_error "Please specify cleanup mode: --local or --full" + exit 1 + fi + + if [[ "$CLEANUP_MODE" == "full" && -z "${AKS_RESOURCE_ID:-}" ]]; then + log_error "--full mode requires --aks-resource-id" + exit 1 + fi + + if [[ -n "${AKS_RESOURCE_ID:-}" ]]; then + # Remove possible quotes and whitespace + AKS_RESOURCE_ID=$(echo "$AKS_RESOURCE_ID" | tr -d '"' | tr -d "'" | xargs) + SUBSCRIPTION_ID=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f3) + RESOURCE_GROUP=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f5) + AKS_CLUSTER_NAME=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f9) + + log_info "Parsed subscription ID: $SUBSCRIPTION_ID" + log_info "Parsed resource group: $RESOURCE_GROUP" + log_info "Parsed cluster name: $AKS_CLUSTER_NAME" + fi +} + +cleanup_local() { + log_info "Performing local cleanup (keeping Gateway)..." + + NODE_NAME=$(hostname | tr '[:upper:]' '[:lower:]') + + # Get Gateway IP (before stopping networking) + GATEWAY_PUBLIC_IP="" + CLIENT_PRIVATE_KEY="" + if [[ -f "/etc/wireguard/${NETWORK_INTERFACE}.conf" ]]; then + GATEWAY_PUBLIC_IP=$(sudo cat /etc/wireguard/${NETWORK_INTERFACE}.conf 2>/dev/null | grep "Endpoint" | cut -d'=' -f2 | cut -d':' -f1 | tr -d ' ' || echo "") + CLIENT_PRIVATE_KEY=$(sudo cat /etc/wireguard/${NETWORK_INTERFACE}.conf 2>/dev/null | grep "PrivateKey" | cut -d'=' -f2 | tr -d ' ' || echo "") + fi + + # Remove node from cluster (while networking is still connected) + if command -v kubectl &>/dev/null; then + log_info "Removing node $NODE_NAME from cluster..." + # Try root kubeconfig first, then user's kubeconfig + if kubectl --kubeconfig /root/.kube/config delete node "$NODE_NAME" --ignore-not-found 2>&1; then + log_success "Node removed from cluster" + elif kubectl delete node "$NODE_NAME" --ignore-not-found 2>&1; then + log_success "Node removed from cluster" + else + log_warning "Failed to remove node from cluster (may need manual cleanup: kubectl delete node $NODE_NAME)" + fi + fi + + # Stop any running aks-flex-node agent process + log_info "Stopping aks-flex-node agent..." + sudo pkill -f "aks-flex-node agent" 2>/dev/null || true + sleep 2 + + # Run aks-flex-node unbootstrap + log_info "Running aks-flex-node unbootstrap..." + CONFIG_FILE="${PROJECT_ROOT}/config.json" + AKS_FLEX_NODE="${PROJECT_ROOT}/aks-flex-node" + + if [[ -f "$AKS_FLEX_NODE" && -f "$CONFIG_FILE" ]]; then + sudo "$AKS_FLEX_NODE" unbootstrap --config "$CONFIG_FILE" || true + log_success "aks-flex-node unbootstrap completed" + else + log_warning "aks-flex-node or config.json not found, manually stopping services..." 
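+        # Fallback: without the binary and config we cannot run unbootstrap, so just
+        # stop the node services directly.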
+ sudo systemctl stop kubelet 2>/dev/null || true + sudo systemctl disable kubelet 2>/dev/null || true + sudo systemctl stop containerd 2>/dev/null || true + fi + + # Remove Arc Agent and Azure resource + log_info "Removing Arc Agent..." + if command -v azcmagent &>/dev/null; then + # First, delete Azure resource (requires az login) + log_info "Deleting Arc machine from Azure..." + ARC_RG=$(sudo azcmagent show 2>/dev/null | grep "Resource Group" | awk -F: '{print $2}' | xargs || echo "") + if [[ -n "$ARC_RG" ]]; then + az connectedmachine delete --resource-group "$ARC_RG" --name "$NODE_NAME" --yes 2>/dev/null || true + log_success "Arc machine deleted from Azure" + fi + # Then disconnect locally + sudo azcmagent disconnect --force-local-only 2>/dev/null || true + sudo systemctl stop himdsd extd gcad arcproxyd 2>/dev/null || true + sudo systemctl disable himdsd extd gcad arcproxyd 2>/dev/null || true + if command -v apt &>/dev/null; then + sudo apt remove azcmagent -y 2>/dev/null || true + elif command -v yum &>/dev/null; then + sudo yum remove azcmagent -y 2>/dev/null || true + fi + sudo rm -rf /var/opt/azcmagent /opt/azcmagent 2>/dev/null || true + log_success "Arc Agent removed" + else + log_info "Arc Agent not found, skipping" + fi + + # Remove client peer from Gateway + if [[ -n "$GATEWAY_PUBLIC_IP" && -n "$CLIENT_PRIVATE_KEY" && -f "$SSH_KEY_PATH" ]]; then + log_info "Removing client peer from Gateway..." + CLIENT_PUBLIC_KEY=$(echo "$CLIENT_PRIVATE_KEY" | wg pubkey 2>/dev/null || echo "") + if [[ -n "$CLIENT_PUBLIC_KEY" ]]; then + ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10 -i "$SSH_KEY_PATH" \ + azureuser@"$GATEWAY_PUBLIC_IP" \ + "sudo wg set wg0 peer '$CLIENT_PUBLIC_KEY' remove && sudo wg-quick save wg0" 2>/dev/null || true + log_success "Client peer removed from Gateway" + fi + fi + + # Stop networking + log_info "Stopping VPN connection..." + sudo wg-quick down "$NETWORK_INTERFACE" 2>/dev/null || true + log_success "VPN connection stopped" + + # Delete Gateway client configuration + log_info "Deleting VPN client configuration..." + sudo rm -f /etc/wireguard/${NETWORK_INTERFACE}.conf + log_success "VPN client configuration deleted" + + # Clean up hosts entries + log_info "Cleaning up hosts entries..." + sudo sed -i '/privatelink.*azmk8s.io/d' /etc/hosts + log_success "Hosts entries cleaned up" + + # Delete config.json + log_info "Deleting config file..." + rm -f "$CONFIG_FILE" + + echo "" + log_success "Local cleanup completed!" + echo "" + echo "To rejoin cluster, run:" + echo " sudo ./aks-flex-node private-join --aks-resource-id \"...\"" +} + +cleanup_full() { + log_info "Performing full cleanup..." + + NODE_NAME=$(hostname | tr '[:upper:]' '[:lower:]') + + # Get Gateway IP (before stopping networking) + GATEWAY_PUBLIC_IP="" + CLIENT_PRIVATE_KEY="" + if [[ -f "/etc/wireguard/${NETWORK_INTERFACE}.conf" ]]; then + GATEWAY_PUBLIC_IP=$(sudo cat /etc/wireguard/${NETWORK_INTERFACE}.conf 2>/dev/null | grep "Endpoint" | cut -d'=' -f2 | cut -d':' -f1 | tr -d ' ' || echo "") + CLIENT_PRIVATE_KEY=$(sudo cat /etc/wireguard/${NETWORK_INTERFACE}.conf 2>/dev/null | grep "PrivateKey" | cut -d'=' -f2 | tr -d ' ' || echo "") + fi + + # Remove node from cluster (while networking is still connected) + if command -v kubectl &>/dev/null; then + log_info "Removing node $NODE_NAME from cluster..." 
+ # Try root kubeconfig first, then user's kubeconfig + if kubectl --kubeconfig /root/.kube/config delete node "$NODE_NAME" --ignore-not-found 2>&1; then + log_success "Node removed from cluster" + elif kubectl delete node "$NODE_NAME" --ignore-not-found 2>&1; then + log_success "Node removed from cluster" + else + log_warning "Failed to remove node from cluster (may need manual cleanup: kubectl delete node $NODE_NAME)" + fi + fi + + # Stop any running aks-flex-node agent process + log_info "Stopping aks-flex-node agent..." + sudo pkill -f "aks-flex-node agent" 2>/dev/null || true + sleep 2 + + # Run aks-flex-node unbootstrap + log_info "Running aks-flex-node unbootstrap..." + CONFIG_FILE="${PROJECT_ROOT}/config.json" + AKS_FLEX_NODE="${PROJECT_ROOT}/aks-flex-node" + + if [[ -f "$AKS_FLEX_NODE" && -f "$CONFIG_FILE" ]]; then + sudo "$AKS_FLEX_NODE" unbootstrap --config "$CONFIG_FILE" || true + log_success "aks-flex-node unbootstrap completed" + else + log_warning "aks-flex-node or config.json not found, skipping unbootstrap" + # Manually stop services + log_info "Manually stopping services..." + sudo systemctl stop kubelet 2>/dev/null || true + sudo systemctl disable kubelet 2>/dev/null || true + sudo systemctl stop containerd 2>/dev/null || true + fi + + # Remove Arc Agent and Azure resource + log_info "Removing Arc Agent..." + if command -v azcmagent &>/dev/null; then + # First, delete Azure resource (requires az login) + log_info "Deleting Arc machine from Azure..." + ARC_RG=$(sudo azcmagent show 2>/dev/null | grep "Resource Group" | awk -F: '{print $2}' | xargs || echo "") + if [[ -n "$ARC_RG" ]]; then + az connectedmachine delete --resource-group "$ARC_RG" --name "$NODE_NAME" --yes 2>/dev/null || true + log_success "Arc machine deleted from Azure" + else + # Fallback: try using the resource group from args + az connectedmachine delete --resource-group "$RESOURCE_GROUP" --name "$NODE_NAME" --yes 2>/dev/null || true + fi + # Then disconnect locally + sudo azcmagent disconnect --force-local-only 2>/dev/null || true + sudo systemctl stop himdsd extd gcad arcproxyd 2>/dev/null || true + sudo systemctl disable himdsd extd gcad arcproxyd 2>/dev/null || true + # Remove Arc Agent package + if command -v apt &>/dev/null; then + sudo apt remove azcmagent -y 2>/dev/null || true + elif command -v yum &>/dev/null; then + sudo yum remove azcmagent -y 2>/dev/null || true + fi + # Clean up Arc Agent files + sudo rm -rf /var/opt/azcmagent /opt/azcmagent 2>/dev/null || true + log_success "Arc Agent removed" + else + log_info "Arc Agent not found, skipping" + fi + + # Remove client peer from Gateway + if [[ -n "$GATEWAY_PUBLIC_IP" && -n "$CLIENT_PRIVATE_KEY" && -f "$SSH_KEY_PATH" ]]; then + log_info "Removing client peer from Gateway..." + CLIENT_PUBLIC_KEY=$(echo "$CLIENT_PRIVATE_KEY" | wg pubkey 2>/dev/null || echo "") + if [[ -n "$CLIENT_PUBLIC_KEY" ]]; then + ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10 -i "$SSH_KEY_PATH" \ + azureuser@"$GATEWAY_PUBLIC_IP" \ + "sudo wg set wg0 peer '$CLIENT_PUBLIC_KEY' remove && sudo wg-quick save wg0" 2>/dev/null || true + log_success "Client peer removed from Gateway" + fi + fi + + # Stop networking + log_info "Stopping networking..." + sudo wg-quick down "$NETWORK_INTERFACE" 2>/dev/null || true + log_success "Networking stopped" + + # Delete Gateway client configuration + log_info "Deleting Gateway client configuration..." 
+ sudo rm -f /etc/wireguard/${NETWORK_INTERFACE}.conf + log_success "Gateway client configuration deleted" + + # Clean up hosts entries + log_info "Cleaning up hosts entries..." + sudo sed -i '/privatelink.*azmk8s.io/d' /etc/hosts + log_success "Hosts entries cleaned up" + + # Delete Azure resources + log_info "Deleting Azure resources..." + az account set --subscription "$SUBSCRIPTION_ID" + + # Delete Gateway (must complete before deleting NIC) + log_info "Deleting Gateway..." + if az vm show --resource-group "$RESOURCE_GROUP" --name "$GATEWAY_NAME" &>/dev/null; then + az vm delete --resource-group "$RESOURCE_GROUP" --name "$GATEWAY_NAME" --yes --only-show-errors + log_success "Gateway deleted" + else + log_info "Gateway not found, skipping" + fi + + # Delete NIC + NIC_NAME="${GATEWAY_NAME}VMNic" + log_info "Deleting NIC..." + if az network nic show --resource-group "$RESOURCE_GROUP" --name "$NIC_NAME" &>/dev/null; then + az network nic delete --resource-group "$RESOURCE_GROUP" --name "$NIC_NAME" --only-show-errors + log_success "NIC deleted" + else + log_info "NIC not found, skipping" + fi + + # Delete Public IP + PIP_NAME="${GATEWAY_NAME}-pip" + log_info "Deleting Public IP..." + if az network public-ip show --resource-group "$RESOURCE_GROUP" --name "$PIP_NAME" &>/dev/null; then + az network public-ip delete --resource-group "$RESOURCE_GROUP" --name "$PIP_NAME" --only-show-errors + log_success "Public IP deleted" + else + log_info "Public IP not found, skipping" + fi + + # Delete NSG + NSG_NAME="${GATEWAY_NAME}-nsg" + log_info "Deleting NSG..." + if az network nsg show --resource-group "$RESOURCE_GROUP" --name "$NSG_NAME" &>/dev/null; then + az network nsg delete --resource-group "$RESOURCE_GROUP" --name "$NSG_NAME" --only-show-errors + log_success "NSG deleted" + else + log_info "NSG not found, skipping" + fi + + # Delete disks + log_info "Deleting disks..." + DISK_NAMES=$(az disk list --resource-group "$RESOURCE_GROUP" --query "[?contains(name, '${GATEWAY_NAME}')].name" -o tsv 2>/dev/null || echo "") + for disk in $DISK_NAMES; do + az disk delete --resource-group "$RESOURCE_GROUP" --name "$disk" --yes --only-show-errors || true + done + + # Get VNet info and delete subnet + log_info "Deleting Gateway subnet..." + AKS_NODE_RG=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ + --query "nodeResourceGroup" -o tsv 2>/dev/null || echo "") + + if [[ -n "$AKS_NODE_RG" ]]; then + VMSS_NAME=$(az vmss list --resource-group "$AKS_NODE_RG" --query "[0].name" -o tsv 2>/dev/null || echo "") + if [[ -n "$VMSS_NAME" ]]; then + VNET_SUBNET_ID=$(az vmss show --resource-group "$AKS_NODE_RG" --name "$VMSS_NAME" \ + --query "virtualMachineProfile.networkProfile.networkInterfaceConfigurations[0].ipConfigurations[0].subnet.id" -o tsv 2>/dev/null || echo "") + if [[ -n "$VNET_SUBNET_ID" ]]; then + VNET_NAME=$(echo "$VNET_SUBNET_ID" | cut -d'/' -f9) + VNET_RG=$(echo "$VNET_SUBNET_ID" | cut -d'/' -f5) + az network vnet subnet delete --resource-group "$VNET_RG" --vnet-name "$VNET_NAME" \ + --name "$GATEWAY_SUBNET_NAME" 2>/dev/null || true + log_success "Gateway subnet deleted" + fi + fi + fi + + # Delete SSH keys + log_info "Deleting SSH keys..." + rm -f "$SSH_KEY_PATH" "${SSH_KEY_PATH}.pub" + log_success "SSH keys deleted" + + # Delete config.json + log_info "Deleting config file..." + rm -f "$CONFIG_FILE" + + echo "" + log_success "Full cleanup completed!" + echo "" + echo "All components and Azure resources have been removed." + echo "The local machine is now clean." 
+} + +main() { + echo -e "${YELLOW}Remove Edge Node from Private AKS Cluster${NC}" + echo -e "${YELLOW}=====================================${NC}" + echo "" + + parse_args "$@" + + # Install Azure CLI connectedmachine extension if needed + if ! az extension show --name connectedmachine &>/dev/null; then + log_info "Installing Azure CLI connectedmachine extension..." + az config set extension.dynamic_install_allow_preview=true --only-show-errors 2>/dev/null || true + az extension add --name connectedmachine --allow-preview true --only-show-errors 2>/dev/null || true + fi + + case "$CLEANUP_MODE" in + local) + cleanup_local + ;; + full) + cleanup_full + ;; + esac +} + +# Run main +main "$@" diff --git a/pkg/privatecluster/scripts.go b/pkg/privatecluster/scripts.go new file mode 100644 index 0000000..86f72c7 --- /dev/null +++ b/pkg/privatecluster/scripts.go @@ -0,0 +1,109 @@ +package privatecluster + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" +) + +type CleanupMode string + +const ( + // CleanupModeLocal removes node and local components, keeps Gateway for other nodes + CleanupModeLocal CleanupMode = "local" + // CleanupModeFull removes all components including Azure resources (Gateway, subnet, NSG, etc.) + CleanupModeFull CleanupMode = "full" +) + +type ScriptRunner struct { + scriptsDir string +} + +func NewScriptRunner(scriptsDir string) *ScriptRunner { + if scriptsDir == "" { + candidates := []string{ + "./pkg/privatecluster", + } + if execPath, err := os.Executable(); err == nil { + execDir := filepath.Dir(execPath) + candidates = append(candidates, + filepath.Join(execDir, "pkg", "privatecluster"), + execDir, + ) + } + + for _, dir := range candidates { + if _, err := os.Stat(filepath.Join(dir, "private-install.sh")); err == nil { + scriptsDir = dir + break + } + } + } + return &ScriptRunner{scriptsDir: scriptsDir} +} + +// RunPrivateInstall executes the private-install.sh script +// Assumes the Private AKS cluster already exists and user has admin permissions +func (r *ScriptRunner) RunPrivateInstall(ctx context.Context, aksResourceID string) error { + scriptPath := filepath.Join(r.scriptsDir, "private-install.sh") + + // Check if script exists + if _, err := os.Stat(scriptPath); os.IsNotExist(err) { + return fmt.Errorf("script not found: %s", scriptPath) + } + + // Execute script with AKS resource ID as argument + cmd := exec.CommandContext(ctx, "bash", scriptPath, "--aks-resource-id", aksResourceID) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + cmd.Stdin = os.Stdin + + if err := cmd.Run(); err != nil { + return fmt.Errorf("script execution failed: %w", err) + } + + return nil +} + +// RunPrivateUninstall executes the private-uninstall.sh script +// mode: "local" - remove node and local components, keep Gateway +// mode: "full" - remove all components including Azure resources +// aksResourceID is required for "full" mode +func (r *ScriptRunner) RunPrivateUninstall(ctx context.Context, mode CleanupMode, aksResourceID string) error { + scriptPath := filepath.Join(r.scriptsDir, "private-uninstall.sh") + + // Check if script exists + if _, err := os.Stat(scriptPath); os.IsNotExist(err) { + return fmt.Errorf("script not found: %s", scriptPath) + } + + // Build arguments based on mode + var args []string + args = append(args, scriptPath) + + switch mode { + case CleanupModeLocal: + args = append(args, "--local") + case CleanupModeFull: + if aksResourceID == "" { + return fmt.Errorf("--aks-resource-id is required for full cleanup mode") + } + args = 
append(args, "--full", "--aks-resource-id", aksResourceID) + default: + return fmt.Errorf("invalid cleanup mode: %s (use 'local' or 'full')", mode) + } + + // Execute script + cmd := exec.CommandContext(ctx, "bash", args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + cmd.Stdin = os.Stdin + + if err := cmd.Run(); err != nil { + return fmt.Errorf("script execution failed: %w", err) + } + + return nil +} From 204ad2613d41557725b9311a9d41a1c7c5546ab4 Mon Sep 17 00:00:00 2001 From: weiliu2 Date: Wed, 4 Feb 2026 14:26:04 +1300 Subject: [PATCH 02/11] change to go and remove bash file --- .gitignore | 65 +- commands.go | 175 ++-- main.go | 6 +- pkg/components/arc/arc_installer.go | 50 +- pkg/config/structs.go | 1 + pkg/privatecluster/README.md | 59 +- pkg/privatecluster/azure.go | 445 +++++++++++ pkg/privatecluster/create_private_cluster.md | 12 +- pkg/privatecluster/installer.go | 382 +++++++++ pkg/privatecluster/private-install.sh | 796 ------------------- pkg/privatecluster/private-uninstall.sh | 421 ---------- pkg/privatecluster/privatecluster_test.go | 151 ++++ pkg/privatecluster/scripts.go | 104 +-- pkg/privatecluster/ssh.go | 158 ++++ pkg/privatecluster/types.go | 96 +++ pkg/privatecluster/uninstaller.go | 349 ++++++++ pkg/privatecluster/utils.go | 277 +++++++ pkg/privatecluster/vpn.go | 267 +++++++ 18 files changed, 2352 insertions(+), 1462 deletions(-) create mode 100644 pkg/privatecluster/azure.go create mode 100644 pkg/privatecluster/installer.go delete mode 100755 pkg/privatecluster/private-install.sh delete mode 100755 pkg/privatecluster/private-uninstall.sh create mode 100644 pkg/privatecluster/privatecluster_test.go create mode 100644 pkg/privatecluster/ssh.go create mode 100644 pkg/privatecluster/types.go create mode 100644 pkg/privatecluster/uninstaller.go create mode 100644 pkg/privatecluster/utils.go create mode 100644 pkg/privatecluster/vpn.go diff --git a/.gitignore b/.gitignore index 29dc8a2..232ac73 100644 --- a/.gitignore +++ b/.gitignore @@ -1,48 +1,41 @@ # Binaries for programs and plugins -*.exe -*.exe~ +# Build artifacts +# Config files with sensitive data (keep sample config) +# Go workspace file +# IDE and editor files +# Log files +# OS generated files +# Output of the go coverage tool, specifically when used with LiteIDE +# Test binary, built with `go test -c` +# Test coverage reports *.dll -*.so *.dylib -aks-flex-node - -# Test binary, built with `go test -c` -*.test - -# Output of the go coverage tool, specifically when used with LiteIDE +*.exe +*.exe~ +*.log *.out - -# Test coverage reports -coverage.out -coverage.html -coverage.xml - -# Go workspace file -go.work - -# IDE and editor files -.vscode/ -.idea/ -*.swp +*.so *.swo +*.swp +*.test *~ - -# OS generated files .DS_Store .DS_Store? 
-._* .Spotlight-V100 .Trashes -ehthumbs.db -Thumbs.db - -# Log files -*.log +._* +.idea/ +.vscode/ +/build/ +/dist/AKSFlexNode /var/log/ - -# Config files with sensitive data (keep sample config) +AKSFlexNode +Standard_D8pds_v6_sku.json +Thumbs.db +aks-flex-node config.json - -# Build artifacts -/build/ -/dist/ \ No newline at end of file +coverage.html +coverage.out +coverage.xml +ehthumbs.db +go.work diff --git a/commands.go b/commands.go index c0d8fe4..1e106be 100644 --- a/commands.go +++ b/commands.go @@ -25,6 +25,9 @@ var ( BuildTime = "unknown" ) +// Unbootstrap command flags +var cleanupMode string + // NewAgentCommand creates a new agent command func NewAgentCommand() *cobra.Command { cmd := &cobra.Command{ @@ -44,12 +47,19 @@ func NewUnbootstrapCommand() *cobra.Command { cmd := &cobra.Command{ Use: "unbootstrap", Short: "Remove AKS node configuration and Arc connection", - Long: "Clean up and remove all AKS node components and Arc registration from this machine", + Long: `Clean up and remove all AKS node components and Arc registration from this machine. + +For private clusters (config has private: true), this also handles VPN cleanup: + --cleanup-mode=local Remove node and local VPN config, keep Gateway (default) + --cleanup-mode=full Remove everything including Gateway VM and Azure resources`, RunE: func(cmd *cobra.Command, args []string) error { return runUnbootstrap(cmd.Context()) }, } + cmd.Flags().StringVar(&cleanupMode, "cleanup-mode", "local", + "Private cluster cleanup mode: 'local' (keep Gateway) or 'full' (remove all Azure resources)") + return cmd } @@ -76,6 +86,19 @@ func runAgent(ctx context.Context) error { return fmt.Errorf("failed to load config from %s: %w", configPath, err) } + // For private clusters, run Gateway/VPN setup before bootstrap + if cfg.Azure.TargetCluster != nil && cfg.Azure.TargetCluster.Private { + logger.Info("Private cluster detected, running Gateway/VPN setup...") + if os.Getuid() != 0 { + return fmt.Errorf("private cluster setup requires root privileges, please run with 'sudo'") + } + runner := privatecluster.NewScriptRunner("") + if err := runner.RunPrivateInstall(ctx, cfg.Azure.TargetCluster.ResourceID); err != nil { + return fmt.Errorf("private cluster setup failed: %w", err) + } + logger.Info("Private cluster setup completed") + } + bootstrapExecutor := bootstrapper.New(cfg, logger) result, err := bootstrapExecutor.Bootstrap(ctx) if err != nil { @@ -87,6 +110,13 @@ func runAgent(ctx context.Context) error { return err } + // Print visible success message + fmt.Println() + fmt.Println("========================================") + fmt.Println(" Join process finished successfully!") + fmt.Println("========================================") + fmt.Println() + // After successful bootstrap, transition to daemon mode logger.Info("Bootstrap completed successfully, transitioning to daemon mode...") return runDaemonLoop(ctx, cfg) @@ -101,6 +131,39 @@ func runUnbootstrap(ctx context.Context) error { return fmt.Errorf("failed to load config from %s: %w", configPath, err) } + // For private clusters, run VPN/Gateway cleanup first + if cfg.Azure.TargetCluster != nil && cfg.Azure.TargetCluster.Private { + logger.Info("Private cluster detected, running VPN/Gateway cleanup...") + + // Validate cleanup mode + var mode privatecluster.CleanupMode + switch cleanupMode { + case "local": + mode = privatecluster.CleanupModeLocal + case "full": + mode = privatecluster.CleanupModeFull + default: + return fmt.Errorf("invalid cleanup mode: %s (use 'local' or 
'full')", cleanupMode) + } + + // Check root privileges for private cluster cleanup + if os.Getuid() != 0 { + return fmt.Errorf("private cluster cleanup requires root privileges, please run with 'sudo'") + } + + options := privatecluster.UninstallOptions{ + Mode: mode, + AKSResourceID: cfg.Azure.TargetCluster.ResourceID, + } + uninstaller := privatecluster.NewUninstaller(options) + if err := uninstaller.Uninstall(ctx); err != nil { + logger.Warnf("Private cluster cleanup had errors: %v", err) + // Continue with normal unbootstrap even if private cleanup has issues + } + logger.Info("Private cluster cleanup completed") + } + + // Run normal unbootstrap bootstrapExecutor := bootstrapper.New(cfg, logger) result, err := bootstrapExecutor.Unbootstrap(ctx) if err != nil { @@ -108,7 +171,15 @@ func runUnbootstrap(ctx context.Context) error { } // Handle and log the result (unbootstrap is more lenient with failures) - return handleExecutionResult(result, "unbootstrap", logger) + if err := handleExecutionResult(result, "unbootstrap", logger); err != nil { + return err + } + + // Print final success message + fmt.Println() + fmt.Println("\033[0;32mSUCCESS:\033[0m Unbootstrap completed successfully!") + + return nil } // runVersion displays version information @@ -119,106 +190,6 @@ func runVersion() { fmt.Printf("Build Time: %s\n", BuildTime) } -// Private cluster command variables -var ( - aksResourceID string - cleanupModeFlag string -) - -// NewPrivateJoinCommand creates a new private-join command -func NewPrivateJoinCommand() *cobra.Command { - cmd := &cobra.Command{ - Use: "private-join", - Short: "Join a Private AKS cluster (requires sudo)", - Long: `Join a Private AKS cluster. - -Prerequisites: - 1. A Private AKS cluster must exist with AAD and Azure RBAC enabled - See: pkg/privatecluster/create_private_cluster.md - - 2. Current user must have the following roles on the cluster: - - Azure Kubernetes Service Cluster Admin Role - - Azure Kubernetes Service RBAC Cluster Admin - - 3. Current user must be logged in via 'sudo az login' - -The full resource ID of the Private AKS cluster is required as the --aks-resource-id parameter. -This same resource ID can be used later with the private-leave command.`, - RunE: func(cmd *cobra.Command, args []string) error { - return runPrivateJoin(cmd.Context()) - }, - } - - cmd.Flags().StringVar(&aksResourceID, "aks-resource-id", "", "AKS cluster resource ID (required)") - cmd.MarkFlagRequired("aks-resource-id") - - return cmd -} - -// NewPrivateLeaveCommand creates a new private-leave command -func NewPrivateLeaveCommand() *cobra.Command { - cmd := &cobra.Command{ - Use: "private-leave", - Short: "Leave a Private AKS cluster (--mode=local|full, requires sudo)", - Long: `Remove this edge node from a Private AKS cluster. 
- -Cleanup modes: - --local Local cleanup only (default): - - Remove node from AKS cluster - - Run aks-flex-node unbootstrap - - Remove Arc Agent - - Stop VPN and remove client config - - Keep Gateway for other nodes - - --full Full cleanup (requires --aks-resource-id): - - All local cleanup steps - - Delete Gateway VM - - Delete Gateway subnet, NSG, Public IP - - Delete SSH keys - -This command requires the current user to be logged in via 'sudo az login'.`, - RunE: func(cmd *cobra.Command, args []string) error { - return runPrivateLeave(cmd.Context()) - }, - } - - cmd.Flags().StringVar(&cleanupModeFlag, "mode", "local", "Cleanup mode: 'local' (keep Gateway) or 'full' (remove all Azure resources)") - cmd.Flags().StringVar(&aksResourceID, "aks-resource-id", "", "AKS cluster resource ID (required for --mode=full)") - - return cmd -} - -// runPrivateJoin executes the private cluster join process -func runPrivateJoin(ctx context.Context) error { - if os.Getuid() != 0 { - return fmt.Errorf("this command requires root privileges, please run with 'sudo'") - } - runner := privatecluster.NewScriptRunner("") - return runner.RunPrivateInstall(ctx, aksResourceID) -} - -// runPrivateLeave executes the private cluster leave process -func runPrivateLeave(ctx context.Context) error { - if os.Getuid() != 0 { - return fmt.Errorf("this command requires root privileges, please run with 'sudo'") - } - // Validate cleanup mode - var mode privatecluster.CleanupMode - switch cleanupModeFlag { - case "local": - mode = privatecluster.CleanupModeLocal - case "full": - mode = privatecluster.CleanupModeFull - if aksResourceID == "" { - return fmt.Errorf("--aks-resource-id is required for full cleanup mode") - } - default: - return fmt.Errorf("invalid cleanup mode: %s (use 'local' or 'full')", cleanupModeFlag) - } - - runner := privatecluster.NewScriptRunner("") - return runner.RunPrivateUninstall(ctx, mode, aksResourceID) -} // runDaemonLoop runs the periodic status collection and bootstrap monitoring daemon func runDaemonLoop(ctx context.Context, cfg *config.Config) error { diff --git a/main.go b/main.go index a5ad585..dbcf4e6 100644 --- a/main.go +++ b/main.go @@ -26,15 +26,13 @@ func main() { // Add global flags for configuration rootCmd.PersistentFlags().StringVar(&configPath, "config", "", "Path to configuration JSON file (required for agent/unbootstrap)") - rootCmd.PersistentFlags().MarkHidden("config") // Hide from global help, shown in agent/unbootstrap help + _ = rootCmd.PersistentFlags().MarkHidden("config") // Don't mark as required globally - we'll check in PersistentPreRunE for commands that need it // Add commands rootCmd.AddCommand(NewAgentCommand()) rootCmd.AddCommand(NewUnbootstrapCommand()) rootCmd.AddCommand(NewVersionCommand()) - rootCmd.AddCommand(NewPrivateJoinCommand()) - rootCmd.AddCommand(NewPrivateLeaveCommand()) // Set up context with signal handling ctx, cancel := context.WithCancel(context.Background()) @@ -54,7 +52,7 @@ func main() { rootCmd.PersistentPreRunE = func(cmd *cobra.Command, args []string) error { // Skip config loading for commands that don't need it switch cmd.Name() { - case "version", "private-join", "private-leave": + case "version": return nil } diff --git a/pkg/components/arc/arc_installer.go b/pkg/components/arc/arc_installer.go index bacf435..55a6fb8 100644 --- a/pkg/components/arc/arc_installer.go +++ b/pkg/components/arc/arc_installer.go @@ -139,11 +139,32 @@ func (i *Installer) IsCompleted(ctx context.Context) bool { func (i *Installer) registerArcMachine(ctx 
context.Context) (*armhybridcompute.Machine, error) { i.logger.Info("Registering machine with Azure Arc using Arc agent") - // Check if already registered + // Check if already registered in Azure AND locally connected machine, err := i.getArcMachine(ctx) if err == nil && machine != nil { - i.logger.Infof("Machine already registered as Arc machine: %s", to.String(machine.Name)) - return machine, nil + // Azure resource exists, but also verify local agent is connected + if i.isLocalAgentConnected(ctx) { + i.logger.Infof("Machine already registered and locally connected as Arc machine: %s", to.String(machine.Name)) + return machine, nil + } + i.logger.Warnf("Arc machine '%s' exists in Azure but local agent is disconnected, re-connecting...", to.String(machine.Name)) + + // Step 1: Clean up local agent state + i.logger.Info("Cleaning up local agent state...") + disconnectCmd := exec.CommandContext(ctx, "azcmagent", "disconnect", "--force-local-only") + if output, err := disconnectCmd.CombinedOutput(); err != nil { + i.logger.Warnf("Local disconnect had issues (continuing): %v, output: %s", err, string(output)) + } + + // Step 2: Delete the stale Azure Arc resource so connect can recreate it + arcResourceGroup := i.config.GetArcResourceGroup() + arcMachineName := i.config.GetArcMachineName() + i.logger.Infof("Deleting stale Arc machine resource '%s' from Azure...", arcMachineName) + if _, err := i.hybridComputeMachineClient.Delete(ctx, arcResourceGroup, arcMachineName, nil); err != nil { + i.logger.Warnf("Failed to delete Arc machine from Azure (continuing): %v", err) + } else { + i.logger.Info("Stale Arc machine resource deleted from Azure") + } } // Register using Arc agent command @@ -156,6 +177,29 @@ func (i *Installer) registerArcMachine(ctx context.Context) (*armhybridcompute.M return i.waitForArcRegistration(ctx) } +// isLocalAgentConnected checks if the local Arc agent is connected +func (i *Installer) isLocalAgentConnected(ctx context.Context) bool { + timeoutCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + cmd := exec.CommandContext(timeoutCtx, "azcmagent", "show") + output, err := cmd.Output() + if err != nil { + i.logger.Debugf("azcmagent show failed: %v", err) + return false + } + + for _, line := range strings.Split(string(output), "\n") { + if strings.Contains(line, "Agent Status") && strings.Contains(line, ":") { + parts := strings.SplitN(line, ":", 2) + if len(parts) == 2 { + return strings.TrimSpace(strings.ToLower(parts[1])) == "connected" + } + } + } + return false +} + func (i *Installer) validateManagedCluster(ctx context.Context) error { i.logger.Info("Validating target AKS Managed Cluster requirements for Azure RBAC authentication") diff --git a/pkg/config/structs.go b/pkg/config/structs.go index 3366fcf..2b08cb9 100644 --- a/pkg/config/structs.go +++ b/pkg/config/structs.go @@ -39,6 +39,7 @@ type ServicePrincipalConfig struct { type TargetClusterConfig struct { ResourceID string `json:"resourceId"` // Full resource ID of the target AKS cluster Location string `json:"location"` // Azure region of the cluster (e.g., "eastus", "westus2") + Private bool `json:"private"` // Whether this is a private AKS cluster (requires Gateway/VPN setup) Name string // will be populated from ResourceID ResourceGroup string // will be populated from ResourceID SubscriptionID string // will be populated from ResourceID diff --git a/pkg/privatecluster/README.md b/pkg/privatecluster/README.md index 285d2eb..eba1fd8 100644 --- a/pkg/privatecluster/README.md +++ 
b/pkg/privatecluster/README.md @@ -14,6 +14,39 @@ Create a Private AKS cluster with AAD and Azure RBAC enabled, and assign the req See: [create_private_cluster.md](create_private_cluster.md) +### 3. Prepare Configuration File + +Create a `config.json` with `"private": true` in the `targetCluster` section: + +```json +{ + "azure": { + "subscriptionId": "", + "tenantId": "", + "targetCluster": { + "resourceId": "/subscriptions//resourceGroups//providers/Microsoft.ContainerService/managedClusters/", + "location": "eastus2", + "private": true + }, + "arc": { + "resourceGroup": "", + "location": "eastus2" + } + }, + "kubernetes": { + "version": "1.33.0" + }, + "containerd": { + "version": "1.7.11", + "pauseImage": "mcr.microsoft.com/oss/kubernetes/pause:3.6" + }, + "agent": { + "logLevel": "info", + "logDir": "/var/log/aks-flex-node" + } +} +``` + ## Join Private AKS Cluster ### 1. Build the project @@ -24,16 +57,18 @@ go build -o aks-flex-node . ### 2. Join the cluster -```bash -sudo ./aks-flex-node private-join --aks-resource-id "" -``` +When the config has `"private": true`, the `agent` command automatically sets up the Gateway/VPN before bootstrapping: -Example: ```bash -sudo ./aks-flex-node private-join \ - --aks-resource-id "/subscriptions/xxx/resourcegroups/my-rg/providers/Microsoft.ContainerService/managedClusters/my-private-aks" +sudo ./aks-flex-node agent --config config.json ``` +This will: +1. Detect private cluster from config +2. Set up Gateway VM and VPN tunnel (WireGuard) +3. Run normal bootstrap (Arc, containerd, kubelet, etc.) +4. Enter daemon mode for status monitoring + ### 3. Verify ```bash @@ -42,18 +77,20 @@ sudo kubectl get nodes ## Leave Private AKS Cluster +When the config has `"private": true`, the `unbootstrap` command automatically handles VPN/Gateway cleanup: + ```bash -sudo ./aks-flex-node private-leave --mode= [--aks-resource-id ""] +sudo ./aks-flex-node unbootstrap --config config.json [--cleanup-mode ] ``` ### Mode Comparison | Mode | Command | Description | |------|---------|-------------| -| `local` | `sudo ./aks-flex-node private-leave --mode=local` | Remove node and local components, **keep Gateway** for other nodes | -| `full` | `sudo ./aks-flex-node private-leave --mode=full --aks-resource-id "..."` | Remove all components **including Gateway and Azure resources** | +| `local` (default) | `sudo ./aks-flex-node unbootstrap --config config.json` | Remove node and local VPN config, **keep Gateway** for other nodes | +| `full` | `sudo ./aks-flex-node unbootstrap --config config.json --cleanup-mode full` | Remove all components **including Gateway VM and Azure resources** | ### When to use each mode -- **`--mode=local`**: Other nodes are still using the Gateway, or you plan to rejoin later -- **`--mode=full`**: Last node leaving, clean up all Azure resources (Gateway VM, subnet, NSG, public IP) +- **`--cleanup-mode=local`** (default): Other nodes are still using the Gateway, or you plan to rejoin later +- **`--cleanup-mode=full`**: Last node leaving, clean up all Azure resources (Gateway VM, subnet, NSG, public IP) diff --git a/pkg/privatecluster/azure.go b/pkg/privatecluster/azure.go new file mode 100644 index 0000000..c08a84e --- /dev/null +++ b/pkg/privatecluster/azure.go @@ -0,0 +1,445 @@ +package privatecluster + +import ( + "context" + "encoding/json" + "fmt" + "strings" +) + +// AzureCLI provides Azure CLI operations +type AzureCLI struct { + logger *Logger +} + +// NewAzureCLI creates a new AzureCLI instance +func NewAzureCLI(logger *Logger) *AzureCLI 
{ + return &AzureCLI{logger: logger} +} + +// CheckInstalled verifies Azure CLI is installed +func (az *AzureCLI) CheckInstalled() error { + if !CommandExists("az") { + return fmt.Errorf("azure CLI not installed, please install: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash") + } + return nil +} + +// CheckLogin verifies Azure CLI is logged in +func (az *AzureCLI) CheckLogin(ctx context.Context) error { + if !RunCommandSilent(ctx, "az", "account", "show") { + return fmt.Errorf("azure CLI not logged in, please run 'az login' first") + } + return nil +} + +// CheckAndRefreshToken checks if token is valid and refreshes if needed +func (az *AzureCLI) CheckAndRefreshToken(ctx context.Context) error { + if !RunCommandSilent(ctx, "az", "account", "get-access-token", "--only-show-errors") { + az.logger.Warning("Azure token expired or invalid, re-authenticating...") + return RunCommandInteractive(ctx, "az", "login") + } + return nil +} + +// SetSubscription sets the active subscription +func (az *AzureCLI) SetSubscription(ctx context.Context, subscriptionID string) error { + _, err := RunCommand(ctx, "az", "account", "set", "--subscription", subscriptionID) + return err +} + +// GetTenantID returns the current tenant ID +func (az *AzureCLI) GetTenantID(ctx context.Context) (string, error) { + return RunCommand(ctx, "az", "account", "show", "--query", "tenantId", "-o", "tsv") +} + +// AKSClusterExists checks if an AKS cluster exists +func (az *AzureCLI) AKSClusterExists(ctx context.Context, resourceGroup, clusterName string) bool { + return RunCommandSilent(ctx, "az", "aks", "show", + "--resource-group", resourceGroup, + "--name", clusterName) +} + +// GetAKSClusterInfo retrieves AKS cluster information +func (az *AzureCLI) GetAKSClusterInfo(ctx context.Context, resourceGroup, clusterName string) (*AKSClusterInfo, error) { + info := &AKSClusterInfo{ + ResourceGroup: resourceGroup, + ClusterName: clusterName, + } + + // Get AAD enabled status + aadEnabled, _ := RunCommand(ctx, "az", "aks", "show", + "--resource-group", resourceGroup, + "--name", clusterName, + "--query", "aadProfile.managed", "-o", "tsv") + + if strings.ToLower(aadEnabled) != "true" { + return nil, fmt.Errorf("AKS cluster AAD not enabled, please enable: az aks update --enable-aad") + } + + // Get RBAC enabled status + rbacEnabled, _ := RunCommand(ctx, "az", "aks", "show", + "--resource-group", resourceGroup, + "--name", clusterName, + "--query", "aadProfile.enableAzureRbac", "-o", "tsv") + + if strings.ToLower(rbacEnabled) != "true" { + return nil, fmt.Errorf("AKS cluster Azure RBAC not enabled, please enable: az aks update --enable-azure-rbac") + } + + // Get location + location, err := RunCommand(ctx, "az", "aks", "show", + "--resource-group", resourceGroup, + "--name", clusterName, + "--query", "location", "-o", "tsv") + if err != nil { + return nil, fmt.Errorf("failed to get cluster location: %w", err) + } + info.Location = location + + // Get node resource group + nodeRG, err := RunCommand(ctx, "az", "aks", "show", + "--resource-group", resourceGroup, + "--name", clusterName, + "--query", "nodeResourceGroup", "-o", "tsv") + if err != nil { + return nil, fmt.Errorf("failed to get node resource group: %w", err) + } + info.NodeResourceGroup = nodeRG + + // Get private FQDN + privateFQDN, err := RunCommand(ctx, "az", "aks", "show", + "--resource-group", resourceGroup, + "--name", clusterName, + "--query", "privateFqdn", "-o", "tsv") + if err != nil { + return nil, fmt.Errorf("failed to get private FQDN: %w", err) + } + 
info.PrivateFQDN = privateFQDN + + return info, nil +} + +// GetVNetInfo retrieves VNet information from AKS VMSS +func (az *AzureCLI) GetVNetInfo(ctx context.Context, nodeResourceGroup string) (vnetName, vnetRG string, err error) { + // Get first VMSS name + vmssName, err := RunCommand(ctx, "az", "vmss", "list", + "--resource-group", nodeResourceGroup, + "--query", "[0].name", "-o", "tsv") + if err != nil || vmssName == "" { + return "", "", fmt.Errorf("cannot find AKS node VMSS in %s", nodeResourceGroup) + } + + // Get subnet ID from VMSS + subnetID, err := RunCommand(ctx, "az", "vmss", "show", + "--resource-group", nodeResourceGroup, + "--name", vmssName, + "--query", "virtualMachineProfile.networkProfile.networkInterfaceConfigurations[0].ipConfigurations[0].subnet.id", + "-o", "tsv") + if err != nil { + return "", "", fmt.Errorf("failed to get subnet ID from VMSS: %w", err) + } + + // Parse VNet name and resource group from subnet ID + // Format: /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.Network/virtualNetworks/{vnet}/subnets/{subnet} + parts := strings.Split(subnetID, "/") + if len(parts) < 9 { + return "", "", fmt.Errorf("invalid subnet ID format: %s", subnetID) + } + + vnetRG = parts[4] + vnetName = parts[8] + + return vnetName, vnetRG, nil +} + +// VMExists checks if a VM exists +func (az *AzureCLI) VMExists(ctx context.Context, resourceGroup, vmName string) bool { + return RunCommandSilent(ctx, "az", "vm", "show", + "--resource-group", resourceGroup, + "--name", vmName) +} + +// GetVMPublicIP retrieves a VM's public IP address +func (az *AzureCLI) GetVMPublicIP(ctx context.Context, resourceGroup, vmName string) (string, error) { + return RunCommand(ctx, "az", "vm", "list-ip-addresses", + "--resource-group", resourceGroup, + "--name", vmName, + "--query", "[0].virtualMachine.network.publicIpAddresses[0].ipAddress", + "-o", "tsv") +} + +// CreateSubnet creates a subnet in a VNet +func (az *AzureCLI) CreateSubnet(ctx context.Context, vnetRG, vnetName, subnetName, addressPrefix string) error { + // Check if subnet exists + if RunCommandSilent(ctx, "az", "network", "vnet", "subnet", "show", + "--resource-group", vnetRG, + "--vnet-name", vnetName, + "--name", subnetName) { + az.logger.Info("Subnet %s already exists", subnetName) + return nil + } + + _, err := RunCommand(ctx, "az", "network", "vnet", "subnet", "create", + "--resource-group", vnetRG, + "--vnet-name", vnetName, + "--name", subnetName, + "--address-prefixes", addressPrefix) + return err +} + +// CreateNSG creates a network security group with rules +func (az *AzureCLI) CreateNSG(ctx context.Context, resourceGroup, nsgName string, vpnPort int) error { + // Check if NSG exists + if RunCommandSilent(ctx, "az", "network", "nsg", "show", + "--resource-group", resourceGroup, + "--name", nsgName) { + az.logger.Info("NSG %s already exists", nsgName) + return nil + } + + // Create NSG + if _, err := RunCommand(ctx, "az", "network", "nsg", "create", + "--resource-group", resourceGroup, + "--name", nsgName); err != nil { + return fmt.Errorf("failed to create NSG: %w", err) + } + + // Add SSH rule (priority 100 to override NRMS-Rule-106) + if _, err := RunCommand(ctx, "az", "network", "nsg", "rule", "create", + "--resource-group", resourceGroup, + "--nsg-name", nsgName, + "--name", "allow-ssh", + "--priority", "100", + "--destination-port-ranges", "22", + "--protocol", "Tcp", + "--access", "Allow"); err != nil { + return fmt.Errorf("failed to create SSH rule: %w", err) + } + + // Add VPN rule + if _, err := 
RunCommand(ctx, "az", "network", "nsg", "rule", "create", + "--resource-group", resourceGroup, + "--nsg-name", nsgName, + "--name", "allow-vpn", + "--priority", "200", + "--destination-port-ranges", fmt.Sprintf("%d", vpnPort), + "--protocol", "Udp", + "--access", "Allow"); err != nil { + return fmt.Errorf("failed to create VPN rule: %w", err) + } + + return nil +} + +// CreatePublicIP creates a static public IP +func (az *AzureCLI) CreatePublicIP(ctx context.Context, resourceGroup, pipName string) error { + // Check if PIP exists + if RunCommandSilent(ctx, "az", "network", "public-ip", "show", + "--resource-group", resourceGroup, + "--name", pipName) { + az.logger.Info("Public IP %s already exists", pipName) + return nil + } + + _, err := RunCommand(ctx, "az", "network", "public-ip", "create", + "--resource-group", resourceGroup, + "--name", pipName, + "--sku", "Standard", + "--allocation-method", "Static") + return err +} + +// GetPublicIPAddress retrieves a public IP address +func (az *AzureCLI) GetPublicIPAddress(ctx context.Context, resourceGroup, pipName string) (string, error) { + return RunCommand(ctx, "az", "network", "public-ip", "show", + "--resource-group", resourceGroup, + "--name", pipName, + "--query", "ipAddress", "-o", "tsv") +} + +// CreateVM creates a VM with specified configuration +func (az *AzureCLI) CreateVM(ctx context.Context, resourceGroup, vmName, vnetName, subnetName, nsgName, pipName, sshKeyPath, vmSize string) error { + _, err := RunCommand(ctx, "az", "vm", "create", + "--resource-group", resourceGroup, + "--name", vmName, + "--image", "Ubuntu2204", + "--size", vmSize, + "--vnet-name", vnetName, + "--subnet", subnetName, + "--nsg", nsgName, + "--public-ip-address", pipName, + "--admin-username", "azureuser", + "--ssh-key-values", sshKeyPath+".pub", + "--zone", "1") + return err +} + +// AddSSHKeyToVM adds an SSH key to a VM +func (az *AzureCLI) AddSSHKeyToVM(ctx context.Context, resourceGroup, vmName, sshKeyPath string) error { + pubKey, err := ReadFileContent(sshKeyPath + ".pub") + if err != nil { + return fmt.Errorf("failed to read SSH public key: %w", err) + } + + _, err = RunCommand(ctx, "az", "vm", "user", "update", + "--resource-group", resourceGroup, + "--name", vmName, + "--username", "azureuser", + "--ssh-key-value", strings.TrimSpace(pubKey), + "--output", "none") + return err +} + +// RestartVM restarts a VM +func (az *AzureCLI) RestartVM(ctx context.Context, resourceGroup, vmName string) error { + _, err := RunCommand(ctx, "az", "vm", "restart", + "--resource-group", resourceGroup, + "--name", vmName, + "--no-wait") + return err +} + +// DeleteVM deletes a VM +func (az *AzureCLI) DeleteVM(ctx context.Context, resourceGroup, vmName string) error { + if !az.VMExists(ctx, resourceGroup, vmName) { + return nil + } + _, err := RunCommand(ctx, "az", "vm", "delete", + "--resource-group", resourceGroup, + "--name", vmName, + "--yes", + "--only-show-errors") + return err +} + +// DeleteNIC deletes a network interface +func (az *AzureCLI) DeleteNIC(ctx context.Context, resourceGroup, nicName string) error { + if !RunCommandSilent(ctx, "az", "network", "nic", "show", + "--resource-group", resourceGroup, + "--name", nicName) { + return nil + } + _, err := RunCommand(ctx, "az", "network", "nic", "delete", + "--resource-group", resourceGroup, + "--name", nicName, + "--only-show-errors") + return err +} + +// DeletePublicIP deletes a public IP +func (az *AzureCLI) DeletePublicIP(ctx context.Context, resourceGroup, pipName string) error { + if 
!RunCommandSilent(ctx, "az", "network", "public-ip", "show", + "--resource-group", resourceGroup, + "--name", pipName) { + return nil + } + _, err := RunCommand(ctx, "az", "network", "public-ip", "delete", + "--resource-group", resourceGroup, + "--name", pipName, + "--only-show-errors") + return err +} + +// DeleteNSG deletes a network security group +func (az *AzureCLI) DeleteNSG(ctx context.Context, resourceGroup, nsgName string) error { + if !RunCommandSilent(ctx, "az", "network", "nsg", "show", + "--resource-group", resourceGroup, + "--name", nsgName) { + return nil + } + _, err := RunCommand(ctx, "az", "network", "nsg", "delete", + "--resource-group", resourceGroup, + "--name", nsgName, + "--only-show-errors") + return err +} + +// DeleteSubnet deletes a subnet +func (az *AzureCLI) DeleteSubnet(ctx context.Context, vnetRG, vnetName, subnetName string) error { + _, _ = RunCommand(ctx, "az", "network", "vnet", "subnet", "delete", + "--resource-group", vnetRG, + "--vnet-name", vnetName, + "--name", subnetName) + return nil // Ignore errors +} + +// DeleteDisks deletes disks matching a pattern +func (az *AzureCLI) DeleteDisks(ctx context.Context, resourceGroup, pattern string) error { + output, err := RunCommand(ctx, "az", "disk", "list", + "--resource-group", resourceGroup, + "--query", fmt.Sprintf("[?contains(name, '%s')].name", pattern), + "-o", "json") + if err != nil { + return nil // Ignore errors + } + + var diskNames []string + if err := json.Unmarshal([]byte(output), &diskNames); err != nil { + return nil + } + + for _, disk := range diskNames { + _, _ = RunCommand(ctx, "az", "disk", "delete", + "--resource-group", resourceGroup, + "--name", disk, + "--yes", + "--only-show-errors") + } + + return nil +} + +// DeleteConnectedMachine deletes an Arc connected machine +func (az *AzureCLI) DeleteConnectedMachine(ctx context.Context, resourceGroup, machineName string) error { + _, _ = RunCommand(ctx, "az", "connectedmachine", "delete", + "--resource-group", resourceGroup, + "--name", machineName, + "--yes") + return nil // Ignore errors +} + +// GetAKSCredentials gets AKS cluster credentials +func (az *AzureCLI) GetAKSCredentials(ctx context.Context, resourceGroup, clusterName, kubeconfigPath string) error { + // Ensure directory exists + if err := EnsureDirectory("/root/.kube"); err != nil { + return err + } + + _, err := RunCommand(ctx, "az", "aks", "get-credentials", + "--resource-group", resourceGroup, + "--name", clusterName, + "--overwrite-existing", + "--file", kubeconfigPath) + return err +} + +// InstallAKSCLI installs kubectl and kubelogin +func (az *AzureCLI) InstallAKSCLI(ctx context.Context) error { + _, err := RunCommand(ctx, "az", "aks", "install-cli", + "--install-location", "/usr/local/bin/kubectl", + "--kubelogin-install-location", "/usr/local/bin/kubelogin") + if err != nil { + return err + } + + _, _ = RunCommand(ctx, "chmod", "+x", "/usr/local/bin/kubectl", "/usr/local/bin/kubelogin") + return nil +} + +// InstallConnectedMachineExtension installs the connectedmachine extension +func (az *AzureCLI) InstallConnectedMachineExtension(ctx context.Context) error { + // Check if already installed + if RunCommandSilent(ctx, "az", "extension", "show", "--name", "connectedmachine") { + return nil + } + + _, _ = RunCommand(ctx, "az", "config", "set", "extension.dynamic_install_allow_preview=true", "--only-show-errors") + + // Install extension + _, err := RunCommand(ctx, "az", "extension", "add", + "--name", "connectedmachine", + "--allow-preview", "true", + 
"--only-show-errors") + return err +} diff --git a/pkg/privatecluster/create_private_cluster.md b/pkg/privatecluster/create_private_cluster.md index bfbac97..e4fe3ff 100644 --- a/pkg/privatecluster/create_private_cluster.md +++ b/pkg/privatecluster/create_private_cluster.md @@ -130,7 +130,7 @@ sudo kubelogin convert-kubeconfig -l azurecli --kubeconfig /root/.kube/config ## Step 6: Get Cluster Resource ID -Save this for use with `private-join` and `private-leave` commands: +Save this for use in the `config.json` file's `targetCluster.resourceId` field: ```bash az aks show \ @@ -148,18 +148,18 @@ Example output: ### Join an edge node to the private cluster +Set `"private": true` in your `config.json`, then run: + ```bash -sudo ./aks-flex-node private-join \ - --aks-resource-id "/subscriptions/.../resourcegroups/.../providers/Microsoft.ContainerService/managedClusters/my-private-aks" +sudo ./aks-flex-node agent --config config.json ``` ### Leave the private cluster ```bash # Local cleanup (keep Gateway for other nodes) -sudo ./aks-flex-node private-leave --mode=local +sudo ./aks-flex-node unbootstrap --config config.json # Full cleanup (remove Gateway and all Azure resources) -sudo ./aks-flex-node private-leave --mode=full \ - --aks-resource-id "/subscriptions/.../resourcegroups/.../providers/Microsoft.ContainerService/managedClusters/my-private-aks" +sudo ./aks-flex-node unbootstrap --config config.json --cleanup-mode full ``` diff --git a/pkg/privatecluster/installer.go b/pkg/privatecluster/installer.go new file mode 100644 index 0000000..f6c0e9c --- /dev/null +++ b/pkg/privatecluster/installer.go @@ -0,0 +1,382 @@ +package privatecluster + +import ( + "context" + "fmt" + "time" +) + +// Installer handles private cluster installation +type Installer struct { + logger *Logger + azure *AzureCLI + options InstallOptions + + // State collected during installation + clusterInfo *AKSClusterInfo + vpnConfig VPNConfig + sshKeyPath string + gatewayIP string +} + +// NewInstaller creates a new Installer instance +func NewInstaller(options InstallOptions) *Installer { + logger := NewLogger(options.Verbose) + + // Apply defaults + if options.Gateway.Name == "" { + options.Gateway = DefaultGatewayConfig() + } + + return &Installer{ + logger: logger, + azure: NewAzureCLI(logger), + options: options, + vpnConfig: DefaultVPNConfig(), + sshKeyPath: GetSSHKeyPath(), + } +} + +// Install runs the complete installation process +func (i *Installer) Install(ctx context.Context) error { + fmt.Printf("%s========================================%s\n", colorGreen, colorReset) + fmt.Printf("%s Add Edge Node to Private AKS Cluster%s\n", colorGreen, colorReset) + fmt.Printf("%s========================================%s\n\n", colorGreen, colorReset) + + // Parse resource ID + subscriptionID, resourceGroup, clusterName, err := ParseResourceID(i.options.AKSResourceID) + if err != nil { + return err + } + + i.clusterInfo = &AKSClusterInfo{ + ResourceID: i.options.AKSResourceID, + SubscriptionID: subscriptionID, + ResourceGroup: resourceGroup, + ClusterName: clusterName, + } + + // Phase 1: Environment Check + if err := i.phase1EnvironmentCheck(ctx); err != nil { + return fmt.Errorf("environment check failed: %w", err) + } + + // Phase 2: Gateway Setup + if err := i.phase2GatewaySetup(ctx); err != nil { + return fmt.Errorf("gateway setup failed: %w", err) + } + + // Phase 3: Client Configuration + if err := i.phase3ClientSetup(ctx); err != nil { + return fmt.Errorf("client setup failed: %w", err) + } + + // Phase 4: Node 
Join Preparation + if err := i.phase4NodeJoin(ctx); err != nil { + return fmt.Errorf("node join failed: %w", err) + } + + // Phase 5 (Verification) skipped - node needs bootstrap to become Ready + i.logger.Success("Private cluster setup completed. Bootstrap will continue...") + return nil +} + +// phase1EnvironmentCheck checks prerequisites +func (i *Installer) phase1EnvironmentCheck(ctx context.Context) error { + _ = CleanKubeCache() + if err := i.azure.CheckInstalled(); err != nil { + return err + } + if err := i.azure.CheckLogin(ctx); err != nil { + return err + } + i.logger.Success("Azure CLI ready") + + // Check/refresh token + if err := i.azure.CheckAndRefreshToken(ctx); err != nil { + return err + } + + if err := i.azure.SetSubscription(ctx, i.clusterInfo.SubscriptionID); err != nil { + return err + } + i.logger.Success("Subscription: %s", i.clusterInfo.SubscriptionID) + + // Get Tenant ID + tenantID, err := i.azure.GetTenantID(ctx) + if err != nil { + return err + } + i.clusterInfo.TenantID = tenantID + i.logger.Verbose("Tenant ID: %s", tenantID) + + if !i.azure.AKSClusterExists(ctx, i.clusterInfo.ResourceGroup, i.clusterInfo.ClusterName) { + return fmt.Errorf("AKS cluster '%s' not found", i.clusterInfo.ClusterName) + } + clusterInfo, err := i.azure.GetAKSClusterInfo(ctx, i.clusterInfo.ResourceGroup, i.clusterInfo.ClusterName) + if err != nil { + return err + } + i.clusterInfo.Location = clusterInfo.Location + i.clusterInfo.NodeResourceGroup = clusterInfo.NodeResourceGroup + i.clusterInfo.PrivateFQDN = clusterInfo.PrivateFQDN + i.logger.Success("AKS cluster: %s (AAD/RBAC enabled)", i.clusterInfo.ClusterName) + + vnetName, vnetRG, err := i.azure.GetVNetInfo(ctx, i.clusterInfo.NodeResourceGroup) + if err != nil { + return err + } + i.clusterInfo.VNetName = vnetName + i.clusterInfo.VNetResourceGroup = vnetRG + i.logger.Success("VNet: %s/%s", vnetRG, vnetName) + + if err := InstallVPNTools(ctx, i.logger); err != nil { + return fmt.Errorf("failed to install VPN tools: %w", err) + } + if err := InstallJQ(ctx, i.logger); err != nil { + return fmt.Errorf("failed to install jq: %w", err) + } + if !CommandExists("kubectl") || !CommandExists("kubelogin") { + if err := i.azure.InstallAKSCLI(ctx); err != nil { + return fmt.Errorf("failed to install kubectl/kubelogin: %w", err) + } + } + if !CommandExists("kubectl") { + return fmt.Errorf("kubectl installation failed") + } + if !CommandExists("kubelogin") { + return fmt.Errorf("kubelogin installation failed") + } + _ = i.azure.InstallConnectedMachineExtension(ctx) + i.logger.Success("Dependencies ready") + + return nil +} + +// phase2GatewaySetup sets up the VPN Gateway +func (i *Installer) phase2GatewaySetup(ctx context.Context) error { + gatewayExists := false + if i.azure.VMExists(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) { + gatewayExists = true + ip, err := i.azure.GetVMPublicIP(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) + if err != nil { + return fmt.Errorf("failed to get Gateway public IP: %w", err) + } + i.gatewayIP = ip + i.logger.Success("Gateway exists: %s", i.gatewayIP) + } else { + i.logger.Info("Creating Gateway...") + if err := i.createGatewayInfrastructure(ctx); err != nil { + return err + } + } + + if err := GenerateSSHKey(i.sshKeyPath); err != nil { + return fmt.Errorf("failed to generate SSH key: %w", err) + } + if err := i.azure.AddSSHKeyToVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name, i.sshKeyPath); err != nil { + return fmt.Errorf("failed to add SSH key to Gateway: %w", err) + } + 
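+ // Note: the SSH key pair is expected to be reused if one already exists, and
+ // 'az vm user update' adds the public key idempotently, so an existing Gateway
+ // (created by a previous run or by another node) is reused without recreating the VM.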
+ // Wait for VM ready + if err := i.waitForVMReady(ctx, gatewayExists); err != nil { + return err + } + + // Get/configure server + if err := i.configureVPNServer(ctx); err != nil { + return err + } + + return nil +} + +// createGatewayInfrastructure creates Gateway VM and related resources +func (i *Installer) createGatewayInfrastructure(ctx context.Context) error { + nsgName := i.options.Gateway.Name + "-nsg" + pipName := i.options.Gateway.Name + "-pip" + + if err := i.azure.CreateSubnet(ctx, i.clusterInfo.VNetResourceGroup, i.clusterInfo.VNetName, + i.options.Gateway.SubnetName, i.options.Gateway.SubnetPrefix); err != nil { + return fmt.Errorf("failed to create subnet: %w", err) + } + if err := i.azure.CreateNSG(ctx, i.clusterInfo.ResourceGroup, nsgName, i.options.Gateway.Port); err != nil { + return fmt.Errorf("failed to create NSG: %w", err) + } + if err := i.azure.CreatePublicIP(ctx, i.clusterInfo.ResourceGroup, pipName); err != nil { + return fmt.Errorf("failed to create public IP: %w", err) + } + if err := GenerateSSHKey(i.sshKeyPath); err != nil { + return fmt.Errorf("failed to generate SSH key: %w", err) + } + if err := i.azure.CreateVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name, + i.clusterInfo.VNetName, i.options.Gateway.SubnetName, nsgName, pipName, + i.sshKeyPath, i.options.Gateway.VMSize); err != nil { + return fmt.Errorf("failed to create Gateway VM: %w", err) + } + + ip, err := i.azure.GetPublicIPAddress(ctx, i.clusterInfo.ResourceGroup, pipName) + if err != nil { + return fmt.Errorf("failed to get public IP address: %w", err) + } + i.gatewayIP = ip + i.logger.Success("Gateway created: %s", i.gatewayIP) + + i.logger.Info("Waiting for VM to boot (120s)...") + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(120 * time.Second): + } + + return nil +} + +// waitForVMReady waits for SSH connectivity to Gateway +func (i *Installer) waitForVMReady(ctx context.Context, gatewayExists bool) error { + sshConfig := DefaultSSHConfig(i.sshKeyPath, i.gatewayIP) + ssh := NewSSHClient(sshConfig, i.logger) + + if ssh.TestConnection(ctx) { + i.logger.Success("SSH ready") + return nil + } + + if gatewayExists { + i.logger.Info("Restarting VM...") + _ = i.azure.RestartVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(120 * time.Second): + } + } + + if err := ssh.WaitForConnection(ctx, 18, 10*time.Second); err != nil { + return fmt.Errorf("VM SSH connection timeout") + } + i.logger.Success("SSH ready") + return nil +} + +// configureVPNServer configures VPN on the Gateway +func (i *Installer) configureVPNServer(ctx context.Context) error { + sshConfig := DefaultSSHConfig(i.sshKeyPath, i.gatewayIP) + ssh := NewSSHClient(sshConfig, i.logger) + vpnServer := NewVPNServerManager(ssh, i.logger) + + if !vpnServer.IsInstalled(ctx) { + i.logger.Info("Installing VPN on Gateway...") + if err := vpnServer.Install(ctx); err != nil { + return fmt.Errorf("failed to install VPN on Gateway: %w", err) + } + } + + serverPubKey, err := vpnServer.GetPublicKey(ctx) + if err != nil { + if err := vpnServer.Install(ctx); err != nil { + return err + } + serverPubKey, err = vpnServer.GetPublicKey(ctx) + if err != nil { + return err + } + } + i.vpnConfig.ServerPublicKey = serverPubKey + i.vpnConfig.ServerEndpoint = i.gatewayIP + + peerCount, _ := vpnServer.GetPeerCount(ctx) + i.vpnConfig.ClientVPNIP = fmt.Sprintf("172.16.0.%d", peerCount+2) + i.logger.Success("VPN server ready, client IP: %s", 
i.vpnConfig.ClientVPNIP) + + return nil +} + +// phase3ClientSetup configures the local VPN client +func (i *Installer) phase3ClientSetup(ctx context.Context) error { + vpnClient := NewVPNClient(i.vpnConfig, i.logger) + privateKey, publicKey, err := vpnClient.GenerateKeyPair(ctx) + if err != nil { + return err + } + if err := vpnClient.CreateClientConfig(privateKey, i.options.Gateway.Port); err != nil { + return err + } + + sshConfig := DefaultSSHConfig(i.sshKeyPath, i.gatewayIP) + ssh := NewSSHClient(sshConfig, i.logger) + vpnServer := NewVPNServerManager(ssh, i.logger) + if err := vpnServer.AddPeer(ctx, publicKey, i.vpnConfig.ClientVPNIP); err != nil { + return err + } + + if err := vpnClient.Start(ctx); err != nil { + return err + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(3 * time.Second): + } + + if !vpnClient.TestConnection(ctx) { + return fmt.Errorf("VPN connection failed") + } + i.logger.Success("VPN connected: %s", i.vpnConfig.GatewayVPNIP) + + return nil +} + +// phase4NodeJoin joins the node to the AKS cluster +func (i *Installer) phase4NodeJoin(ctx context.Context) error { + sshConfig := DefaultSSHConfig(i.sshKeyPath, i.gatewayIP) + ssh := NewSSHClient(sshConfig, i.logger) + vpnServer := NewVPNServerManager(ssh, i.logger) + + apiServerIP, err := vpnServer.ResolveDNS(ctx, i.clusterInfo.PrivateFQDN) + if err != nil { + return err + } + i.clusterInfo.APIServerIP = apiServerIP + if err := AddHostsEntry(apiServerIP, i.clusterInfo.PrivateFQDN); err != nil { + return fmt.Errorf("failed to add hosts entry: %w", err) + } + i.logger.Success("API Server: %s (%s)", i.clusterInfo.PrivateFQDN, apiServerIP) + + _, _ = RunCommand(ctx, "swapoff", "-a") + + if !CommandExists("azcmagent") { + if err := i.installArcAgent(ctx); err != nil { + return fmt.Errorf("failed to install Arc agent: %w", err) + } + } + + kubeconfigPath := "/root/.kube/config" + if err := i.azure.GetAKSCredentials(ctx, i.clusterInfo.ResourceGroup, i.clusterInfo.ClusterName, kubeconfigPath); err != nil { + return fmt.Errorf("failed to get AKS credentials: %w", err) + } + if _, err := RunCommand(ctx, "kubelogin", "convert-kubeconfig", "-l", "azurecli", "--kubeconfig", kubeconfigPath); err != nil { + return fmt.Errorf("failed to convert kubeconfig: %w", err) + } + i.logger.Success("Kubeconfig ready: %s", kubeconfigPath) + + return nil +} + +// installArcAgent installs Azure Arc agent +func (i *Installer) installArcAgent(ctx context.Context) error { + _, _ = RunCommand(ctx, "dpkg", "--purge", "azcmagent") + if _, err := RunCommand(ctx, "curl", "-L", "-o", "/tmp/install_linux_azcmagent.sh", + "https://gbl.his.arc.azure.com/azcmagent-linux"); err != nil { + return err + } + if _, err := RunCommand(ctx, "chmod", "+x", "/tmp/install_linux_azcmagent.sh"); err != nil { + return err + } + if _, err := RunCommand(ctx, "bash", "/tmp/install_linux_azcmagent.sh"); err != nil { + return err + } + _, _ = RunCommand(ctx, "rm", "-f", "/tmp/install_linux_azcmagent.sh") + return nil +} diff --git a/pkg/privatecluster/private-install.sh b/pkg/privatecluster/private-install.sh deleted file mode 100755 index ac0569e..0000000 --- a/pkg/privatecluster/private-install.sh +++ /dev/null @@ -1,796 +0,0 @@ -#!/bin/bash -# private-install.sh - Called by: aks-flex-node private-join -# Join local node to Private AKS Cluster via Gateway -# -# Usage: -# sudo ./aks-flex-node private-join --aks-resource-id "/subscriptions/.../managedClusters/xxx" - -set -euo pipefail - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' 
-YELLOW='\033[1;33m' -BLUE='\033[0;34m' -CYAN='\033[0;36m' -NC='\033[0m' # No Color - -# Configuration -GATEWAY_NAME="wg-gateway" -GATEWAY_SUBNET_NAME="wg-subnet" -GATEWAY_SUBNET_PREFIX="10.0.100.0/24" -GATEWAY_VPN_NETWORK="172.16.0.0/24" -GATEWAY_VPN_IP="172.16.0.1" -GATEWAY_VM_SIZE="Standard_D2s_v3" -GATEWAY_PORT="51820" -NETWORK_INTERFACE="wg-aks" -# Handle sudo: use original user's home directory -if [[ -n "${SUDO_USER:-}" ]]; then - REAL_HOME=$(getent passwd "$SUDO_USER" | cut -d: -f6) -else - REAL_HOME="$HOME" -fi -SSH_KEY_PATH="${REAL_HOME}/.ssh/id_rsa_wg_gateway" -VERBOSE=false - -# Cleanup function for Ctrl+C -cleanup_on_exit() { - echo "" - log_warning "Interrupted! Cleaning up..." - sudo pkill -f "aks-flex-node agent" 2>/dev/null || true - exit 1 -} - -# Trap Ctrl+C and other termination signals -trap cleanup_on_exit SIGINT SIGTERM - -# Functions -log_info() { - echo -e "${BLUE}INFO:${NC} $1" -} - -log_success() { - echo -e "${GREEN}SUCCESS:${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}WARNING:${NC} $1" -} - -log_error() { - echo -e "${RED}ERROR:${NC} $1" -} - -log_verbose() { - if [[ "$VERBOSE" == "true" ]]; then - echo -e "${BLUE}VERBOSE:${NC} $1" - fi -} - -parse_args() { - while [[ $# -gt 0 ]]; do - case $1 in - --aks-resource-id) - AKS_RESOURCE_ID="$2" - shift 2 - ;; - --gateway-name) - GATEWAY_NAME="$2" - shift 2 - ;; - --gateway-subnet) - GATEWAY_SUBNET_PREFIX="$2" - shift 2 - ;; - --gateway-vm-size) - GATEWAY_VM_SIZE="$2" - shift 2 - ;; - --verbose) - VERBOSE=true - shift - ;; - *) - log_error "Unknown argument: $1" - exit 1 - ;; - esac - done - - # Validate required arguments - if [[ -z "${AKS_RESOURCE_ID:-}" ]]; then - log_error "Missing required argument: --aks-resource-id" - exit 1 - fi - - # Parse AKS Resource ID - parse_aks_resource_id -} - -parse_aks_resource_id() { - # Format: /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.ContainerService/managedClusters/{name} - # Normalize: Azure CLI sometimes returns lowercase 'resourcegroups', but Go code expects 'resourceGroups' - AKS_RESOURCE_ID=$(echo "$AKS_RESOURCE_ID" | sed 's|/resourcegroups/|/resourceGroups/|g') - - SUBSCRIPTION_ID=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f3) - RESOURCE_GROUP=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f5) - AKS_CLUSTER_NAME=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f9) - - if [[ -z "$SUBSCRIPTION_ID" || -z "$RESOURCE_GROUP" || -z "$AKS_CLUSTER_NAME" ]]; then - log_error "Invalid AKS Resource ID format" - exit 1 - fi - - log_verbose "Subscription ID: $SUBSCRIPTION_ID" - log_verbose "Resource Group: $RESOURCE_GROUP" - log_verbose "AKS Cluster Name: $AKS_CLUSTER_NAME" -} - -# Phase 1: Environment Check -phase1_environment_check() { - # Clean up old kube cache to avoid stale tokens - log_info "Cleaning up old kube cache..." - rm -rf /root/.kube/cache 2>/dev/null || true - rm -rf "${REAL_HOME}/.kube/cache" 2>/dev/null || true - log_success "Kube cache cleaned" - - # Check Azure CLI is installed - log_info "Checking Azure CLI..." - if ! command -v az &>/dev/null; then - log_error "Azure CLI not installed. Please install: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash" - exit 1 - fi - log_success "Azure CLI installed" - - # Check Azure CLI login status - log_info "Checking Azure CLI login status..." - if ! az account show &>/dev/null; then - log_error "Azure CLI not logged in, please run 'az login' first" - exit 1 - fi - log_success "Azure CLI logged in" - - # Check if token is valid - if ! 
az account get-access-token --only-show-errors &>/dev/null; then - log_warning "Azure token expired or invalid, re-authenticating..." - az login - fi - - # Set subscription - log_info "Setting subscription: $SUBSCRIPTION_ID" - az account set --subscription "$SUBSCRIPTION_ID" - log_success "Subscription set successfully" - - # Get Tenant ID - TENANT_ID=$(az account show --query tenantId -o tsv) - log_verbose "Tenant ID: $TENANT_ID" - - # Verify AKS cluster exists - log_info "Verifying AKS cluster: $AKS_CLUSTER_NAME" - if ! az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" &>/dev/null; then - log_error "AKS cluster '$AKS_CLUSTER_NAME' not found" - exit 1 - fi - - # Check AAD and RBAC - log_info "Checking AKS cluster AAD and RBAC configuration..." - AAD_ENABLED=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ - --query "aadProfile.managed" -o tsv 2>/dev/null || echo "false") - RBAC_ENABLED=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ - --query "aadProfile.enableAzureRbac" -o tsv 2>/dev/null || echo "false") - - if [[ "$AAD_ENABLED" != "true" ]]; then - log_error "AKS cluster AAD not enabled, please enable: az aks update --enable-aad" - exit 1 - fi - if [[ "$RBAC_ENABLED" != "true" ]]; then - log_error "AKS cluster Azure RBAC not enabled, please enable: az aks update --enable-azure-rbac" - exit 1 - fi - log_success "AKS cluster AAD and RBAC enabled" - - - # Get AKS VNet info - log_info "Getting AKS VNet info..." - AKS_NODE_RG=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ - --query "nodeResourceGroup" -o tsv) - - # Get VNet info from VMSS - VMSS_NAME=$(az vmss list --resource-group "$AKS_NODE_RG" --query "[0].name" -o tsv) - if [[ -z "$VMSS_NAME" ]]; then - log_error "Cannot find AKS node VMSS" - exit 1 - fi - - VNET_SUBNET_ID=$(az vmss show --resource-group "$AKS_NODE_RG" --name "$VMSS_NAME" \ - --query "virtualMachineProfile.networkProfile.networkInterfaceConfigurations[0].ipConfigurations[0].subnet.id" -o tsv) - - VNET_NAME=$(echo "$VNET_SUBNET_ID" | cut -d'/' -f9) - VNET_RG=$(echo "$VNET_SUBNET_ID" | cut -d'/' -f5) - - log_success "VNet: $VNET_NAME (Resource Group: $VNET_RG)" - - # Get Location - LOCATION=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ - --query "location" -o tsv) - log_verbose "Location: $LOCATION" - - # Check local dependencies - log_info "Checking local dependencies..." - - if ! command -v wg &>/dev/null; then - log_info "Installing VPN tools..." - sudo apt-get update && sudo apt-get install -y wireguard-tools - fi - log_success "VPN tools installed" - - if ! command -v jq &>/dev/null; then - log_info "Installing jq..." - sudo apt-get install -y jq - fi - log_success "jq installed" - - # Install kubectl and kubelogin - if ! command -v kubectl &>/dev/null || ! command -v kubelogin &>/dev/null; then - log_info "Installing kubectl and kubelogin..." - az aks install-cli --install-location /usr/local/bin/kubectl --kubelogin-install-location /usr/local/bin/kubelogin - chmod +x /usr/local/bin/kubectl /usr/local/bin/kubelogin - fi - # Verify installation - if ! command -v kubectl &>/dev/null; then - log_error "kubectl installation failed" - exit 1 - fi - if ! command -v kubelogin &>/dev/null; then - log_error "kubelogin installation failed" - exit 1 - fi - log_success "kubectl and kubelogin installed" - - # Install Azure CLI connectedmachine extension - if ! 
az extension show --name connectedmachine &>/dev/null; then - log_info "Installing Azure CLI connectedmachine extension..." - az config set extension.dynamic_install_allow_preview=true --only-show-errors 2>/dev/null || true - az extension add --name connectedmachine --allow-preview true --only-show-errors - fi - log_success "Azure CLI extensions ready" -} - -# Phase 2: Gateway Setup -phase2_gateway_setup() { - # Check if Gateway exists - log_info "Checking if Gateway exists..." - if az vm show --resource-group "$RESOURCE_GROUP" --name "$GATEWAY_NAME" &>/dev/null; then - log_info "Gateway exists, reusing" - GATEWAY_EXISTS=true - - # Get Public IP - WG_PUBLIC_IP=$(az vm list-ip-addresses --resource-group "$RESOURCE_GROUP" --name "$GATEWAY_NAME" \ - --query "[0].virtualMachine.network.publicIpAddresses[0].ipAddress" -o tsv) - log_success "Gateway Public IP: $WG_PUBLIC_IP" - else - log_info "Gateway not found, creating new one" - GATEWAY_EXISTS=false - create_gateway_infrastructure - fi - - # Ensure SSH key exists - ensure_ssh_key - - # Add SSH key to Gateway (idempotent, works for both new and existing Gateway) - log_info "Adding SSH key to Gateway..." - az vm user update \ - --resource-group "$RESOURCE_GROUP" \ - --name "$GATEWAY_NAME" \ - --username azureuser \ - --ssh-key-value "$(cat ${SSH_KEY_PATH}.pub)" \ - --output none - log_success "SSH key added to Gateway" - - # Wait for VM ready and get server info - wait_for_vm_ready - get_server_info -} - -create_gateway_infrastructure() { - # Create Gateway Subnet - log_info "Checking/creating Gateway subnet..." - if ! az network vnet subnet show --resource-group "$VNET_RG" --vnet-name "$VNET_NAME" \ - --name "$GATEWAY_SUBNET_NAME" &>/dev/null; then - az network vnet subnet create \ - --resource-group "$VNET_RG" \ - --vnet-name "$VNET_NAME" \ - --name "$GATEWAY_SUBNET_NAME" \ - --address-prefixes "$GATEWAY_SUBNET_PREFIX" - log_success "Subnet $GATEWAY_SUBNET_NAME created" - else - log_info "Subnet $GATEWAY_SUBNET_NAME already exists" - fi - - # Create NSG - log_info "Checking/creating NSG..." - NSG_NAME="${GATEWAY_NAME}-nsg" - if ! az network nsg show --resource-group "$RESOURCE_GROUP" --name "$NSG_NAME" &>/dev/null; then - az network nsg create --resource-group "$RESOURCE_GROUP" --name "$NSG_NAME" - - # Add SSH rule (priority 100 to override NRMS-Rule-106 which denies SSH from Internet at priority 106) - az network nsg rule create \ - --resource-group "$RESOURCE_GROUP" \ - --nsg-name "$NSG_NAME" \ - --name allow-ssh \ - --priority 100 \ - --destination-port-ranges 22 \ - --protocol Tcp \ - --access Allow - - # Add VPN rule - az network nsg rule create \ - --resource-group "$RESOURCE_GROUP" \ - --nsg-name "$NSG_NAME" \ - --name allow-wireguard \ - --priority 200 \ - --destination-port-ranges "$GATEWAY_PORT" \ - --protocol Udp \ - --access Allow - - log_success "NSG $NSG_NAME created" - else - log_info "NSG $NSG_NAME already exists" - fi - - # Create Public IP - log_info "Checking/creating Public IP..." - PIP_NAME="${GATEWAY_NAME}-pip" - if ! az network public-ip show --resource-group "$RESOURCE_GROUP" --name "$PIP_NAME" &>/dev/null; then - az network public-ip create \ - --resource-group "$RESOURCE_GROUP" \ - --name "$PIP_NAME" \ - --sku Standard \ - --allocation-method Static - log_success "Public IP $PIP_NAME created" - else - log_info "Public IP $PIP_NAME already exists" - fi - - # Generate SSH key - ensure_ssh_key - - # Create VM - log_info "Creating Gateway..." 
- az vm create \ - --resource-group "$RESOURCE_GROUP" \ - --name "$GATEWAY_NAME" \ - --image Ubuntu2204 \ - --size "$GATEWAY_VM_SIZE" \ - --vnet-name "$VNET_NAME" \ - --subnet "$GATEWAY_SUBNET_NAME" \ - --nsg "$NSG_NAME" \ - --public-ip-address "$PIP_NAME" \ - --admin-username azureuser \ - --ssh-key-values "${SSH_KEY_PATH}.pub" \ - --zone 1 - - # Get Public IP - WG_PUBLIC_IP=$(az network public-ip show --resource-group "$RESOURCE_GROUP" --name "$PIP_NAME" \ - --query ipAddress -o tsv) - log_success "Gateway created, Public IP: $WG_PUBLIC_IP" - - # Wait for new VM to boot up - log_info "Waiting 120 seconds for VM to boot up..." - sleep 120 -} - -ensure_ssh_key() { - if [[ ! -f "$SSH_KEY_PATH" ]]; then - log_info "Generating SSH key..." - ssh-keygen -t rsa -b 4096 -f "$SSH_KEY_PATH" -N "" - # Fix ownership if running with sudo (so user can SSH without sudo) - if [[ -n "${SUDO_USER:-}" ]]; then - chown "$SUDO_USER:$SUDO_USER" "$SSH_KEY_PATH" "${SSH_KEY_PATH}.pub" - fi - log_success "SSH key generated: $SSH_KEY_PATH" - else - log_info "SSH key already exists: $SSH_KEY_PATH" - fi -} - -wait_for_vm_ready() { - log_info "Checking VM SSH connectivity..." - - # First quick check - if ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10 -i "$SSH_KEY_PATH" \ - azureuser@"$WG_PUBLIC_IP" "echo ready" &>/dev/null; then - log_success "VM SSH connection ready" - return 0 - fi - - # SSH failed, restart VM if it's an existing VM - if [[ "$GATEWAY_EXISTS" == "true" ]]; then - log_warning "SSH connection failed, restarting VM..." - az vm restart --resource-group "$RESOURCE_GROUP" --name "$GATEWAY_NAME" --no-wait - log_info "Waiting 120 seconds for VM to restart..." - sleep 120 - fi - - # Wait for SSH with retries - log_info "Waiting for VM to be ready..." - local max_attempts=18 - local attempt=0 - - while [[ $attempt -lt $max_attempts ]]; do - if ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5 -i "$SSH_KEY_PATH" \ - azureuser@"$WG_PUBLIC_IP" "echo ready" &>/dev/null; then - log_success "VM SSH connection ready" - return 0 - fi - attempt=$((attempt + 1)) - log_verbose "Waiting for SSH... ($attempt/$max_attempts)" - sleep 10 - done - - log_error "VM SSH connection timeout" - exit 1 -} - -get_server_info() { - log_info "Getting/configuring Gateway server..." - - # Check if networking is already installed - if ! ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" "command -v wg" &>/dev/null; then - log_info "Installing and configuring networking on Gateway..." - install_wireguard_server - else - log_info "Networking already installed" - fi - - # Get server public key - SERVER_PUBLIC_KEY=$(ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" \ - "sudo cat /etc/wireguard/server_public.key 2>/dev/null || echo ''") - - if [[ -z "$SERVER_PUBLIC_KEY" ]]; then - log_info "Server key not found, reconfiguring..." 
- install_wireguard_server - SERVER_PUBLIC_KEY=$(ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" \ - "sudo cat /etc/wireguard/server_public.key") - fi - - log_success "Server public key retrieved" - - # Get existing peer count - EXISTING_PEERS=$(ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" \ - "sudo wg show wg0 peers 2>/dev/null | wc -l || echo 0") - log_verbose "Existing peer count: $EXISTING_PEERS" - - # Calculate client IP - CLIENT_IP_SUFFIX=$((EXISTING_PEERS + 2)) - CLIENT_VPN_IP="172.16.0.${CLIENT_IP_SUFFIX}" - log_success "Assigned client VPN IP: $CLIENT_VPN_IP" -} - -install_wireguard_server() { - ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" << 'REMOTE_SCRIPT' -set -e - -# Install networking -sudo apt-get update -sudo apt-get install -y wireguard - -# Generate key pair -sudo wg genkey | sudo tee /etc/wireguard/server_private.key | sudo wg pubkey | sudo tee /etc/wireguard/server_public.key -sudo chmod 600 /etc/wireguard/server_private.key - -SERVER_PRIVATE_KEY=$(sudo cat /etc/wireguard/server_private.key) - -# Create configuration -sudo tee /etc/wireguard/wg0.conf << EOF -[Interface] -PrivateKey = ${SERVER_PRIVATE_KEY} -Address = 172.16.0.1/24 -ListenPort = 51820 -PostUp = iptables -A FORWARD -i wg0 -j ACCEPT; iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE -PostDown = iptables -D FORWARD -i wg0 -j ACCEPT; iptables -t nat -D POSTROUTING -o eth0 -j MASQUERADE -EOF - -# Enable IP forwarding -echo 'net.ipv4.ip_forward=1' | sudo tee -a /etc/sysctl.conf -sudo sysctl -p - -# Start networking -sudo systemctl enable wg-quick@wg0 -sudo systemctl start wg-quick@wg0 || sudo systemctl restart wg-quick@wg0 - -echo "Gateway server configuration complete" -REMOTE_SCRIPT -} - -# Phase 3: Client Configuration -phase3_client_setup() { - # Generate client key pair - log_info "Generating client key pair..." - CLIENT_PRIVATE_KEY=$(wg genkey) - CLIENT_PUBLIC_KEY=$(echo "$CLIENT_PRIVATE_KEY" | wg pubkey) - log_success "Client key pair generated" - - # Create Gateway client configuration - log_info "Creating Gateway client configuration..." - sudo tee /etc/wireguard/${NETWORK_INTERFACE}.conf > /dev/null << EOF -[Interface] -PrivateKey = ${CLIENT_PRIVATE_KEY} -Address = ${CLIENT_VPN_IP}/24 - -[Peer] -PublicKey = ${SERVER_PUBLIC_KEY} -Endpoint = ${WG_PUBLIC_IP}:${GATEWAY_PORT} -AllowedIPs = 10.0.0.0/8, 172.16.0.0/24 -PersistentKeepalive = 25 -EOF - sudo chmod 600 /etc/wireguard/${NETWORK_INTERFACE}.conf - log_success "Client configuration created" - - # Add client peer to server - log_info "Adding client peer to server..." - ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" \ - "sudo wg set wg0 peer '${CLIENT_PUBLIC_KEY}' allowed-ips ${CLIENT_VPN_IP}/32" - - # Persist configuration - ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" "sudo wg-quick save wg0" - log_success "Client peer added" - - # Start networking connection - log_info "Starting networking connection..." 
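    # Optional sanity-check sketch: once the interface is brought up below, the
    # tunnel can be inspected with, for example:
    #   sudo wg show "$NETWORK_INTERFACE"                     # peer list and transfer counters
    #   sudo wg show "$NETWORK_INTERFACE" latest-handshakes   # non-zero timestamp => handshake completed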
- sudo wg-quick down "$NETWORK_INTERFACE" 2>/dev/null || true - sudo wg-quick up "$NETWORK_INTERFACE" - - # Verify connection - sleep 3 - if ping -c 1 -W 3 "$GATEWAY_VPN_IP" &>/dev/null; then - log_success "Networking connected, can ping Gateway ($GATEWAY_VPN_IP)" - else - log_error "Networking connection failed, cannot ping Gateway" - exit 1 - fi -} - -# Phase 4: Node Join -phase4_node_join() { - # Get API Server private FQDN - log_info "Getting AKS API Server address..." - API_SERVER_FQDN=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ - --query "privateFqdn" -o tsv) - log_verbose "API Server FQDN: $API_SERVER_FQDN" - - # Resolve private DNS through Gateway - log_info "Resolving API Server private IP..." - API_SERVER_IP=$(ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -i "$SSH_KEY_PATH" azureuser@"$WG_PUBLIC_IP" \ - "nslookup $API_SERVER_FQDN | grep -A1 'Name:' | grep 'Address:' | awk '{print \$2}'" 2>/dev/null || echo "") - - if [[ -z "$API_SERVER_IP" ]]; then - log_error "Cannot resolve API Server private IP" - exit 1 - fi - log_success "API Server IP: $API_SERVER_IP" - - # Add hosts entry - log_info "Adding hosts entry..." - if ! grep -q "$API_SERVER_FQDN" /etc/hosts; then - echo "$API_SERVER_IP $API_SERVER_FQDN" | sudo tee -a /etc/hosts - log_success "Hosts entry added" - else - log_info "Hosts entry already exists" - fi - - # Disable swap - log_info "Disabling swap..." - sudo swapoff -a - log_success "Swap disabled" - - # Install Azure Arc agent (required for aks-flex-node) - log_info "Checking Azure Arc agent..." - if ! command -v azcmagent &>/dev/null; then - log_info "Installing Azure Arc agent..." - # Clean up any existing package state to avoid conflicts - sudo dpkg --purge azcmagent 2>/dev/null || true - - local temp_dir - temp_dir=$(mktemp -d) - - curl -L -o "$temp_dir/install_linux_azcmagent.sh" https://gbl.his.arc.azure.com/azcmagent-linux - chmod +x "$temp_dir/install_linux_azcmagent.sh" - sudo bash "$temp_dir/install_linux_azcmagent.sh" - rm -rf "$temp_dir" - - log_success "Azure Arc agent installed" - else - log_info "Azure Arc agent already installed" - fi - - # Get AKS credentials (save to root's kubeconfig for consistency with sudo az login) - log_info "Getting AKS credentials..." - mkdir -p /root/.kube - az aks get-credentials --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ - --overwrite-existing --file /root/.kube/config - - # Convert kubeconfig to use Azure CLI auth (for AAD + Azure RBAC) - log_info "Converting kubeconfig for Azure CLI auth..." - kubelogin convert-kubeconfig -l azurecli --kubeconfig /root/.kube/config - log_success "Kubeconfig ready (saved to /root/.kube/config)" - - # Generate config.json - log_info "Generating aks-flex-node configuration..." - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - PROJECT_ROOT="${SCRIPT_DIR}/../.." 
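    # Optional connectivity sketch (not part of the original script): the hosts
    # entry and VPN route added above can be verified before generating the config:
    #   getent hosts "$API_SERVER_FQDN"    # should print the private IP added to /etc/hosts
    #   curl -sk --max-time 5 "https://${API_SERVER_FQDN}/version" >/dev/null && echo "API server reachable"
    # An unauthenticated request is expected to be rejected, but a completed TLS
    # handshake confirms DNS and tunnel connectivity.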
- CONFIG_FILE="${PROJECT_ROOT}/config.json" - - cat > "$CONFIG_FILE" << EOF -{ - "azure": { - "subscriptionId": "${SUBSCRIPTION_ID}", - "tenantId": "${TENANT_ID}", - "targetCluster": { - "resourceId": "${AKS_RESOURCE_ID}", - "location": "${LOCATION}" - }, - "arc": { - "resourceGroup": "${RESOURCE_GROUP}", - "location": "${LOCATION}" - } - }, - "network": { - "mode": "wireguard", - "wireguard": { - "serverEndpoint": "${WG_PUBLIC_IP}:${GATEWAY_PORT}", - "serverPublicKey": "${SERVER_PUBLIC_KEY}", - "clientAddress": "${CLIENT_VPN_IP}/24", - "allowedIPs": ["10.0.0.0/8", "172.16.0.0/24"], - "persistentKeepalive": 25, - "testEndpoint": "${API_SERVER_IP}:443" - } - }, - "kubernetes": { - "version": "1.29.0" - }, - "containerd": { - "version": "1.7.11", - "pauseImage": "mcr.microsoft.com/oss/kubernetes/pause:3.6" - }, - "agent": { - "logLevel": "info", - "logDir": "/var/log/aks-flex-node" - } -} -EOF - log_success "Config file generated: $CONFIG_FILE" - - # Run aks-flex-node - log_info "Running aks-flex-node agent..." - cd "${PROJECT_ROOT}" - - # Build if needed - if [[ ! -f "./aks-flex-node" ]]; then - log_info "Building aks-flex-node..." - go build -o aks-flex-node . - fi - - # Kill any existing aks-flex-node agent process - log_info "Stopping any existing aks-flex-node agent..." - sudo pkill -f "aks-flex-node agent" 2>/dev/null || true - sleep 2 - - # Create log directory - sudo mkdir -p /var/log/aks-flex-node - - # Run agent in background - LOG_FILE="/var/log/aks-flex-node/agent.log" - sudo bash -c "./aks-flex-node agent --config '$CONFIG_FILE' > '$LOG_FILE' 2>&1" & - AGENT_PID=$! - log_info "Agent started in background (PID: $AGENT_PID)" - # Wait for bootstrap to complete (check log file, minimal output) - log_info "Waiting for bootstrap to complete (may take 2-3 minutes)..." - log_info "View details: sudo tail -f $LOG_FILE" - - local max_wait=300 - local waited=0 - local bootstrap_success=false - local bootstrap_failed=false - - # Simple progress indicator - printf " " - while [[ $waited -lt $max_wait ]]; do - # Check success/failure - if sudo grep -q "bootstrap completed successfully" "$LOG_FILE" /dev/null; then - bootstrap_success=true - break - fi - if sudo grep -q "Bootstrap failed\|bootstrap failed" "$LOG_FILE" /dev/null; then - bootstrap_failed=true - break - fi - printf "." - sleep 5 - waited=$((waited + 5)) - done - echo "" - - if [[ "$bootstrap_failed" == "true" ]]; then - log_error "Bootstrap failed. Check: sudo tail -50 $LOG_FILE" - exit 1 - fi - - if [[ "$bootstrap_success" == "true" ]]; then - log_success "Bootstrap completed" - else - log_error "Timeout. Check: sudo tail -50 $LOG_FILE" - exit 1 - fi - - # Wait for RBAC propagation (simple dots) - printf " " - for i in {1..3}; do - printf "." - sleep 5 - done - echo "" - log_success "Node join completed" -} - -# Phase 5: Verification -phase5_verification() { - NODE_NAME=$(hostname | tr '[:upper:]' '[:lower:]') - - # Check node status (simple dots) - log_info "Waiting for node ready..." - printf " " - local max_attempts=30 - local attempt=0 - - while [[ $attempt -lt $max_attempts ]]; do - NODE_STATUS=$(kubectl --kubeconfig /root/.kube/config get node "$NODE_NAME" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "") - if [[ "$NODE_STATUS" == "True" ]]; then - break - fi - attempt=$((attempt + 1)) - printf "." 
- sleep 5 - done - echo "" - - if [[ "$NODE_STATUS" != "True" ]]; then - log_error "Node not ready, timeout" - exit 1 - fi - - log_success "Node $NODE_NAME is Ready" - echo "" - printf "${GREEN}========================================${NC}\n" - printf "${GREEN} Success! Edge Node joined Private AKS Cluster${NC}\n" - printf "${GREEN}========================================${NC}\n" - printf "\n" - printf "Node info:\n" - printf " - Node name: %s\n" "$NODE_NAME" - printf " - VPN IP: %s\n" "$CLIENT_VPN_IP" - printf " - AKS cluster: %s\n" "$AKS_CLUSTER_NAME" - printf "\n" - printf "Cluster nodes:\n" - kubectl --kubeconfig /root/.kube/config get nodes -o wide 2>&1 - printf "\n" - printf "${YELLOW}Tips:${NC}\n" - printf " - Please try: sudo kubectl get nodes\n" - printf "\n" -} - -main() { - echo -e "${GREEN}========================================${NC}" - echo -e "${GREEN} Add Edge Node to Private AKS Cluster${NC}" - echo -e "${GREEN}========================================${NC}" - echo "" - - parse_args "$@" - phase1_environment_check - phase2_gateway_setup - phase3_client_setup - phase4_node_join - phase5_verification -} - -# Run main -main "$@" diff --git a/pkg/privatecluster/private-uninstall.sh b/pkg/privatecluster/private-uninstall.sh deleted file mode 100755 index 25bbcf2..0000000 --- a/pkg/privatecluster/private-uninstall.sh +++ /dev/null @@ -1,421 +0,0 @@ -#!/bin/bash -# private-uninstall.sh - Called by: aks-flex-node private-leave -# Cleanup Private AKS Cluster Edge Node configuration -# -# Usage: -# sudo ./aks-flex-node private-leave --mode=local # Keep Gateway -# sudo ./aks-flex-node private-leave --mode=full --aks-resource-id "..." # Full cleanup - -set -euo pipefail - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# Configuration -GATEWAY_NAME="wg-gateway" -GATEWAY_SUBNET_NAME="wg-subnet" -NETWORK_INTERFACE="wg-aks" -CLEANUP_MODE="" -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="${SCRIPT_DIR}/../.." 
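# NOTE (added for clarity): these names are expected to match what the install
# flow created ("wg-gateway", "wg-subnet", "wg-aks"); cleanup locates the Azure
# resources and the local WireGuard config purely by these literals.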
- -# Handle sudo: use original user's home directory for SSH keys -if [[ -n "${SUDO_USER:-}" ]]; then - REAL_HOME=$(getent passwd "$SUDO_USER" | cut -d: -f6) -else - REAL_HOME="$HOME" -fi -SSH_KEY_PATH="${REAL_HOME}/.ssh/id_rsa_wg_gateway" - -# Functions -log_info() { - echo -e "${BLUE}INFO:${NC} $1" -} - -log_success() { - echo -e "${GREEN}SUCCESS:${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}WARNING:${NC} $1" -} - -log_error() { - echo -e "${RED}ERROR:${NC} $1" -} - -parse_args() { - while [[ $# -gt 0 ]]; do - case $1 in - --local) - CLEANUP_MODE="local" - shift - ;; - --full) - CLEANUP_MODE="full" - shift - ;; - --aks-resource-id) - AKS_RESOURCE_ID="$2" - shift 2 - ;; - *) - log_error "Unknown argument: $1" - exit 1 - ;; - esac - done - - if [[ -z "$CLEANUP_MODE" ]]; then - log_error "Please specify cleanup mode: --local or --full" - exit 1 - fi - - if [[ "$CLEANUP_MODE" == "full" && -z "${AKS_RESOURCE_ID:-}" ]]; then - log_error "--full mode requires --aks-resource-id" - exit 1 - fi - - if [[ -n "${AKS_RESOURCE_ID:-}" ]]; then - # Remove possible quotes and whitespace - AKS_RESOURCE_ID=$(echo "$AKS_RESOURCE_ID" | tr -d '"' | tr -d "'" | xargs) - SUBSCRIPTION_ID=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f3) - RESOURCE_GROUP=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f5) - AKS_CLUSTER_NAME=$(echo "$AKS_RESOURCE_ID" | cut -d'/' -f9) - - log_info "Parsed subscription ID: $SUBSCRIPTION_ID" - log_info "Parsed resource group: $RESOURCE_GROUP" - log_info "Parsed cluster name: $AKS_CLUSTER_NAME" - fi -} - -cleanup_local() { - log_info "Performing local cleanup (keeping Gateway)..." - - NODE_NAME=$(hostname | tr '[:upper:]' '[:lower:]') - - # Get Gateway IP (before stopping networking) - GATEWAY_PUBLIC_IP="" - CLIENT_PRIVATE_KEY="" - if [[ -f "/etc/wireguard/${NETWORK_INTERFACE}.conf" ]]; then - GATEWAY_PUBLIC_IP=$(sudo cat /etc/wireguard/${NETWORK_INTERFACE}.conf 2>/dev/null | grep "Endpoint" | cut -d'=' -f2 | cut -d':' -f1 | tr -d ' ' || echo "") - CLIENT_PRIVATE_KEY=$(sudo cat /etc/wireguard/${NETWORK_INTERFACE}.conf 2>/dev/null | grep "PrivateKey" | cut -d'=' -f2 | tr -d ' ' || echo "") - fi - - # Remove node from cluster (while networking is still connected) - if command -v kubectl &>/dev/null; then - log_info "Removing node $NODE_NAME from cluster..." - # Try root kubeconfig first, then user's kubeconfig - if kubectl --kubeconfig /root/.kube/config delete node "$NODE_NAME" --ignore-not-found 2>&1; then - log_success "Node removed from cluster" - elif kubectl delete node "$NODE_NAME" --ignore-not-found 2>&1; then - log_success "Node removed from cluster" - else - log_warning "Failed to remove node from cluster (may need manual cleanup: kubectl delete node $NODE_NAME)" - fi - fi - - # Stop any running aks-flex-node agent process - log_info "Stopping aks-flex-node agent..." - sudo pkill -f "aks-flex-node agent" 2>/dev/null || true - sleep 2 - - # Run aks-flex-node unbootstrap - log_info "Running aks-flex-node unbootstrap..." - CONFIG_FILE="${PROJECT_ROOT}/config.json" - AKS_FLEX_NODE="${PROJECT_ROOT}/aks-flex-node" - - if [[ -f "$AKS_FLEX_NODE" && -f "$CONFIG_FILE" ]]; then - sudo "$AKS_FLEX_NODE" unbootstrap --config "$CONFIG_FILE" || true - log_success "aks-flex-node unbootstrap completed" - else - log_warning "aks-flex-node or config.json not found, manually stopping services..." 
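        # NOTE (descriptive, added): this fallback only stops kubelet/containerd;
        # it does not attempt the deeper cleanup that 'aks-flex-node unbootstrap'
        # performs.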
- sudo systemctl stop kubelet 2>/dev/null || true - sudo systemctl disable kubelet 2>/dev/null || true - sudo systemctl stop containerd 2>/dev/null || true - fi - - # Remove Arc Agent and Azure resource - log_info "Removing Arc Agent..." - if command -v azcmagent &>/dev/null; then - # First, delete Azure resource (requires az login) - log_info "Deleting Arc machine from Azure..." - ARC_RG=$(sudo azcmagent show 2>/dev/null | grep "Resource Group" | awk -F: '{print $2}' | xargs || echo "") - if [[ -n "$ARC_RG" ]]; then - az connectedmachine delete --resource-group "$ARC_RG" --name "$NODE_NAME" --yes 2>/dev/null || true - log_success "Arc machine deleted from Azure" - fi - # Then disconnect locally - sudo azcmagent disconnect --force-local-only 2>/dev/null || true - sudo systemctl stop himdsd extd gcad arcproxyd 2>/dev/null || true - sudo systemctl disable himdsd extd gcad arcproxyd 2>/dev/null || true - if command -v apt &>/dev/null; then - sudo apt remove azcmagent -y 2>/dev/null || true - elif command -v yum &>/dev/null; then - sudo yum remove azcmagent -y 2>/dev/null || true - fi - sudo rm -rf /var/opt/azcmagent /opt/azcmagent 2>/dev/null || true - log_success "Arc Agent removed" - else - log_info "Arc Agent not found, skipping" - fi - - # Remove client peer from Gateway - if [[ -n "$GATEWAY_PUBLIC_IP" && -n "$CLIENT_PRIVATE_KEY" && -f "$SSH_KEY_PATH" ]]; then - log_info "Removing client peer from Gateway..." - CLIENT_PUBLIC_KEY=$(echo "$CLIENT_PRIVATE_KEY" | wg pubkey 2>/dev/null || echo "") - if [[ -n "$CLIENT_PUBLIC_KEY" ]]; then - ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10 -i "$SSH_KEY_PATH" \ - azureuser@"$GATEWAY_PUBLIC_IP" \ - "sudo wg set wg0 peer '$CLIENT_PUBLIC_KEY' remove && sudo wg-quick save wg0" 2>/dev/null || true - log_success "Client peer removed from Gateway" - fi - fi - - # Stop networking - log_info "Stopping VPN connection..." - sudo wg-quick down "$NETWORK_INTERFACE" 2>/dev/null || true - log_success "VPN connection stopped" - - # Delete Gateway client configuration - log_info "Deleting VPN client configuration..." - sudo rm -f /etc/wireguard/${NETWORK_INTERFACE}.conf - log_success "VPN client configuration deleted" - - # Clean up hosts entries - log_info "Cleaning up hosts entries..." - sudo sed -i '/privatelink.*azmk8s.io/d' /etc/hosts - log_success "Hosts entries cleaned up" - - # Delete config.json - log_info "Deleting config file..." - rm -f "$CONFIG_FILE" - - echo "" - log_success "Local cleanup completed!" - echo "" - echo "To rejoin cluster, run:" - echo " sudo ./aks-flex-node private-join --aks-resource-id \"...\"" -} - -cleanup_full() { - log_info "Performing full cleanup..." - - NODE_NAME=$(hostname | tr '[:upper:]' '[:lower:]') - - # Get Gateway IP (before stopping networking) - GATEWAY_PUBLIC_IP="" - CLIENT_PRIVATE_KEY="" - if [[ -f "/etc/wireguard/${NETWORK_INTERFACE}.conf" ]]; then - GATEWAY_PUBLIC_IP=$(sudo cat /etc/wireguard/${NETWORK_INTERFACE}.conf 2>/dev/null | grep "Endpoint" | cut -d'=' -f2 | cut -d':' -f1 | tr -d ' ' || echo "") - CLIENT_PRIVATE_KEY=$(sudo cat /etc/wireguard/${NETWORK_INTERFACE}.conf 2>/dev/null | grep "PrivateKey" | cut -d'=' -f2 | tr -d ' ' || echo "") - fi - - # Remove node from cluster (while networking is still connected) - if command -v kubectl &>/dev/null; then - log_info "Removing node $NODE_NAME from cluster..." 
- # Try root kubeconfig first, then user's kubeconfig - if kubectl --kubeconfig /root/.kube/config delete node "$NODE_NAME" --ignore-not-found 2>&1; then - log_success "Node removed from cluster" - elif kubectl delete node "$NODE_NAME" --ignore-not-found 2>&1; then - log_success "Node removed from cluster" - else - log_warning "Failed to remove node from cluster (may need manual cleanup: kubectl delete node $NODE_NAME)" - fi - fi - - # Stop any running aks-flex-node agent process - log_info "Stopping aks-flex-node agent..." - sudo pkill -f "aks-flex-node agent" 2>/dev/null || true - sleep 2 - - # Run aks-flex-node unbootstrap - log_info "Running aks-flex-node unbootstrap..." - CONFIG_FILE="${PROJECT_ROOT}/config.json" - AKS_FLEX_NODE="${PROJECT_ROOT}/aks-flex-node" - - if [[ -f "$AKS_FLEX_NODE" && -f "$CONFIG_FILE" ]]; then - sudo "$AKS_FLEX_NODE" unbootstrap --config "$CONFIG_FILE" || true - log_success "aks-flex-node unbootstrap completed" - else - log_warning "aks-flex-node or config.json not found, skipping unbootstrap" - # Manually stop services - log_info "Manually stopping services..." - sudo systemctl stop kubelet 2>/dev/null || true - sudo systemctl disable kubelet 2>/dev/null || true - sudo systemctl stop containerd 2>/dev/null || true - fi - - # Remove Arc Agent and Azure resource - log_info "Removing Arc Agent..." - if command -v azcmagent &>/dev/null; then - # First, delete Azure resource (requires az login) - log_info "Deleting Arc machine from Azure..." - ARC_RG=$(sudo azcmagent show 2>/dev/null | grep "Resource Group" | awk -F: '{print $2}' | xargs || echo "") - if [[ -n "$ARC_RG" ]]; then - az connectedmachine delete --resource-group "$ARC_RG" --name "$NODE_NAME" --yes 2>/dev/null || true - log_success "Arc machine deleted from Azure" - else - # Fallback: try using the resource group from args - az connectedmachine delete --resource-group "$RESOURCE_GROUP" --name "$NODE_NAME" --yes 2>/dev/null || true - fi - # Then disconnect locally - sudo azcmagent disconnect --force-local-only 2>/dev/null || true - sudo systemctl stop himdsd extd gcad arcproxyd 2>/dev/null || true - sudo systemctl disable himdsd extd gcad arcproxyd 2>/dev/null || true - # Remove Arc Agent package - if command -v apt &>/dev/null; then - sudo apt remove azcmagent -y 2>/dev/null || true - elif command -v yum &>/dev/null; then - sudo yum remove azcmagent -y 2>/dev/null || true - fi - # Clean up Arc Agent files - sudo rm -rf /var/opt/azcmagent /opt/azcmagent 2>/dev/null || true - log_success "Arc Agent removed" - else - log_info "Arc Agent not found, skipping" - fi - - # Remove client peer from Gateway - if [[ -n "$GATEWAY_PUBLIC_IP" && -n "$CLIENT_PRIVATE_KEY" && -f "$SSH_KEY_PATH" ]]; then - log_info "Removing client peer from Gateway..." - CLIENT_PUBLIC_KEY=$(echo "$CLIENT_PRIVATE_KEY" | wg pubkey 2>/dev/null || echo "") - if [[ -n "$CLIENT_PUBLIC_KEY" ]]; then - ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10 -i "$SSH_KEY_PATH" \ - azureuser@"$GATEWAY_PUBLIC_IP" \ - "sudo wg set wg0 peer '$CLIENT_PUBLIC_KEY' remove && sudo wg-quick save wg0" 2>/dev/null || true - log_success "Client peer removed from Gateway" - fi - fi - - # Stop networking - log_info "Stopping networking..." - sudo wg-quick down "$NETWORK_INTERFACE" 2>/dev/null || true - log_success "Networking stopped" - - # Delete Gateway client configuration - log_info "Deleting Gateway client configuration..." 
- sudo rm -f /etc/wireguard/${NETWORK_INTERFACE}.conf - log_success "Gateway client configuration deleted" - - # Clean up hosts entries - log_info "Cleaning up hosts entries..." - sudo sed -i '/privatelink.*azmk8s.io/d' /etc/hosts - log_success "Hosts entries cleaned up" - - # Delete Azure resources - log_info "Deleting Azure resources..." - az account set --subscription "$SUBSCRIPTION_ID" - - # Delete Gateway (must complete before deleting NIC) - log_info "Deleting Gateway..." - if az vm show --resource-group "$RESOURCE_GROUP" --name "$GATEWAY_NAME" &>/dev/null; then - az vm delete --resource-group "$RESOURCE_GROUP" --name "$GATEWAY_NAME" --yes --only-show-errors - log_success "Gateway deleted" - else - log_info "Gateway not found, skipping" - fi - - # Delete NIC - NIC_NAME="${GATEWAY_NAME}VMNic" - log_info "Deleting NIC..." - if az network nic show --resource-group "$RESOURCE_GROUP" --name "$NIC_NAME" &>/dev/null; then - az network nic delete --resource-group "$RESOURCE_GROUP" --name "$NIC_NAME" --only-show-errors - log_success "NIC deleted" - else - log_info "NIC not found, skipping" - fi - - # Delete Public IP - PIP_NAME="${GATEWAY_NAME}-pip" - log_info "Deleting Public IP..." - if az network public-ip show --resource-group "$RESOURCE_GROUP" --name "$PIP_NAME" &>/dev/null; then - az network public-ip delete --resource-group "$RESOURCE_GROUP" --name "$PIP_NAME" --only-show-errors - log_success "Public IP deleted" - else - log_info "Public IP not found, skipping" - fi - - # Delete NSG - NSG_NAME="${GATEWAY_NAME}-nsg" - log_info "Deleting NSG..." - if az network nsg show --resource-group "$RESOURCE_GROUP" --name "$NSG_NAME" &>/dev/null; then - az network nsg delete --resource-group "$RESOURCE_GROUP" --name "$NSG_NAME" --only-show-errors - log_success "NSG deleted" - else - log_info "NSG not found, skipping" - fi - - # Delete disks - log_info "Deleting disks..." - DISK_NAMES=$(az disk list --resource-group "$RESOURCE_GROUP" --query "[?contains(name, '${GATEWAY_NAME}')].name" -o tsv 2>/dev/null || echo "") - for disk in $DISK_NAMES; do - az disk delete --resource-group "$RESOURCE_GROUP" --name "$disk" --yes --only-show-errors || true - done - - # Get VNet info and delete subnet - log_info "Deleting Gateway subnet..." - AKS_NODE_RG=$(az aks show --resource-group "$RESOURCE_GROUP" --name "$AKS_CLUSTER_NAME" \ - --query "nodeResourceGroup" -o tsv 2>/dev/null || echo "") - - if [[ -n "$AKS_NODE_RG" ]]; then - VMSS_NAME=$(az vmss list --resource-group "$AKS_NODE_RG" --query "[0].name" -o tsv 2>/dev/null || echo "") - if [[ -n "$VMSS_NAME" ]]; then - VNET_SUBNET_ID=$(az vmss show --resource-group "$AKS_NODE_RG" --name "$VMSS_NAME" \ - --query "virtualMachineProfile.networkProfile.networkInterfaceConfigurations[0].ipConfigurations[0].subnet.id" -o tsv 2>/dev/null || echo "") - if [[ -n "$VNET_SUBNET_ID" ]]; then - VNET_NAME=$(echo "$VNET_SUBNET_ID" | cut -d'/' -f9) - VNET_RG=$(echo "$VNET_SUBNET_ID" | cut -d'/' -f5) - az network vnet subnet delete --resource-group "$VNET_RG" --vnet-name "$VNET_NAME" \ - --name "$GATEWAY_SUBNET_NAME" 2>/dev/null || true - log_success "Gateway subnet deleted" - fi - fi - fi - - # Delete SSH keys - log_info "Deleting SSH keys..." - rm -f "$SSH_KEY_PATH" "${SSH_KEY_PATH}.pub" - log_success "SSH keys deleted" - - # Delete config.json - log_info "Deleting config file..." - rm -f "$CONFIG_FILE" - - echo "" - log_success "Full cleanup completed!" - echo "" - echo "All components and Azure resources have been removed." - echo "The local machine is now clean." 
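    # Post-cleanup check sketch (optional; mirrors the disk query used above):
    #   az resource list --resource-group "$RESOURCE_GROUP" \
    #     --query "[?contains(name, '${GATEWAY_NAME}')].name" -o tsv
    # An empty result indicates the Gateway VM, NIC, Public IP, NSG and disks are gone.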
-} - -main() { - echo -e "${YELLOW}Remove Edge Node from Private AKS Cluster${NC}" - echo -e "${YELLOW}=====================================${NC}" - echo "" - - parse_args "$@" - - # Install Azure CLI connectedmachine extension if needed - if ! az extension show --name connectedmachine &>/dev/null; then - log_info "Installing Azure CLI connectedmachine extension..." - az config set extension.dynamic_install_allow_preview=true --only-show-errors 2>/dev/null || true - az extension add --name connectedmachine --allow-preview true --only-show-errors 2>/dev/null || true - fi - - case "$CLEANUP_MODE" in - local) - cleanup_local - ;; - full) - cleanup_full - ;; - esac -} - -# Run main -main "$@" diff --git a/pkg/privatecluster/privatecluster_test.go b/pkg/privatecluster/privatecluster_test.go new file mode 100644 index 0000000..dcd47fa --- /dev/null +++ b/pkg/privatecluster/privatecluster_test.go @@ -0,0 +1,151 @@ +package privatecluster + +import ( + "testing" +) + +func TestParseResourceID(t *testing.T) { + tests := []struct { + name string + resourceID string + wantSubID string + wantRG string + wantName string + wantErr bool + }{ + { + name: "valid resource ID", + resourceID: "/subscriptions/549c6279-3a6a-4412-b267-b4da1afbe002/resourceGroups/weiliu2testrg/providers/Microsoft.ContainerService/managedClusters/my-private-aks", + wantSubID: "549c6279-3a6a-4412-b267-b4da1afbe002", + wantRG: "weiliu2testrg", + wantName: "my-private-aks", + wantErr: false, + }, + { + name: "lowercase resourcegroups", + resourceID: "/subscriptions/549c6279-3a6a-4412-b267-b4da1afbe002/resourcegroups/weiliu2testrg/providers/Microsoft.ContainerService/managedClusters/my-private-aks", + wantSubID: "549c6279-3a6a-4412-b267-b4da1afbe002", + wantRG: "weiliu2testrg", + wantName: "my-private-aks", + wantErr: false, + }, + { + name: "invalid resource ID - too short", + resourceID: "/subscriptions/xxx", + wantErr: true, + }, + { + name: "empty resource ID", + resourceID: "", + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + subID, rg, name, err := ParseResourceID(tt.resourceID) + if (err != nil) != tt.wantErr { + t.Errorf("ParseResourceID() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr { + if subID != tt.wantSubID { + t.Errorf("ParseResourceID() subID = %v, want %v", subID, tt.wantSubID) + } + if rg != tt.wantRG { + t.Errorf("ParseResourceID() rg = %v, want %v", rg, tt.wantRG) + } + if name != tt.wantName { + t.Errorf("ParseResourceID() name = %v, want %v", name, tt.wantName) + } + } + }) + } +} + +func TestDefaultConfigs(t *testing.T) { + // Test DefaultGatewayConfig + gw := DefaultGatewayConfig() + if gw.Name != "wg-gateway" { + t.Errorf("DefaultGatewayConfig().Name = %v, want wg-gateway", gw.Name) + } + if gw.Port != 51820 { + t.Errorf("DefaultGatewayConfig().Port = %v, want 51820", gw.Port) + } + + // Test DefaultVPNConfig + vpn := DefaultVPNConfig() + if vpn.NetworkInterface != "wg-aks" { + t.Errorf("DefaultVPNConfig().NetworkInterface = %v, want wg-aks", vpn.NetworkInterface) + } + if vpn.GatewayVPNIP != "172.16.0.1" { + t.Errorf("DefaultVPNConfig().GatewayVPNIP = %v, want 172.16.0.1", vpn.GatewayVPNIP) + } +} + +func TestLogger(t *testing.T) { + // Just test that logger doesn't panic + logger := NewLogger(false) + logger.Info("test info") + logger.Success("test success") + logger.Warning("test warning") + logger.Error("test error") + logger.Verbose("should not print") // verbose=false + + loggerVerbose := NewLogger(true) + 
loggerVerbose.Verbose("should print") +} + +func TestFileExists(t *testing.T) { + // Test with existing file + if !FileExists("types.go") { + t.Error("FileExists() should return true for types.go") + } + + // Test with non-existing file + if FileExists("nonexistent_file_12345.go") { + t.Error("FileExists() should return false for non-existent file") + } +} + +func TestCommandExists(t *testing.T) { + // Test with common command + if !CommandExists("ls") { + t.Error("CommandExists() should return true for 'ls'") + } + + // Test with non-existing command + if CommandExists("nonexistent_command_12345") { + t.Error("CommandExists() should return false for non-existent command") + } +} + +func TestInstallerCreation(t *testing.T) { + options := InstallOptions{ + AKSResourceID: "/subscriptions/xxx/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/cluster", + Verbose: true, + } + + installer := NewInstaller(options) + if installer == nil { + t.Fatal("NewInstaller() should not return nil") + } + if installer.logger == nil { + t.Error("Installer.logger should not be nil") + } + if installer.azure == nil { + t.Error("Installer.azure should not be nil") + } +} + +func TestUninstallerCreation(t *testing.T) { + options := UninstallOptions{ + Mode: CleanupModeLocal, + AKSResourceID: "", + } + + uninstaller := NewUninstaller(options) + if uninstaller == nil { + t.Error("NewUninstaller() should not return nil") + } +} diff --git a/pkg/privatecluster/scripts.go b/pkg/privatecluster/scripts.go index 86f72c7..ab0d710 100644 --- a/pkg/privatecluster/scripts.go +++ b/pkg/privatecluster/scripts.go @@ -3,107 +3,45 @@ package privatecluster import ( "context" "fmt" - "os" - "os/exec" - "path/filepath" -) - -type CleanupMode string - -const ( - // CleanupModeLocal removes node and local components, keeps Gateway for other nodes - CleanupModeLocal CleanupMode = "local" - // CleanupModeFull removes all components including Azure resources (Gateway, subnet, NSG, etc.) 
- CleanupModeFull CleanupMode = "full" ) +// ScriptRunner provides backward compatibility (Deprecated: use Installer/Uninstaller directly) type ScriptRunner struct { - scriptsDir string + verbose bool } +// NewScriptRunner creates a new ScriptRunner instance (Deprecated) func NewScriptRunner(scriptsDir string) *ScriptRunner { - if scriptsDir == "" { - candidates := []string{ - "./pkg/privatecluster", - } - if execPath, err := os.Executable(); err == nil { - execDir := filepath.Dir(execPath) - candidates = append(candidates, - filepath.Join(execDir, "pkg", "privatecluster"), - execDir, - ) - } - - for _, dir := range candidates { - if _, err := os.Stat(filepath.Join(dir, "private-install.sh")); err == nil { - scriptsDir = dir - break - } - } - } - return &ScriptRunner{scriptsDir: scriptsDir} + return &ScriptRunner{verbose: false} } -// RunPrivateInstall executes the private-install.sh script -// Assumes the Private AKS cluster already exists and user has admin permissions +// RunPrivateInstall executes the private cluster installation using Go implementation func (r *ScriptRunner) RunPrivateInstall(ctx context.Context, aksResourceID string) error { - scriptPath := filepath.Join(r.scriptsDir, "private-install.sh") - - // Check if script exists - if _, err := os.Stat(scriptPath); os.IsNotExist(err) { - return fmt.Errorf("script not found: %s", scriptPath) + if aksResourceID == "" { + return fmt.Errorf("AKS resource ID is required") } - // Execute script with AKS resource ID as argument - cmd := exec.CommandContext(ctx, "bash", scriptPath, "--aks-resource-id", aksResourceID) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - cmd.Stdin = os.Stdin - - if err := cmd.Run(); err != nil { - return fmt.Errorf("script execution failed: %w", err) + options := InstallOptions{ + AKSResourceID: aksResourceID, + Gateway: DefaultGatewayConfig(), + Verbose: r.verbose, } - return nil + installer := NewInstaller(options) + return installer.Install(ctx) } -// RunPrivateUninstall executes the private-uninstall.sh script -// mode: "local" - remove node and local components, keep Gateway -// mode: "full" - remove all components including Azure resources -// aksResourceID is required for "full" mode +// RunPrivateUninstall executes the private cluster uninstallation using Go implementation func (r *ScriptRunner) RunPrivateUninstall(ctx context.Context, mode CleanupMode, aksResourceID string) error { - scriptPath := filepath.Join(r.scriptsDir, "private-uninstall.sh") - - // Check if script exists - if _, err := os.Stat(scriptPath); os.IsNotExist(err) { - return fmt.Errorf("script not found: %s", scriptPath) - } - - // Build arguments based on mode - var args []string - args = append(args, scriptPath) - - switch mode { - case CleanupModeLocal: - args = append(args, "--local") - case CleanupModeFull: - if aksResourceID == "" { - return fmt.Errorf("--aks-resource-id is required for full cleanup mode") - } - args = append(args, "--full", "--aks-resource-id", aksResourceID) - default: - return fmt.Errorf("invalid cleanup mode: %s (use 'local' or 'full')", mode) + if mode == CleanupModeFull && aksResourceID == "" { + return fmt.Errorf("--aks-resource-id is required for full cleanup mode") } - // Execute script - cmd := exec.CommandContext(ctx, "bash", args...) 
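// Migration sketch (for reference; all names below are defined in this package):
// the deprecated wrapper call
//
//	runner := privatecluster.NewScriptRunner("")
//	err := runner.RunPrivateUninstall(ctx, privatecluster.CleanupModeLocal, "")
//
// now behaves the same as constructing the Uninstaller directly:
//
//	uninstaller := privatecluster.NewUninstaller(privatecluster.UninstallOptions{Mode: privatecluster.CleanupModeLocal})
//	err = uninstaller.Uninstall(ctx)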
- cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - cmd.Stdin = os.Stdin - - if err := cmd.Run(); err != nil { - return fmt.Errorf("script execution failed: %w", err) + options := UninstallOptions{ + Mode: mode, + AKSResourceID: aksResourceID, } - return nil + uninstaller := NewUninstaller(options) + return uninstaller.Uninstall(ctx) } diff --git a/pkg/privatecluster/ssh.go b/pkg/privatecluster/ssh.go new file mode 100644 index 0000000..3ab5bc4 --- /dev/null +++ b/pkg/privatecluster/ssh.go @@ -0,0 +1,158 @@ +package privatecluster + +import ( + "context" + "fmt" + "os/exec" + "strings" + "time" +) + +// SSHClient provides SSH operations to a remote host +type SSHClient struct { + config SSHConfig + logger *Logger +} + +// NewSSHClient creates a new SSHClient instance +func NewSSHClient(config SSHConfig, logger *Logger) *SSHClient { + return &SSHClient{ + config: config, + logger: logger, + } +} + +// buildSSHArgs builds common SSH arguments +func (s *SSHClient) buildSSHArgs() []string { + return []string{ + "-o", "IdentitiesOnly=yes", + "-o", "StrictHostKeyChecking=no", + "-o", fmt.Sprintf("ConnectTimeout=%d", s.config.Timeout), + "-i", s.config.KeyPath, + } +} + +// Execute runs a command on the remote host and returns the output +func (s *SSHClient) Execute(ctx context.Context, command string) (string, error) { + args := s.buildSSHArgs() + args = append(args, fmt.Sprintf("%s@%s", s.config.User, s.config.Host), command) + + cmd := exec.CommandContext(ctx, "ssh", args...) // #nosec G204 -- ssh with trusted internal args + output, err := cmd.CombinedOutput() + if err != nil { + return string(output), fmt.Errorf("SSH command failed: %w\nOutput: %s", err, string(output)) + } + return strings.TrimSpace(string(output)), nil +} + +// ExecuteSilent runs a command on the remote host, returning only success/failure +func (s *SSHClient) ExecuteSilent(ctx context.Context, command string) bool { + args := s.buildSSHArgs() + args = append(args, fmt.Sprintf("%s@%s", s.config.User, s.config.Host), command) + + cmd := exec.CommandContext(ctx, "ssh", args...) // #nosec G204 -- ssh with trusted internal args + return cmd.Run() == nil +} + +// ExecuteScript runs a multi-line script on the remote host +func (s *SSHClient) ExecuteScript(ctx context.Context, script string) error { + args := s.buildSSHArgs() + args = append(args, fmt.Sprintf("%s@%s", s.config.User, s.config.Host)) + + cmd := exec.CommandContext(ctx, "ssh", args...) // #nosec G204 -- ssh with trusted internal args + cmd.Stdin = strings.NewReader(script) + + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("SSH script execution failed: %w\nOutput: %s", err, string(output)) + } + return nil +} + +// TestConnection tests if SSH connection is ready +func (s *SSHClient) TestConnection(ctx context.Context) bool { + return s.ExecuteSilent(ctx, "echo ready") +} + +// WaitForConnection waits for SSH connection to be ready with retries +func (s *SSHClient) WaitForConnection(ctx context.Context, maxAttempts int, interval time.Duration) error { + // Quick first check + if s.TestConnection(ctx) { + return nil + } + + s.logger.Info("Waiting for SSH connection to be ready...") + + for attempt := 1; attempt <= maxAttempts; attempt++ { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(interval): + } + + if s.TestConnection(ctx) { + return nil + } + + s.logger.Verbose("Waiting for SSH... 
(%d/%d)", attempt, maxAttempts) + } + + return fmt.Errorf("SSH connection timeout after %d attempts", maxAttempts) +} + +// ReadRemoteFile reads a file from the remote host +func (s *SSHClient) ReadRemoteFile(ctx context.Context, path string) (string, error) { + return s.Execute(ctx, fmt.Sprintf("sudo cat %s 2>/dev/null || echo ''", path)) +} + +// WriteRemoteFile writes content to a file on the remote host +func (s *SSHClient) WriteRemoteFile(ctx context.Context, path, content string) error { + // Use heredoc to write file + script := fmt.Sprintf(`sudo tee %s > /dev/null << 'EOFCONTENT' +%s +EOFCONTENT`, path, content) + return s.ExecuteScript(ctx, script) +} + +// CommandExists checks if a command exists on the remote host +func (s *SSHClient) CommandExists(ctx context.Context, command string) bool { + return s.ExecuteSilent(ctx, fmt.Sprintf("command -v %s", command)) +} + +// GenerateSSHKey generates an SSH key pair +func GenerateSSHKey(keyPath string) error { + // Check if key already exists + if FileExists(keyPath) { + return nil + } + + // Ensure directory exists + if err := EnsureDirectory(GetRealHome() + "/.ssh"); err != nil { + return err + } + + cmd := exec.Command("ssh-keygen", "-t", "rsa", "-b", "4096", "-f", keyPath, "-N", "") // #nosec G204 -- fixed args + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("failed to generate SSH key: %w\nOutput: %s", err, string(output)) + } + + // Fix ownership if running with sudo + return FixSSHKeyOwnership(keyPath) +} + +// RemoveSSHKeys removes SSH key pair +func RemoveSSHKeys(keyPath string) error { + for _, path := range []string{keyPath, keyPath + ".pub"} { + if FileExists(path) { + if err := removeFile(path); err != nil { + return err + } + } + } + return nil +} + +func removeFile(path string) error { + cmd := exec.Command("rm", "-f", path) // #nosec G204 -- fixed command with path arg + return cmd.Run() +} diff --git a/pkg/privatecluster/types.go b/pkg/privatecluster/types.go new file mode 100644 index 0000000..0706db8 --- /dev/null +++ b/pkg/privatecluster/types.go @@ -0,0 +1,96 @@ +package privatecluster + +// CleanupMode defines the cleanup mode for uninstallation +type CleanupMode string + +const ( + CleanupModeLocal CleanupMode = "local" + CleanupModeFull CleanupMode = "full" +) + +// GatewayConfig holds configuration for the VPN Gateway VM +type GatewayConfig struct { + Name string + SubnetName string + SubnetPrefix string + VMSize string + Port int +} + +// VPNConfig holds VPN connection configuration +type VPNConfig struct { + NetworkInterface string + VPNNetwork string + GatewayVPNIP string + ClientVPNIP string + ServerPublicKey string + ServerEndpoint string +} + +// AKSClusterInfo holds parsed AKS cluster information +type AKSClusterInfo struct { + ResourceID string + SubscriptionID string + ResourceGroup string + ClusterName string + Location string + TenantID string + NodeResourceGroup string + VNetName string + VNetResourceGroup string + PrivateFQDN string + APIServerIP string +} + +// SSHConfig holds SSH connection configuration +type SSHConfig struct { + KeyPath string + Host string + User string + Port int + Timeout int +} + +// InstallOptions holds options for the install operation +type InstallOptions struct { + AKSResourceID string + Gateway GatewayConfig + Verbose bool +} + +// UninstallOptions holds options for the uninstall operation +type UninstallOptions struct { + Mode CleanupMode + AKSResourceID string +} + +// DefaultGatewayConfig returns the default Gateway configuration +func 
DefaultGatewayConfig() GatewayConfig { + return GatewayConfig{ + Name: "wg-gateway", + SubnetName: "wg-subnet", + SubnetPrefix: "10.0.100.0/24", + VMSize: "Standard_D2s_v3", + Port: 51820, + } +} + +// DefaultVPNConfig returns the default VPN configuration +func DefaultVPNConfig() VPNConfig { + return VPNConfig{ + NetworkInterface: "wg-aks", + VPNNetwork: "172.16.0.0/24", + GatewayVPNIP: "172.16.0.1", + } +} + +// DefaultSSHConfig returns the default SSH configuration +func DefaultSSHConfig(keyPath, host string) SSHConfig { + return SSHConfig{ + KeyPath: keyPath, + Host: host, + User: "azureuser", + Port: 22, + Timeout: 10, + } +} diff --git a/pkg/privatecluster/uninstaller.go b/pkg/privatecluster/uninstaller.go new file mode 100644 index 0000000..c8b3252 --- /dev/null +++ b/pkg/privatecluster/uninstaller.go @@ -0,0 +1,349 @@ +package privatecluster + +import ( + "context" + "fmt" + "strings" +) + +// Uninstaller handles private cluster uninstallation +type Uninstaller struct { + logger *Logger + azure *AzureCLI + options UninstallOptions + + // State + clusterInfo *AKSClusterInfo + vpnConfig VPNConfig + sshKeyPath string + gatewayIP string + clientKey string +} + +// NewUninstaller creates a new Uninstaller instance +func NewUninstaller(options UninstallOptions) *Uninstaller { + logger := NewLogger(false) + return &Uninstaller{ + logger: logger, + azure: NewAzureCLI(logger), + options: options, + vpnConfig: DefaultVPNConfig(), + sshKeyPath: GetSSHKeyPath(), + } +} + +// Uninstall runs the uninstallation process +func (u *Uninstaller) Uninstall(ctx context.Context) error { + fmt.Printf("%sRemove Edge Node from Private AKS Cluster%s\n", colorYellow, colorReset) + fmt.Printf("%s=====================================%s\n\n", colorYellow, colorReset) + + // Parse resource ID if provided + if u.options.AKSResourceID != "" { + subscriptionID, resourceGroup, clusterName, err := ParseResourceID(u.options.AKSResourceID) + if err != nil { + return err + } + u.clusterInfo = &AKSClusterInfo{ + ResourceID: u.options.AKSResourceID, + SubscriptionID: subscriptionID, + ResourceGroup: resourceGroup, + ClusterName: clusterName, + } + u.logger.Info("Cluster: %s/%s (Subscription: %s)", resourceGroup, clusterName, subscriptionID) + } + + _ = u.azure.InstallConnectedMachineExtension(ctx) + + switch u.options.Mode { + case CleanupModeLocal: + return u.cleanupLocal(ctx) + case CleanupModeFull: + return u.cleanupFull(ctx) + default: + return fmt.Errorf("invalid cleanup mode: %s", u.options.Mode) + } +} + +// cleanupLocal performs local cleanup (keeps Gateway) +func (u *Uninstaller) cleanupLocal(ctx context.Context) error { + u.logger.Info("Performing local cleanup (keeping Gateway)...") + + hostname, err := GetHostname() + if err != nil { + return err + } + + // Get Gateway IP and client key from VPN config (before stopping VPN) + u.readVPNConfig() + + // Remove node from cluster (while VPN is still connected) + u.removeNodeFromCluster(ctx, hostname) + + // Stop any running aks-flex-node agent process + u.stopFlexNodeAgent(ctx) + + // Note: main unbootstrap handles kubelet/containerd cleanup + + // Remove Arc Agent + u.removeArcAgent(ctx, hostname) + + // Remove client peer from Gateway + u.removeClientPeerFromGateway(ctx) + + // Stop VPN + u.stopVPN(ctx) + + // Delete VPN client configuration + u.deleteVPNConfig() + + // Clean up hosts entries + u.cleanupHostsEntries() + + // Note: config.json is preserved for potential re-use + + fmt.Println() + u.logger.Success("Local cleanup completed!") + fmt.Println() + 
fmt.Println("To rejoin cluster, run:") + fmt.Println(" sudo ./aks-flex-node agent --config config.json # with private: true") + + return nil +} + +// cleanupFull performs full cleanup (removes all Azure resources) +func (u *Uninstaller) cleanupFull(ctx context.Context) error { + u.logger.Info("Performing full cleanup...") + + hostname, err := GetHostname() + if err != nil { + return err + } + + // Get Gateway IP and client key from VPN config (before stopping VPN) + u.readVPNConfig() + + // Remove node from cluster (while VPN is still connected) + u.removeNodeFromCluster(ctx, hostname) + + // Stop any running aks-flex-node agent process + u.stopFlexNodeAgent(ctx) + + // Note: main unbootstrap handles kubelet/containerd cleanup + + // Remove Arc Agent + u.removeArcAgent(ctx, hostname) + + // Remove client peer from Gateway + u.removeClientPeerFromGateway(ctx) + + // Stop VPN + u.stopVPN(ctx) + + // Delete VPN client configuration + u.deleteVPNConfig() + + // Clean up hosts entries + u.cleanupHostsEntries() + + // Delete Azure resources + if err := u.deleteAzureResources(ctx); err != nil { + u.logger.Warning("Failed to delete some Azure resources: %v", err) + } + + // Delete SSH keys + u.deleteSSHKeys() + + // Note: config.json is preserved for potential re-use + + fmt.Println() + u.logger.Success("Full cleanup completed!") + fmt.Println() + fmt.Println("All components and Azure resources have been removed.") + fmt.Println("The local machine is now clean.") + + return nil +} + +// readVPNConfig reads Gateway IP and client key from VPN config +func (u *Uninstaller) readVPNConfig() { + vpnClient := NewVPNClient(u.vpnConfig, u.logger) + gatewayIP, clientKey, err := vpnClient.GetClientConfigInfo() + if err == nil { + u.gatewayIP = gatewayIP + u.clientKey = clientKey + } +} + +// removeNodeFromCluster removes the node from the Kubernetes cluster +func (u *Uninstaller) removeNodeFromCluster(ctx context.Context, nodeName string) { + if !CommandExists("kubectl") { + return + } + + u.logger.Info("Removing node %s from cluster...", nodeName) + + // Try root kubeconfig first + if _, err := RunCommand(ctx, "kubectl", "--kubeconfig", "/root/.kube/config", + "delete", "node", nodeName, "--ignore-not-found"); err == nil { + u.logger.Success("Node removed from cluster") + return + } + + // Try default kubeconfig + if _, err := RunCommand(ctx, "kubectl", "delete", "node", nodeName, "--ignore-not-found"); err == nil { + u.logger.Success("Node removed from cluster") + return + } + + u.logger.Warning("Failed to remove node from cluster (may need manual cleanup: kubectl delete node %s)", nodeName) +} + +// stopFlexNodeAgent stops any running aks-flex-node agent process +func (u *Uninstaller) stopFlexNodeAgent(ctx context.Context) { + u.logger.Info("Stopping aks-flex-node agent...") + _, _ = RunCommand(ctx, "pkill", "-f", "aks-flex-node agent") + _, _ = RunCommand(ctx, "sleep", "2") +} + +// removeArcAgent removes Azure Arc agent +func (u *Uninstaller) removeArcAgent(ctx context.Context, nodeName string) { + if !CommandExists("azcmagent") { + u.logger.Info("Arc Agent not found, skipping") + return + } + + u.logger.Info("Removing Arc Agent...") + + // Get Arc resource group + arcRG := "" + output, err := RunCommand(ctx, "azcmagent", "show") + if err == nil { + for _, line := range strings.Split(output, "\n") { + if strings.Contains(line, "Resource Group") { + parts := strings.SplitN(line, ":", 2) + if len(parts) == 2 { + arcRG = strings.TrimSpace(parts[1]) + } + } + } + } + + if arcRG != "" { + 
u.logger.Info("Deleting Arc machine from Azure...") + _ = u.azure.DeleteConnectedMachine(ctx, arcRG, nodeName) + u.logger.Success("Arc machine deleted from Azure") + } else if u.clusterInfo != nil { + _ = u.azure.DeleteConnectedMachine(ctx, u.clusterInfo.ResourceGroup, nodeName) + } + + _, _ = RunCommand(ctx, "azcmagent", "disconnect", "--force-local-only") + + for _, service := range []string{"himdsd", "extd", "gcad", "arcproxyd"} { + _, _ = RunCommand(ctx, "systemctl", "stop", service) + _, _ = RunCommand(ctx, "systemctl", "disable", service) + } + + if CommandExists("apt") { + _, _ = RunCommand(ctx, "apt", "remove", "azcmagent", "-y") + } else if CommandExists("yum") { + _, _ = RunCommand(ctx, "yum", "remove", "azcmagent", "-y") + } + + _, _ = RunCommand(ctx, "rm", "-rf", "/var/opt/azcmagent", "/opt/azcmagent") + + u.logger.Success("Arc Agent removed") +} + +// removeClientPeerFromGateway removes this client's peer from the Gateway +func (u *Uninstaller) removeClientPeerFromGateway(ctx context.Context) { + if u.gatewayIP == "" || u.clientKey == "" || !FileExists(u.sshKeyPath) { + return + } + + u.logger.Info("Removing client peer from Gateway...") + + // Get public key from private key + vpnClient := NewVPNClient(u.vpnConfig, u.logger) + clientPubKey, err := vpnClient.GetPublicKeyFromPrivate(ctx, u.clientKey) + if err != nil || clientPubKey == "" { + return + } + + // Connect to Gateway and remove peer + sshConfig := DefaultSSHConfig(u.sshKeyPath, u.gatewayIP) + sshConfig.Timeout = 10 + ssh := NewSSHClient(sshConfig, u.logger) + vpnServer := NewVPNServerManager(ssh, u.logger) + + _ = vpnServer.RemovePeer(ctx, clientPubKey) + u.logger.Success("Client peer removed from Gateway") +} + +// stopVPN stops the VPN connection +func (u *Uninstaller) stopVPN(ctx context.Context) { + vpnClient := NewVPNClient(u.vpnConfig, u.logger) + _ = vpnClient.Stop(ctx) + u.logger.Success("VPN connection stopped") +} + +// deleteVPNConfig deletes the VPN client configuration +func (u *Uninstaller) deleteVPNConfig() { + vpnClient := NewVPNClient(u.vpnConfig, u.logger) + _ = vpnClient.RemoveClientConfig() + u.logger.Success("VPN config deleted") +} + +// cleanupHostsEntries removes AKS-related entries from /etc/hosts +func (u *Uninstaller) cleanupHostsEntries() { + _ = RemoveHostsEntries("privatelink") + _ = RemoveHostsEntries("azmk8s.io") + u.logger.Success("Hosts entries cleaned") +} + +// deleteSSHKeys deletes the Gateway SSH keys +func (u *Uninstaller) deleteSSHKeys() { + _ = RemoveSSHKeys(u.sshKeyPath) + u.logger.Success("SSH keys deleted") +} + +// deleteAzureResources deletes all Azure resources created for the Gateway +func (u *Uninstaller) deleteAzureResources(ctx context.Context) error { + if u.clusterInfo == nil { + return fmt.Errorf("cluster info not available") + } + + u.logger.Info("Deleting Azure resources...") + if err := u.azure.SetSubscription(ctx, u.clusterInfo.SubscriptionID); err != nil { + return err + } + + gatewayName := "wg-gateway" + nicName := gatewayName + "VMNic" + pipName := gatewayName + "-pip" + nsgName := gatewayName + "-nsg" + + if err := u.azure.DeleteVM(ctx, u.clusterInfo.ResourceGroup, gatewayName); err != nil { + u.logger.Warning("Delete VM: %v", err) + } + if err := u.azure.DeleteNIC(ctx, u.clusterInfo.ResourceGroup, nicName); err != nil { + u.logger.Warning("Delete NIC: %v", err) + } + if err := u.azure.DeletePublicIP(ctx, u.clusterInfo.ResourceGroup, pipName); err != nil { + u.logger.Warning("Delete Public IP: %v", err) + } + if err := u.azure.DeleteNSG(ctx, 
u.clusterInfo.ResourceGroup, nsgName); err != nil { + u.logger.Warning("Delete NSG: %v", err) + } + _ = u.azure.DeleteDisks(ctx, u.clusterInfo.ResourceGroup, gatewayName) + + clusterInfo, err := u.azure.GetAKSClusterInfo(ctx, u.clusterInfo.ResourceGroup, u.clusterInfo.ClusterName) + if err == nil { + vnetName, vnetRG, err := u.azure.GetVNetInfo(ctx, clusterInfo.NodeResourceGroup) + if err == nil { + _ = u.azure.DeleteSubnet(ctx, vnetRG, vnetName, "wg-subnet") + } + } + u.logger.Success("Azure resources deleted") + + return nil +} diff --git a/pkg/privatecluster/utils.go b/pkg/privatecluster/utils.go new file mode 100644 index 0000000..a88e599 --- /dev/null +++ b/pkg/privatecluster/utils.go @@ -0,0 +1,277 @@ +package privatecluster + +import ( + "bufio" + "context" + "fmt" + "os" + "os/exec" + "os/user" + "path/filepath" + "strings" +) + +// Color codes for terminal output +const ( + colorRed = "\033[0;31m" + colorGreen = "\033[0;32m" + colorYellow = "\033[1;33m" + colorBlue = "\033[0;34m" + colorReset = "\033[0m" +) + +// Logger provides colored logging for the private cluster operations +type Logger struct { + verbose bool +} + +// NewLogger creates a new Logger instance +func NewLogger(verbose bool) *Logger { + return &Logger{verbose: verbose} +} + +// Info logs an info message +func (l *Logger) Info(format string, args ...interface{}) { + msg := fmt.Sprintf(format, args...) + fmt.Printf("%sINFO:%s %s\n", colorBlue, colorReset, msg) +} + +// Success logs a success message +func (l *Logger) Success(format string, args ...interface{}) { + msg := fmt.Sprintf(format, args...) + fmt.Printf("%sSUCCESS:%s %s\n", colorGreen, colorReset, msg) +} + +// Warning logs a warning message +func (l *Logger) Warning(format string, args ...interface{}) { + msg := fmt.Sprintf(format, args...) + fmt.Printf("%sWARNING:%s %s\n", colorYellow, colorReset, msg) +} + +// Error logs an error message +func (l *Logger) Error(format string, args ...interface{}) { + msg := fmt.Sprintf(format, args...) + fmt.Printf("%sERROR:%s %s\n", colorRed, colorReset, msg) +} + +// Verbose logs a verbose message (only if verbose mode is enabled) +func (l *Logger) Verbose(format string, args ...interface{}) { + if l.verbose { + msg := fmt.Sprintf(format, args...) + fmt.Printf("%sVERBOSE:%s %s\n", colorBlue, colorReset, msg) + } +} + +// RunCommand executes a command and returns its output +func RunCommand(ctx context.Context, name string, args ...string) (string, error) { + cmd := exec.CommandContext(ctx, name, args...) // #nosec G204 -- commands are from trusted internal code + output, err := cmd.CombinedOutput() + if err != nil { + return string(output), fmt.Errorf("command '%s %s' failed: %w\nOutput: %s", + name, strings.Join(args, " "), err, string(output)) + } + return strings.TrimSpace(string(output)), nil +} + +// RunCommandSilent executes a command and returns only whether it succeeded +func RunCommandSilent(ctx context.Context, name string, args ...string) bool { + cmd := exec.CommandContext(ctx, name, args...) // #nosec G204 -- commands are from trusted internal code + return cmd.Run() == nil +} + +// RunCommandInteractive executes a command with stdout/stderr/stdin connected to the terminal +func RunCommandInteractive(ctx context.Context, name string, args ...string) error { + cmd := exec.CommandContext(ctx, name, args...) 
// #nosec G204 -- commands are from trusted internal code + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + cmd.Stdin = os.Stdin + return cmd.Run() +} + +// CommandExists checks if a command is available in PATH +func CommandExists(name string) bool { + _, err := exec.LookPath(name) + return err == nil +} + +// GetRealHome returns the real user's home directory (handles sudo) +func GetRealHome() string { + // Check if running with sudo + if sudoUser := os.Getenv("SUDO_USER"); sudoUser != "" { + if u, err := user.Lookup(sudoUser); err == nil { + return u.HomeDir + } + } + // Fallback to current user's home + if home := os.Getenv("HOME"); home != "" { + return home + } + if u, err := user.Current(); err == nil { + return u.HomeDir + } + return "/root" +} + +// GetSSHKeyPath returns the default SSH key path for the Gateway +func GetSSHKeyPath() string { + return filepath.Join(GetRealHome(), ".ssh", "id_rsa_wg_gateway") +} + +// EnsureDirectory creates a directory if it doesn't exist +func EnsureDirectory(path string) error { + return os.MkdirAll(path, 0750) +} + +// FileExists checks if a file exists +func FileExists(path string) bool { + _, err := os.Stat(path) + return err == nil +} + +// ReadFileContent reads a file and returns its content +func ReadFileContent(path string) (string, error) { + data, err := os.ReadFile(path) // #nosec G304 -- path is from trusted internal code + if err != nil { + return "", err + } + return string(data), nil +} + +// WriteFileContent writes content to a file with specified permissions +func WriteFileContent(path, content string, perm os.FileMode) error { + return os.WriteFile(path, []byte(content), perm) // #nosec G306 -- perm is from trusted internal code +} + +// AddHostsEntry adds an entry to /etc/hosts if it doesn't exist +func AddHostsEntry(ip, hostname string) error { + hostsPath := "/etc/hosts" + + // Check if entry already exists + content, err := ReadFileContent(hostsPath) + if err != nil { + return fmt.Errorf("failed to read hosts file: %w", err) + } + + if strings.Contains(content, hostname) { + return nil // Entry already exists + } + + f, err := os.OpenFile(hostsPath, os.O_APPEND|os.O_WRONLY, 0644) // #nosec G302,G304 -- /etc/hosts requires 0644 + if err != nil { + return fmt.Errorf("failed to open hosts file: %w", err) + } + defer func() { _ = f.Close() }() + + entry := fmt.Sprintf("%s %s\n", ip, hostname) + if _, err := f.WriteString(entry); err != nil { + return fmt.Errorf("failed to write hosts entry: %w", err) + } + + return nil +} + +// RemoveHostsEntries removes entries matching a pattern from /etc/hosts +func RemoveHostsEntries(pattern string) error { + hostsPath := "/etc/hosts" + + content, err := ReadFileContent(hostsPath) + if err != nil { + return fmt.Errorf("failed to read hosts file: %w", err) + } + + var newLines []string + scanner := bufio.NewScanner(strings.NewReader(content)) + for scanner.Scan() { + line := scanner.Text() + if !strings.Contains(line, pattern) { + newLines = append(newLines, line) + } + } + + newContent := strings.Join(newLines, "\n") + if !strings.HasSuffix(newContent, "\n") { + newContent += "\n" + } + + return WriteFileContent(hostsPath, newContent, 0644) +} + +// ParseResourceID parses an Azure resource ID and returns its components +func ParseResourceID(resourceID string) (subscriptionID, resourceGroup, resourceName string, err error) { + // Normalize: Azure CLI sometimes returns lowercase 'resourcegroups' + resourceID = strings.Replace(resourceID, "/resourcegroups/", "/resourceGroups/", 1) + + parts := 
strings.Split(resourceID, "/") + if len(parts) < 9 { + return "", "", "", fmt.Errorf("invalid resource ID format: %s", resourceID) + } + + // Format: /subscriptions/{sub}/resourceGroups/{rg}/providers/{provider}/{type}/{name} + subscriptionID = parts[2] + resourceGroup = parts[4] + resourceName = parts[8] + + if subscriptionID == "" || resourceGroup == "" || resourceName == "" { + return "", "", "", fmt.Errorf("failed to parse resource ID components: %s", resourceID) + } + + return subscriptionID, resourceGroup, resourceName, nil +} + +// FixSSHKeyOwnership fixes SSH key ownership when running with sudo +func FixSSHKeyOwnership(keyPath string) error { + sudoUser := os.Getenv("SUDO_USER") + if sudoUser == "" { + return nil // Not running with sudo + } + + u, err := user.Lookup(sudoUser) + if err != nil { + return fmt.Errorf("failed to lookup user %s: %w", sudoUser, err) + } + + // Change ownership of both private and public keys + for _, path := range []string{keyPath, keyPath + ".pub"} { + if FileExists(path) { + cmd := exec.Command("chown", fmt.Sprintf("%s:%s", u.Uid, u.Gid), path) // #nosec G204 -- chown with uid/gid + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to change ownership of %s: %w", path, err) + } + } + } + + return nil +} + +// GetHostname returns the lowercase hostname +func GetHostname() (string, error) { + hostname, err := os.Hostname() + if err != nil { + return "", err + } + return strings.ToLower(hostname), nil +} + +// IsRoot checks if the current process is running as root +func IsRoot() bool { + return os.Getuid() == 0 +} + +// CleanKubeCache removes kube cache directories +func CleanKubeCache() error { + paths := []string{ + "/root/.kube/cache", + filepath.Join(GetRealHome(), ".kube", "cache"), + } + + for _, path := range paths { + if FileExists(path) { + if err := os.RemoveAll(path); err != nil { + // Log but don't fail + continue + } + } + } + + return nil +} diff --git a/pkg/privatecluster/vpn.go b/pkg/privatecluster/vpn.go new file mode 100644 index 0000000..7242c12 --- /dev/null +++ b/pkg/privatecluster/vpn.go @@ -0,0 +1,267 @@ +package privatecluster + +import ( + "context" + "fmt" + "os/exec" + "strconv" + "strings" +) + +// VPNClient provides VPN (WireGuard) operations +type VPNClient struct { + config VPNConfig + logger *Logger +} + +// NewVPNClient creates a new VPNClient instance +func NewVPNClient(config VPNConfig, logger *Logger) *VPNClient { + return &VPNClient{ + config: config, + logger: logger, + } +} + +// GenerateKeyPair generates a WireGuard key pair and returns (privateKey, publicKey) +func (v *VPNClient) GenerateKeyPair(ctx context.Context) (string, string, error) { + // Generate private key + privateKey, err := RunCommand(ctx, "wg", "genkey") + if err != nil { + return "", "", fmt.Errorf("failed to generate VPN private key: %w", err) + } + + cmd := exec.CommandContext(ctx, "wg", "pubkey") // #nosec G204 -- fixed wg command + cmd.Stdin = strings.NewReader(privateKey) + publicKeyBytes, err := cmd.Output() + if err != nil { + return "", "", fmt.Errorf("failed to generate VPN public key: %w", err) + } + + return privateKey, strings.TrimSpace(string(publicKeyBytes)), nil +} + +// CreateClientConfig creates the client VPN configuration file +func (v *VPNClient) CreateClientConfig(privateKey string, gatewayPort int) error { + configPath := fmt.Sprintf("/etc/wireguard/%s.conf", v.config.NetworkInterface) + + config := fmt.Sprintf(`[Interface] +PrivateKey = %s +Address = %s/24 + +[Peer] +PublicKey = %s +Endpoint = %s:%d +AllowedIPs = 
10.0.0.0/8, 172.16.0.0/24 +PersistentKeepalive = 25 +`, privateKey, v.config.ClientVPNIP, v.config.ServerPublicKey, v.config.ServerEndpoint, gatewayPort) + + if err := WriteFileContent(configPath, config, 0600); err != nil { + return fmt.Errorf("failed to create VPN client config: %w", err) + } + + return nil +} + +// Start starts the VPN connection +func (v *VPNClient) Start(ctx context.Context) error { + _ = v.Stop(ctx) + + _, err := RunCommand(ctx, "wg-quick", "up", v.config.NetworkInterface) + if err != nil { + return fmt.Errorf("failed to start VPN: %w", err) + } + + return nil +} + +// Stop stops the VPN connection +func (v *VPNClient) Stop(ctx context.Context) error { + _, _ = RunCommand(ctx, "wg-quick", "down", v.config.NetworkInterface) + return nil +} + +// TestConnection tests VPN connectivity by pinging the gateway +func (v *VPNClient) TestConnection(ctx context.Context) bool { + return RunCommandSilent(ctx, "ping", "-c", "1", "-W", "3", v.config.GatewayVPNIP) +} + +// RemoveClientConfig removes the client VPN configuration file +func (v *VPNClient) RemoveClientConfig() error { + configPath := fmt.Sprintf("/etc/wireguard/%s.conf", v.config.NetworkInterface) + if FileExists(configPath) { + cmd := exec.Command("rm", "-f", configPath) // #nosec G204 -- fixed rm command + return cmd.Run() + } + return nil +} + +// GetClientConfigInfo reads the current client config and returns Gateway IP and private key +func (v *VPNClient) GetClientConfigInfo() (gatewayIP, privateKey string, err error) { + configPath := fmt.Sprintf("/etc/wireguard/%s.conf", v.config.NetworkInterface) + + content, err := ReadFileContent(configPath) + if err != nil { + return "", "", fmt.Errorf("failed to read VPN config: %w", err) + } + + // Parse Endpoint to get Gateway IP + for _, line := range strings.Split(content, "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "Endpoint") { + parts := strings.SplitN(line, "=", 2) + if len(parts) == 2 { + endpoint := strings.TrimSpace(parts[1]) + // Remove port + gatewayIP = strings.Split(endpoint, ":")[0] + } + } + if strings.HasPrefix(line, "PrivateKey") { + parts := strings.SplitN(line, "=", 2) + if len(parts) == 2 { + privateKey = strings.TrimSpace(parts[1]) + } + } + } + + return gatewayIP, privateKey, nil +} + +// GetPublicKeyFromPrivate derives public key from private key +func (v *VPNClient) GetPublicKeyFromPrivate(ctx context.Context, privateKey string) (string, error) { + cmd := exec.CommandContext(ctx, "wg", "pubkey") // #nosec G204 -- fixed wg command + cmd.Stdin = strings.NewReader(privateKey) + output, err := cmd.Output() + if err != nil { + return "", fmt.Errorf("failed to derive public key: %w", err) + } + return strings.TrimSpace(string(output)), nil +} + +// VPNServerManager manages VPN server on the Gateway +type VPNServerManager struct { + ssh *SSHClient + logger *Logger +} + +// NewVPNServerManager creates a new VPNServerManager instance +func NewVPNServerManager(ssh *SSHClient, logger *Logger) *VPNServerManager { + return &VPNServerManager{ + ssh: ssh, + logger: logger, + } +} + +// IsInstalled checks if VPN software is installed on the server +func (m *VPNServerManager) IsInstalled(ctx context.Context) bool { + return m.ssh.CommandExists(ctx, "wg") +} + +// Install installs and configures VPN server +func (m *VPNServerManager) Install(ctx context.Context) error { + script := `set -e + +# Install WireGuard +sudo apt-get update +sudo apt-get install -y wireguard + +# Generate key pair +sudo wg genkey | sudo tee 
/etc/wireguard/server_private.key | sudo wg pubkey | sudo tee /etc/wireguard/server_public.key +sudo chmod 600 /etc/wireguard/server_private.key + +SERVER_PRIVATE_KEY=$(sudo cat /etc/wireguard/server_private.key) + +# Create configuration +sudo tee /etc/wireguard/wg0.conf << EOF +[Interface] +PrivateKey = ${SERVER_PRIVATE_KEY} +Address = 172.16.0.1/24 +ListenPort = 51820 +PostUp = iptables -A FORWARD -i wg0 -j ACCEPT; iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE +PostDown = iptables -D FORWARD -i wg0 -j ACCEPT; iptables -t nat -D POSTROUTING -o eth0 -j MASQUERADE +EOF + +# Enable IP forwarding +echo 'net.ipv4.ip_forward=1' | sudo tee -a /etc/sysctl.conf +sudo sysctl -p + +# Start VPN service +sudo systemctl enable wg-quick@wg0 +sudo systemctl start wg-quick@wg0 || sudo systemctl restart wg-quick@wg0 + +echo "VPN server configuration complete" +` + return m.ssh.ExecuteScript(ctx, script) +} + +// GetPublicKey retrieves the server's public key +func (m *VPNServerManager) GetPublicKey(ctx context.Context) (string, error) { + key, err := m.ssh.ReadRemoteFile(ctx, "/etc/wireguard/server_public.key") + if err != nil || key == "" { + return "", fmt.Errorf("failed to get server public key") + } + return strings.TrimSpace(key), nil +} + +// GetPeerCount returns the number of existing peers +func (m *VPNServerManager) GetPeerCount(ctx context.Context) (int, error) { + output, err := m.ssh.Execute(ctx, "sudo wg show wg0 peers 2>/dev/null | wc -l || echo 0") + if err != nil { + return 0, nil // Default to 0 if error + } + count, _ := strconv.Atoi(strings.TrimSpace(output)) + return count, nil +} + +// AddPeer adds a client peer to the server +func (m *VPNServerManager) AddPeer(ctx context.Context, clientPublicKey, clientIP string) error { + // Add peer + cmd := fmt.Sprintf("sudo wg set wg0 peer '%s' allowed-ips %s/32", clientPublicKey, clientIP) + if _, err := m.ssh.Execute(ctx, cmd); err != nil { + return fmt.Errorf("failed to add peer: %w", err) + } + + // Persist configuration + if _, err := m.ssh.Execute(ctx, "sudo wg-quick save wg0"); err != nil { + return fmt.Errorf("failed to save VPN config: %w", err) + } + + return nil +} + +// RemovePeer removes a client peer from the server +func (m *VPNServerManager) RemovePeer(ctx context.Context, clientPublicKey string) error { + cmd := fmt.Sprintf("sudo wg set wg0 peer '%s' remove && sudo wg-quick save wg0", clientPublicKey) + _, _ = m.ssh.Execute(ctx, cmd) + return nil +} + +// ResolveDNS resolves a hostname through the Gateway +func (m *VPNServerManager) ResolveDNS(ctx context.Context, hostname string) (string, error) { + cmd := fmt.Sprintf("nslookup %s | grep -A1 'Name:' | grep 'Address:' | awk '{print $2}'", hostname) + output, err := m.ssh.Execute(ctx, cmd) + if err != nil || output == "" { + return "", fmt.Errorf("failed to resolve %s through Gateway", hostname) + } + return strings.TrimSpace(output), nil +} + +// InstallVPNTools installs VPN tools locally +func InstallVPNTools(ctx context.Context, logger *Logger) error { + if CommandExists("wg") { + return nil + } + if _, err := RunCommand(ctx, "apt-get", "update"); err != nil { + return err + } + _, err := RunCommand(ctx, "apt-get", "install", "-y", "wireguard-tools") + return err +} + +// InstallJQ installs jq locally +func InstallJQ(ctx context.Context, logger *Logger) error { + if CommandExists("jq") { + return nil + } + _, err := RunCommand(ctx, "apt-get", "install", "-y", "jq") + return err +} From 47f88351e4172a60c91c00a5fef1f4274c6dcb88 Mon Sep 17 00:00:00 2001 From: weiliu2 Date: 
Sat, 7 Feb 2026 12:10:20 +1300 Subject: [PATCH 03/11] fix lint scan --- pkg/privatecluster/utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/privatecluster/utils.go b/pkg/privatecluster/utils.go index a88e599..43f2f08 100644 --- a/pkg/privatecluster/utils.go +++ b/pkg/privatecluster/utils.go @@ -156,7 +156,7 @@ func AddHostsEntry(ip, hostname string) error { return nil // Entry already exists } - f, err := os.OpenFile(hostsPath, os.O_APPEND|os.O_WRONLY, 0644) // #nosec G302,G304 -- /etc/hosts requires 0644 + f, err := os.OpenFile(hostsPath, os.O_APPEND|os.O_WRONLY, 0600) // #nosec G304 -- hostsPath is validated if err != nil { return fmt.Errorf("failed to open hosts file: %w", err) } From 2e797d997c8e715e159e268f175753422e53f688 Mon Sep 17 00:00:00 2001 From: weiliu2 Date: Sat, 7 Feb 2026 12:25:10 +1300 Subject: [PATCH 04/11] gofmt --- commands.go | 1 - pkg/privatecluster/privatecluster_test.go | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/commands.go b/commands.go index b44338a..8be4f89 100644 --- a/commands.go +++ b/commands.go @@ -191,7 +191,6 @@ func runVersion() { fmt.Printf("Build Time: %s\n", BuildTime) } - // runDaemonLoop runs the periodic status collection and bootstrap monitoring daemon func runDaemonLoop(ctx context.Context, cfg *config.Config) error { logger := logger.GetLoggerFromContext(ctx) diff --git a/pkg/privatecluster/privatecluster_test.go b/pkg/privatecluster/privatecluster_test.go index dcd47fa..9fa36db 100644 --- a/pkg/privatecluster/privatecluster_test.go +++ b/pkg/privatecluster/privatecluster_test.go @@ -6,12 +6,12 @@ import ( func TestParseResourceID(t *testing.T) { tests := []struct { - name string - resourceID string - wantSubID string - wantRG string - wantName string - wantErr bool + name string + resourceID string + wantSubID string + wantRG string + wantName string + wantErr bool }{ { name: "valid resource ID", From a66fa879dece5e589dbaa1c2c0b52a835408798c Mon Sep 17 00:00:00 2001 From: weiliu2 Date: Tue, 10 Feb 2026 19:47:51 +1300 Subject: [PATCH 05/11] comments1 --- commands.go | 10 +- go.mod | 3 + go.sum | 6 + pkg/components/arc/arc_installer.go | 31 +- pkg/privatecluster/azure_client.go | 638 ++++++++++++++++++++++ pkg/privatecluster/installer.go | 90 +-- pkg/privatecluster/privatecluster_test.go | 16 +- pkg/privatecluster/scripts.go | 26 +- pkg/privatecluster/tool_installer.go | 45 ++ pkg/privatecluster/uninstaller.go | 75 ++- 10 files changed, 838 insertions(+), 102 deletions(-) create mode 100644 pkg/privatecluster/azure_client.go create mode 100644 pkg/privatecluster/tool_installer.go diff --git a/commands.go b/commands.go index 8be4f89..65cd453 100644 --- a/commands.go +++ b/commands.go @@ -8,6 +8,7 @@ import ( "path/filepath" "time" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" "github.com/sirupsen/logrus" "github.com/spf13/cobra" @@ -156,7 +157,14 @@ func runUnbootstrap(ctx context.Context) error { Mode: mode, AKSResourceID: cfg.Azure.TargetCluster.ResourceID, } - uninstaller := privatecluster.NewUninstaller(options) + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to create Azure CLI credential: %w", err) + } + uninstaller, err := privatecluster.NewUninstaller(options, cred) + if err != nil { + return fmt.Errorf("failed to create private cluster uninstaller: %w", err) + } if err := uninstaller.Uninstall(ctx); err != nil { logger.Warnf("Private cluster cleanup had errors: %v", err) // Continue with normal unbootstrap even if 
private cleanup has issues diff --git a/go.mod b/go.mod index 6cecbc6..2e3b2ba 100644 --- a/go.mod +++ b/go.mod @@ -20,6 +20,9 @@ require ( require ( github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6 v6.4.0 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0 // indirect github.com/Azure/go-autorest v14.2.0+incompatible // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect diff --git a/go.sum b/go.sum index 2a8019e..04064c0 100644 --- a/go.sum +++ b/go.sum @@ -8,14 +8,20 @@ github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDo github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v3 v3.0.0-beta.2 h1:qiir/pptnHqp6hV8QwV+IExYIf6cPsXBfUDUXQ27t2Y= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v3 v3.0.0-beta.2/go.mod h1:jVRrRDLCOuif95HDYC23ADTMlvahB7tMdl519m9Iyjc= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6 v6.4.0 h1:z7Mqz6l0EFH549GvHEqfjKvi+cRScxLWbaoeLm9wxVQ= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6 v6.4.0/go.mod h1:v6gbfH+7DG7xH2kUNs+ZJ9tF6O3iNnR85wMtmr+F54o= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v5 v5.0.0 h1:5n7dPVqsWfVKw+ZiEKSd3Kzu7gwBkbEBkeXb8rgaE9Q= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v5 v5.0.0/go.mod h1:HcZY0PHPo/7d75p99lB6lK0qYOP4vLRJUBpiehYXtLQ= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute v1.2.0 h1:7UuAn4ljE+H3GQ7qts3c7oAaMRvge68EgyckoNP/1Ro= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute v1.2.0/go.mod h1:F2eDq/BGK2LOEoDtoHbBOphaPqcjT0K/Y5Am8vf7+0w= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0 h1:PTFGRSlMKCQelWwxUyYVEUqseBJVemLyqWJjvMyt0do= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0/go.mod h1:LRr2FzBTQlONPPa5HREE5+RjSCTXl7BwOvYOaWTqCaI= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0 h1:HYGD75g0bQ3VO/Omedm54v4LrD3B1cGImuRF3AJ5wLo= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0/go.mod h1:ulHyBFJOI0ONiRL4vcJTmS7rx18jQQlEPmAgo80cRdM= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1 h1:7CBQ+Ei8SP2c6ydQTGCCrS35bDxgTMfoP2miAwK++OU= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1/go.mod h1:c/wcGeGx5FUPbM/JltUYHZcKmigwyVLJlDq+4HdtXaw= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0 h1:wxQx2Bt4xzPIKvW59WQf1tJNx/ZZKPfN+EhPX3Z6CYY= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0/go.mod h1:TpiwjwnW/khS0LKs4vW5UmmT9OWcxaveS8U7+tlknzo= github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= github.com/Azure/go-autorest v14.2.0+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24= github.com/Azure/go-autorest/autorest/to v0.4.1 
h1:CxNHBqdzTr7rLtdrtb5CMjJcDut+WNGCVv7OmS5+lTc= diff --git a/pkg/components/arc/arc_installer.go b/pkg/components/arc/arc_installer.go index f92deb1..51c7559 100644 --- a/pkg/components/arc/arc_installer.go +++ b/pkg/components/arc/arc_installer.go @@ -118,8 +118,7 @@ func (i *Installer) setupArcPermissions() error { func (i *Installer) downloadArcInstallScript(ctx context.Context, destPath string) error { // Try curl first if _, err := exec.LookPath("curl"); err == nil { - cmd := exec.CommandContext(ctx, "curl", "-L", "-o", destPath, arcInstallScriptURL) - if err := cmd.Run(); err != nil { + if _, err := utils.RunCommandWithOutput("curl", "-L", "-o", destPath, arcInstallScriptURL); err != nil { return fmt.Errorf("curl download failed: %w", err) } return nil @@ -127,8 +126,7 @@ func (i *Installer) downloadArcInstallScript(ctx context.Context, destPath strin // Try wget as fallback if _, err := exec.LookPath("wget"); err == nil { - cmd := exec.CommandContext(ctx, "wget", "-O", destPath, arcInstallScriptURL) - if err := cmd.Run(); err != nil { + if _, err := utils.RunCommandWithOutput("wget", "-O", destPath, arcInstallScriptURL); err != nil { return fmt.Errorf("wget download failed: %w", err) } return nil @@ -198,19 +196,15 @@ func (i *Installer) IsCompleted(ctx context.Context) bool { return false } - // Use same approach as status collector - check azcmagent show with timeout - timeoutCtx, cancel := context.WithTimeout(ctx, 10*time.Second) - defer cancel() - - cmd := exec.CommandContext(timeoutCtx, "azcmagent", "show") - output, err := cmd.Output() + // Use same approach as status collector - check azcmagent show + output, err := utils.RunCommandWithOutput("azcmagent", "show") if err != nil { i.logger.Debugf("azcmagent show failed: %v - Arc not ready", err) return false } // Parse output to check if agent is connected (same logic as status collector) - lines := strings.Split(strings.TrimSpace(string(output)), "\n") + lines := strings.Split(strings.TrimSpace(output), "\n") for _, line := range lines { line = strings.TrimSpace(line) if strings.Contains(line, "Agent Status") && strings.Contains(line, ":") { @@ -248,9 +242,8 @@ func (i *Installer) registerArcMachine(ctx context.Context) (*armhybridcompute.M // Step 1: Clean up local agent state i.logger.Info("Cleaning up local agent state...") - disconnectCmd := exec.CommandContext(ctx, "azcmagent", "disconnect", "--force-local-only") - if output, err := disconnectCmd.CombinedOutput(); err != nil { - i.logger.Warnf("Local disconnect had issues (continuing): %v, output: %s", err, string(output)) + if output, err := utils.RunCommandWithOutput("azcmagent", "disconnect", "--force-local-only"); err != nil { + i.logger.Warnf("Local disconnect had issues (continuing): %v, output: %s", err, output) } // Step 2: Delete the stale Azure Arc resource so connect can recreate it @@ -275,18 +268,14 @@ func (i *Installer) registerArcMachine(ctx context.Context) (*armhybridcompute.M } // isLocalAgentConnected checks if the local Arc agent is connected -func (i *Installer) isLocalAgentConnected(ctx context.Context) bool { - timeoutCtx, cancel := context.WithTimeout(ctx, 10*time.Second) - defer cancel() - - cmd := exec.CommandContext(timeoutCtx, "azcmagent", "show") - output, err := cmd.Output() +func (i *Installer) isLocalAgentConnected(_ context.Context) bool { + output, err := utils.RunCommandWithOutput("azcmagent", "show") if err != nil { i.logger.Debugf("azcmagent show failed: %v", err) return false } - for _, line := range strings.Split(string(output), 
"\n") { + for _, line := range strings.Split(output, "\n") { if strings.Contains(line, "Agent Status") && strings.Contains(line, ":") { parts := strings.SplitN(line, ":", 2) if len(parts) == 2 { diff --git a/pkg/privatecluster/azure_client.go b/pkg/privatecluster/azure_client.go new file mode 100644 index 0000000..17b3a9f --- /dev/null +++ b/pkg/privatecluster/azure_client.go @@ -0,0 +1,638 @@ +package privatecluster + +import ( + "context" + "errors" + "fmt" + "net/http" + "os" + "path/filepath" + "strings" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v5" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions" +) + +// AzureClient provides Azure operations using the Azure SDK for Go (Track 2). +type AzureClient struct { + logger *Logger + subscriptionID string + + vmClient *armcompute.VirtualMachinesClient + diskClient *armcompute.DisksClient + vmssClient *armcompute.VirtualMachineScaleSetsClient + subnetClient *armnetwork.SubnetsClient + nsgClient *armnetwork.SecurityGroupsClient + pipClient *armnetwork.PublicIPAddressesClient + nicClient *armnetwork.InterfacesClient + aksClient *armcontainerservice.ManagedClustersClient + arcClient *armhybridcompute.MachinesClient + subscriptionClient *armsubscriptions.Client +} + +// NewAzureClient creates a new AzureClient with all sub-clients initialized. +func NewAzureClient(cred azcore.TokenCredential, subscriptionID string, logger *Logger) (*AzureClient, error) { + c := &AzureClient{ + logger: logger, + subscriptionID: subscriptionID, + } + + var err error + + if c.vmClient, err = armcompute.NewVirtualMachinesClient(subscriptionID, cred, nil); err != nil { + return nil, fmt.Errorf("failed to create VM client: %w", err) + } + if c.diskClient, err = armcompute.NewDisksClient(subscriptionID, cred, nil); err != nil { + return nil, fmt.Errorf("failed to create disk client: %w", err) + } + if c.vmssClient, err = armcompute.NewVirtualMachineScaleSetsClient(subscriptionID, cred, nil); err != nil { + return nil, fmt.Errorf("failed to create VMSS client: %w", err) + } + if c.subnetClient, err = armnetwork.NewSubnetsClient(subscriptionID, cred, nil); err != nil { + return nil, fmt.Errorf("failed to create subnet client: %w", err) + } + if c.nsgClient, err = armnetwork.NewSecurityGroupsClient(subscriptionID, cred, nil); err != nil { + return nil, fmt.Errorf("failed to create NSG client: %w", err) + } + if c.pipClient, err = armnetwork.NewPublicIPAddressesClient(subscriptionID, cred, nil); err != nil { + return nil, fmt.Errorf("failed to create public IP client: %w", err) + } + if c.nicClient, err = armnetwork.NewInterfacesClient(subscriptionID, cred, nil); err != nil { + return nil, fmt.Errorf("failed to create NIC client: %w", err) + } + if c.aksClient, err = armcontainerservice.NewManagedClustersClient(subscriptionID, cred, nil); err != nil { + return nil, fmt.Errorf("failed to create AKS client: %w", err) + } + if c.arcClient, err = armhybridcompute.NewMachinesClient(subscriptionID, cred, nil); err != nil { + return nil, fmt.Errorf("failed to create Arc client: %w", err) + } + if c.subscriptionClient, err = armsubscriptions.NewClient(cred, nil); err != nil { + return nil, 
fmt.Errorf("failed to create subscription client: %w", err) + } + + return c, nil +} + +// GetTenantID returns the tenant ID for the configured subscription. +func (c *AzureClient) GetTenantID(ctx context.Context) (string, error) { + resp, err := c.subscriptionClient.Get(ctx, c.subscriptionID, nil) + if err != nil { + return "", fmt.Errorf("failed to get subscription info: %w", err) + } + if resp.TenantID == nil { + return "", fmt.Errorf("tenant ID not found for subscription %s", c.subscriptionID) + } + return *resp.TenantID, nil +} + +// AKSClusterExists checks if an AKS cluster exists. +func (c *AzureClient) AKSClusterExists(ctx context.Context, resourceGroup, clusterName string) bool { + _, err := c.aksClient.Get(ctx, resourceGroup, clusterName, nil) + return err == nil +} + +// GetAKSClusterInfo retrieves AKS cluster information in a single API call. +func (c *AzureClient) GetAKSClusterInfo(ctx context.Context, resourceGroup, clusterName string) (*AKSClusterInfo, error) { + resp, err := c.aksClient.Get(ctx, resourceGroup, clusterName, nil) + if err != nil { + return nil, fmt.Errorf("failed to get AKS cluster: %w", err) + } + + cluster := resp.ManagedCluster + props := cluster.Properties + if props == nil { + return nil, fmt.Errorf("AKS cluster properties are nil") + } + + // Check AAD enabled + if props.AADProfile == nil || props.AADProfile.Managed == nil || !*props.AADProfile.Managed { + return nil, fmt.Errorf("AKS cluster AAD not enabled, please enable: az aks update --enable-aad") + } + + // Check Azure RBAC enabled + if props.AADProfile.EnableAzureRBAC == nil || !*props.AADProfile.EnableAzureRBAC { + return nil, fmt.Errorf("AKS cluster Azure RBAC not enabled, please enable: az aks update --enable-azure-rbac") + } + + info := &AKSClusterInfo{ + ResourceGroup: resourceGroup, + ClusterName: clusterName, + } + + if cluster.Location != nil { + info.Location = *cluster.Location + } + if props.NodeResourceGroup != nil { + info.NodeResourceGroup = *props.NodeResourceGroup + } + if props.PrivateFQDN != nil { + info.PrivateFQDN = *props.PrivateFQDN + } + + return info, nil +} + +// GetVNetInfo retrieves VNet information from AKS VMSS in the node resource group. +func (c *AzureClient) GetVNetInfo(ctx context.Context, nodeResourceGroup string) (vnetName, vnetRG string, err error) { + pager := c.vmssClient.NewListPager(nodeResourceGroup, nil) + for pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + return "", "", fmt.Errorf("failed to list VMSS: %w", err) + } + for _, vmss := range page.Value { + subnetID := extractSubnetIDFromVMSS(vmss) + if subnetID == "" { + continue + } + // Format: /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.Network/virtualNetworks/{vnet}/subnets/{subnet} + parts := strings.Split(subnetID, "/") + if len(parts) < 9 { + return "", "", fmt.Errorf("invalid subnet ID format: %s", subnetID) + } + return parts[8], parts[4], nil + } + break // only need first page + } + return "", "", fmt.Errorf("cannot find AKS node VMSS in %s", nodeResourceGroup) +} + +// VMExists checks if a VM exists. +func (c *AzureClient) VMExists(ctx context.Context, resourceGroup, vmName string) bool { + _, err := c.vmClient.Get(ctx, resourceGroup, vmName, nil) + return err == nil +} + +// GetVMPublicIP retrieves a VM's public IP address by tracing VM → NIC → PIP. 
+func (c *AzureClient) GetVMPublicIP(ctx context.Context, resourceGroup, vmName string) (string, error) { + vmResp, err := c.vmClient.Get(ctx, resourceGroup, vmName, nil) + if err != nil { + return "", fmt.Errorf("failed to get VM: %w", err) + } + if vmResp.Properties == nil || vmResp.Properties.NetworkProfile == nil || + len(vmResp.Properties.NetworkProfile.NetworkInterfaces) == 0 { + return "", fmt.Errorf("VM has no network interfaces") + } + + nicID := vmResp.Properties.NetworkProfile.NetworkInterfaces[0].ID + if nicID == nil { + return "", fmt.Errorf("NIC ID is nil") + } + nicRG, nicName := parseResourceGroupAndName(*nicID) + + nicResp, err := c.nicClient.Get(ctx, nicRG, nicName, nil) + if err != nil { + return "", fmt.Errorf("failed to get NIC: %w", err) + } + if nicResp.Properties == nil || len(nicResp.Properties.IPConfigurations) == 0 { + return "", fmt.Errorf("NIC has no IP configurations") + } + + ipConfig := nicResp.Properties.IPConfigurations[0] + if ipConfig.Properties == nil || ipConfig.Properties.PublicIPAddress == nil || ipConfig.Properties.PublicIPAddress.ID == nil { + return "", fmt.Errorf("NIC has no public IP") + } + pipRG, pipName := parseResourceGroupAndName(*ipConfig.Properties.PublicIPAddress.ID) + + pipResp, err := c.pipClient.Get(ctx, pipRG, pipName, nil) + if err != nil { + return "", fmt.Errorf("failed to get public IP: %w", err) + } + if pipResp.Properties == nil || pipResp.Properties.IPAddress == nil { + return "", fmt.Errorf("public IP address is not allocated") + } + return *pipResp.Properties.IPAddress, nil +} + +// CreateSubnet creates a subnet in a VNet. +func (c *AzureClient) CreateSubnet(ctx context.Context, vnetRG, vnetName, subnetName, addressPrefix string) error { + _, err := c.subnetClient.Get(ctx, vnetRG, vnetName, subnetName, nil) + if err == nil { + c.logger.Info("Subnet %s already exists", subnetName) + return nil + } + + poller, err := c.subnetClient.BeginCreateOrUpdate(ctx, vnetRG, vnetName, subnetName, armnetwork.Subnet{ + Properties: &armnetwork.SubnetPropertiesFormat{ + AddressPrefix: ptr(addressPrefix), + }, + }, nil) + if err != nil { + return fmt.Errorf("failed to create subnet: %w", err) + } + if _, err = poller.PollUntilDone(ctx, nil); err != nil { + return fmt.Errorf("failed to create subnet: %w", err) + } + return nil +} + +// CreateNSG creates a network security group with SSH and VPN rules. 
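+//
+// Sketch of the expected call (names are placeholders; 51820 matches the WireGuard
+// ListenPort configured on the Gateway by this patch, but the actual value should come
+// from the Gateway options):
+//
+//	if err := c.CreateNSG(ctx, "my-aks-rg", "wg-gateway-nsg", "eastus", 51820); err != nil {
+//		return err
+//	}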
+func (c *AzureClient) CreateNSG(ctx context.Context, resourceGroup, nsgName, location string, vpnPort int) error { + _, err := c.nsgClient.Get(ctx, resourceGroup, nsgName, nil) + if err == nil { + c.logger.Info("NSG %s already exists", nsgName) + return nil + } + + nsg := armnetwork.SecurityGroup{ + Location: ptr(location), + Properties: &armnetwork.SecurityGroupPropertiesFormat{ + SecurityRules: []*armnetwork.SecurityRule{ + { + Name: ptr("allow-ssh"), + Properties: &armnetwork.SecurityRulePropertiesFormat{ + Priority: ptr[int32](100), + Protocol: ptr(armnetwork.SecurityRuleProtocolTCP), + Access: ptr(armnetwork.SecurityRuleAccessAllow), + Direction: ptr(armnetwork.SecurityRuleDirectionInbound), + SourceAddressPrefix: ptr("*"), + SourcePortRange: ptr("*"), + DestinationAddressPrefix: ptr("*"), + DestinationPortRanges: []*string{ptr("22")}, + }, + }, + { + Name: ptr("allow-vpn"), + Properties: &armnetwork.SecurityRulePropertiesFormat{ + Priority: ptr[int32](200), + Protocol: ptr(armnetwork.SecurityRuleProtocolUDP), + Access: ptr(armnetwork.SecurityRuleAccessAllow), + Direction: ptr(armnetwork.SecurityRuleDirectionInbound), + SourceAddressPrefix: ptr("*"), + SourcePortRange: ptr("*"), + DestinationAddressPrefix: ptr("*"), + DestinationPortRanges: []*string{ptr(fmt.Sprintf("%d", vpnPort))}, + }, + }, + }, + }, + } + + poller, err := c.nsgClient.BeginCreateOrUpdate(ctx, resourceGroup, nsgName, nsg, nil) + if err != nil { + return fmt.Errorf("failed to create NSG: %w", err) + } + if _, err = poller.PollUntilDone(ctx, nil); err != nil { + return fmt.Errorf("failed to create NSG: %w", err) + } + return nil +} + +// CreatePublicIP creates a static public IP address. +func (c *AzureClient) CreatePublicIP(ctx context.Context, resourceGroup, pipName, location string) error { + _, err := c.pipClient.Get(ctx, resourceGroup, pipName, nil) + if err == nil { + c.logger.Info("Public IP %s already exists", pipName) + return nil + } + + poller, err := c.pipClient.BeginCreateOrUpdate(ctx, resourceGroup, pipName, armnetwork.PublicIPAddress{ + Location: ptr(location), + SKU: &armnetwork.PublicIPAddressSKU{ + Name: ptr(armnetwork.PublicIPAddressSKUNameStandard), + }, + Properties: &armnetwork.PublicIPAddressPropertiesFormat{ + PublicIPAllocationMethod: ptr(armnetwork.IPAllocationMethodStatic), + }, + }, nil) + if err != nil { + return fmt.Errorf("failed to create public IP: %w", err) + } + if _, err = poller.PollUntilDone(ctx, nil); err != nil { + return fmt.Errorf("failed to create public IP: %w", err) + } + return nil +} + +// GetPublicIPAddress retrieves a public IP address value. +func (c *AzureClient) GetPublicIPAddress(ctx context.Context, resourceGroup, pipName string) (string, error) { + resp, err := c.pipClient.Get(ctx, resourceGroup, pipName, nil) + if err != nil { + return "", fmt.Errorf("failed to get public IP: %w", err) + } + if resp.Properties == nil || resp.Properties.IPAddress == nil { + return "", fmt.Errorf("public IP address is not allocated") + } + return *resp.Properties.IPAddress, nil +} + +// CreateVM creates a VM with the specified configuration. +// It first creates a NIC, then creates the VM referencing that NIC. 
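+//
+// Illustrative call shape (a sketch; the resource names and VM size are placeholders,
+// while "wg-gateway" and "wg-subnet" mirror the defaults used elsewhere in this package):
+//
+//	err := c.CreateVM(ctx, "my-aks-rg", "wg-gateway", "eastus",
+//		"my-vnet-rg", "aks-vnet", "wg-subnet", "wg-gateway-nsg", "wg-gateway-pip",
+//		GetSSHKeyPath(), "Standard_B2s")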
+func (c *AzureClient) CreateVM(ctx context.Context, resourceGroup, vmName, location, vnetRG, vnetName, subnetName, nsgName, pipName, sshKeyPath, vmSize string) error { + // Read SSH public key + pubKeyData, err := ReadFileContent(sshKeyPath + ".pub") + if err != nil { + return fmt.Errorf("failed to read SSH public key: %w", err) + } + pubKey := strings.TrimSpace(pubKeyData) + + // Build resource IDs + subnetID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/%s", + c.subscriptionID, vnetRG, vnetName, subnetName) + nsgID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/securityGroups/%s", + c.subscriptionID, resourceGroup, nsgName) + pipID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/publicIPAddresses/%s", + c.subscriptionID, resourceGroup, pipName) + + // Create NIC + nicName := vmName + "VMNic" + nicPoller, err := c.nicClient.BeginCreateOrUpdate(ctx, resourceGroup, nicName, armnetwork.Interface{ + Location: ptr(location), + Properties: &armnetwork.InterfacePropertiesFormat{ + NetworkSecurityGroup: &armnetwork.SecurityGroup{ + ID: ptr(nsgID), + }, + IPConfigurations: []*armnetwork.InterfaceIPConfiguration{ + { + Name: ptr("ipconfig1"), + Properties: &armnetwork.InterfaceIPConfigurationPropertiesFormat{ + Subnet: &armnetwork.Subnet{ + ID: ptr(subnetID), + }, + PublicIPAddress: &armnetwork.PublicIPAddress{ + ID: ptr(pipID), + }, + PrivateIPAllocationMethod: ptr(armnetwork.IPAllocationMethodDynamic), + }, + }, + }, + }, + }, nil) + if err != nil { + return fmt.Errorf("failed to create NIC: %w", err) + } + nicResp, err := nicPoller.PollUntilDone(ctx, nil) + if err != nil { + return fmt.Errorf("failed to create NIC: %w", err) + } + + // Create VM + vm := armcompute.VirtualMachine{ + Location: ptr(location), + Zones: []*string{ptr("1")}, + Properties: &armcompute.VirtualMachineProperties{ + HardwareProfile: &armcompute.HardwareProfile{ + VMSize: ptr(armcompute.VirtualMachineSizeTypes(vmSize)), + }, + StorageProfile: &armcompute.StorageProfile{ + ImageReference: &armcompute.ImageReference{ + Publisher: ptr("Canonical"), + Offer: ptr("0001-com-ubuntu-server-jammy"), + SKU: ptr("22_04-lts-gen2"), + Version: ptr("latest"), + }, + OSDisk: &armcompute.OSDisk{ + CreateOption: ptr(armcompute.DiskCreateOptionTypesFromImage), + ManagedDisk: &armcompute.ManagedDiskParameters{ + StorageAccountType: ptr(armcompute.StorageAccountTypesPremiumLRS), + }, + }, + }, + OSProfile: &armcompute.OSProfile{ + ComputerName: ptr(vmName), + AdminUsername: ptr("azureuser"), + LinuxConfiguration: &armcompute.LinuxConfiguration{ + DisablePasswordAuthentication: ptr(true), + SSH: &armcompute.SSHConfiguration{ + PublicKeys: []*armcompute.SSHPublicKey{ + { + Path: ptr("/home/azureuser/.ssh/authorized_keys"), + KeyData: ptr(pubKey), + }, + }, + }, + }, + }, + NetworkProfile: &armcompute.NetworkProfile{ + NetworkInterfaces: []*armcompute.NetworkInterfaceReference{ + { + ID: nicResp.ID, + Properties: &armcompute.NetworkInterfaceReferenceProperties{ + Primary: ptr(true), + }, + }, + }, + }, + }, + } + + vmPoller, err := c.vmClient.BeginCreateOrUpdate(ctx, resourceGroup, vmName, vm, nil) + if err != nil { + return fmt.Errorf("failed to create VM: %w", err) + } + if _, err = vmPoller.PollUntilDone(ctx, nil); err != nil { + return fmt.Errorf("failed to create VM: %w", err) + } + return nil +} + +// AddSSHKeyToVM adds an SSH key to a VM using RunCommand. 
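+//
+// Sketch (the resource group is a placeholder; the key path matches GetSSHKeyPath in this package):
+//
+//	if err := c.AddSSHKeyToVM(ctx, "my-aks-rg", "wg-gateway", GetSSHKeyPath()); err != nil {
+//		return err
+//	}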
+func (c *AzureClient) AddSSHKeyToVM(ctx context.Context, resourceGroup, vmName, sshKeyPath string) error { + pubKey, err := ReadFileContent(sshKeyPath + ".pub") + if err != nil { + return fmt.Errorf("failed to read SSH public key: %w", err) + } + + script := fmt.Sprintf( + "mkdir -p /home/azureuser/.ssh && echo '%s' >> /home/azureuser/.ssh/authorized_keys && "+ + "sort -u -o /home/azureuser/.ssh/authorized_keys /home/azureuser/.ssh/authorized_keys && "+ + "chown -R azureuser:azureuser /home/azureuser/.ssh && "+ + "chmod 700 /home/azureuser/.ssh && chmod 600 /home/azureuser/.ssh/authorized_keys", + strings.TrimSpace(pubKey)) + + poller, err := c.vmClient.BeginRunCommand(ctx, resourceGroup, vmName, armcompute.RunCommandInput{ + CommandID: ptr("RunShellScript"), + Script: []*string{ptr(script)}, + }, nil) + if err != nil { + return fmt.Errorf("failed to run SSH key command: %w", err) + } + if _, err = poller.PollUntilDone(ctx, nil); err != nil { + return fmt.Errorf("failed to add SSH key to VM: %w", err) + } + return nil +} + +// RestartVM restarts a VM. +func (c *AzureClient) RestartVM(ctx context.Context, resourceGroup, vmName string) error { + poller, err := c.vmClient.BeginRestart(ctx, resourceGroup, vmName, nil) + if err != nil { + return fmt.Errorf("failed to restart VM: %w", err) + } + if _, err = poller.PollUntilDone(ctx, nil); err != nil { + return fmt.Errorf("failed to restart VM: %w", err) + } + return nil +} + +// DeleteVM deletes a VM if it exists. +func (c *AzureClient) DeleteVM(ctx context.Context, resourceGroup, vmName string) error { + if !c.VMExists(ctx, resourceGroup, vmName) { + return nil + } + forceDeletion := true + poller, err := c.vmClient.BeginDelete(ctx, resourceGroup, vmName, &armcompute.VirtualMachinesClientBeginDeleteOptions{ + ForceDeletion: &forceDeletion, + }) + if err != nil { + if isNotFoundError(err) { + return nil + } + return fmt.Errorf("failed to delete VM: %w", err) + } + if _, err = poller.PollUntilDone(ctx, nil); err != nil { + return fmt.Errorf("failed to delete VM: %w", err) + } + return nil +} + +// DeleteNIC deletes a network interface if it exists. +func (c *AzureClient) DeleteNIC(ctx context.Context, resourceGroup, nicName string) error { + poller, err := c.nicClient.BeginDelete(ctx, resourceGroup, nicName, nil) + if err != nil { + if isNotFoundError(err) { + return nil + } + return fmt.Errorf("failed to delete NIC: %w", err) + } + if _, err = poller.PollUntilDone(ctx, nil); err != nil { + return fmt.Errorf("failed to delete NIC: %w", err) + } + return nil +} + +// DeletePublicIP deletes a public IP address if it exists. +func (c *AzureClient) DeletePublicIP(ctx context.Context, resourceGroup, pipName string) error { + poller, err := c.pipClient.BeginDelete(ctx, resourceGroup, pipName, nil) + if err != nil { + if isNotFoundError(err) { + return nil + } + return fmt.Errorf("failed to delete public IP: %w", err) + } + if _, err = poller.PollUntilDone(ctx, nil); err != nil { + return fmt.Errorf("failed to delete public IP: %w", err) + } + return nil +} + +// DeleteNSG deletes a network security group if it exists. 
+func (c *AzureClient) DeleteNSG(ctx context.Context, resourceGroup, nsgName string) error { + poller, err := c.nsgClient.BeginDelete(ctx, resourceGroup, nsgName, nil) + if err != nil { + if isNotFoundError(err) { + return nil + } + return fmt.Errorf("failed to delete NSG: %w", err) + } + if _, err = poller.PollUntilDone(ctx, nil); err != nil { + return fmt.Errorf("failed to delete NSG: %w", err) + } + return nil +} + +// DeleteSubnet deletes a subnet (errors are ignored). +func (c *AzureClient) DeleteSubnet(ctx context.Context, vnetRG, vnetName, subnetName string) error { + poller, err := c.subnetClient.BeginDelete(ctx, vnetRG, vnetName, subnetName, nil) + if err != nil { + return nil // Ignore errors + } + _, _ = poller.PollUntilDone(ctx, nil) + return nil +} + +// DeleteDisks deletes disks matching a name pattern. +func (c *AzureClient) DeleteDisks(ctx context.Context, resourceGroup, pattern string) error { + pager := c.diskClient.NewListByResourceGroupPager(resourceGroup, nil) + for pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + return nil // Ignore errors + } + for _, disk := range page.Value { + if disk.Name == nil || !strings.Contains(*disk.Name, pattern) { + continue + } + poller, err := c.diskClient.BeginDelete(ctx, resourceGroup, *disk.Name, nil) + if err != nil { + continue + } + _, _ = poller.PollUntilDone(ctx, nil) + } + } + return nil +} + +// DeleteConnectedMachine deletes an Arc connected machine (errors are ignored). +func (c *AzureClient) DeleteConnectedMachine(ctx context.Context, resourceGroup, machineName string) error { + _, _ = c.arcClient.Delete(ctx, resourceGroup, machineName, nil) + return nil +} + +// GetAKSCredentials gets AKS cluster credentials and writes the kubeconfig to the specified path. +func (c *AzureClient) GetAKSCredentials(ctx context.Context, resourceGroup, clusterName, kubeconfigPath string) error { + resp, err := c.aksClient.ListClusterUserCredentials(ctx, resourceGroup, clusterName, nil) + if err != nil { + return fmt.Errorf("failed to get AKS credentials: %w", err) + } + if len(resp.Kubeconfigs) == 0 || resp.Kubeconfigs[0].Value == nil { + return fmt.Errorf("no kubeconfig returned for cluster %s", clusterName) + } + + if err := EnsureDirectory(filepath.Dir(kubeconfigPath)); err != nil { + return fmt.Errorf("failed to create kubeconfig directory: %w", err) + } + if err := os.WriteFile(kubeconfigPath, resp.Kubeconfigs[0].Value, 0600); err != nil { + return fmt.Errorf("failed to write kubeconfig: %w", err) + } + return nil +} + +// --- Helper functions --- + +// ptr returns a pointer to the given value. +func ptr[T any](v T) *T { + return &v +} + +// isNotFoundError checks if an error is a 404 Not Found response. +func isNotFoundError(err error) bool { + var respErr *azcore.ResponseError + return errors.As(err, &respErr) && respErr.StatusCode == http.StatusNotFound +} + +// parseResourceGroupAndName extracts resource group and resource name from an Azure resource ID. +func parseResourceGroupAndName(resourceID string) (resourceGroup, name string) { + parts := strings.Split(resourceID, "/") + for i, part := range parts { + if strings.EqualFold(part, "resourceGroups") && i+1 < len(parts) { + resourceGroup = parts[i+1] + } + } + if len(parts) > 0 { + name = parts[len(parts)-1] + } + return +} + +// extractSubnetIDFromVMSS extracts the subnet ID from a VMSS's network profile. 
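+//
+// The returned ID has the shape that GetVNetInfo splits on "/", e.g. (illustrative values):
+//
+//	/subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.Network/virtualNetworks/<vnet>/subnets/<subnet>
+//
+// where parts[4] is the VNet resource group and parts[8] is the VNet name.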
+func extractSubnetIDFromVMSS(vmss *armcompute.VirtualMachineScaleSet) string { + if vmss.Properties == nil || + vmss.Properties.VirtualMachineProfile == nil || + vmss.Properties.VirtualMachineProfile.NetworkProfile == nil { + return "" + } + configs := vmss.Properties.VirtualMachineProfile.NetworkProfile.NetworkInterfaceConfigurations + if len(configs) == 0 || configs[0].Properties == nil { + return "" + } + ipConfigs := configs[0].Properties.IPConfigurations + if len(ipConfigs) == 0 || ipConfigs[0].Properties == nil || ipConfigs[0].Properties.Subnet == nil || ipConfigs[0].Properties.Subnet.ID == nil { + return "" + } + return *ipConfigs[0].Properties.Subnet.ID +} diff --git a/pkg/privatecluster/installer.go b/pkg/privatecluster/installer.go index f6c0e9c..0ab835c 100644 --- a/pkg/privatecluster/installer.go +++ b/pkg/privatecluster/installer.go @@ -4,13 +4,16 @@ import ( "context" "fmt" "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore" ) // Installer handles private cluster installation type Installer struct { - logger *Logger - azure *AzureCLI - options InstallOptions + logger *Logger + azureClient *AzureClient + toolInstaller *ToolInstaller + options InstallOptions // State collected during installation clusterInfo *AKSClusterInfo @@ -19,8 +22,9 @@ type Installer struct { gatewayIP string } -// NewInstaller creates a new Installer instance -func NewInstaller(options InstallOptions) *Installer { +// NewInstaller creates a new Installer instance. +// cred is the Azure credential used for SDK calls. +func NewInstaller(options InstallOptions, cred azcore.TokenCredential) (*Installer, error) { logger := NewLogger(options.Verbose) // Apply defaults @@ -28,13 +32,24 @@ func NewInstaller(options InstallOptions) *Installer { options.Gateway = DefaultGatewayConfig() } - return &Installer{ - logger: logger, - azure: NewAzureCLI(logger), - options: options, - vpnConfig: DefaultVPNConfig(), - sshKeyPath: GetSSHKeyPath(), + subscriptionID, _, _, err := ParseResourceID(options.AKSResourceID) + if err != nil { + return nil, fmt.Errorf("failed to parse resource ID: %w", err) } + + azureClient, err := NewAzureClient(cred, subscriptionID, logger) + if err != nil { + return nil, fmt.Errorf("failed to create Azure client: %w", err) + } + + return &Installer{ + logger: logger, + azureClient: azureClient, + toolInstaller: NewToolInstaller(logger), + options: options, + vpnConfig: DefaultVPNConfig(), + sshKeyPath: GetSSHKeyPath(), + }, nil } // Install runs the complete installation process @@ -84,36 +99,21 @@ func (i *Installer) Install(ctx context.Context) error { // phase1EnvironmentCheck checks prerequisites func (i *Installer) phase1EnvironmentCheck(ctx context.Context) error { _ = CleanKubeCache() - if err := i.azure.CheckInstalled(); err != nil { - return err - } - if err := i.azure.CheckLogin(ctx); err != nil { - return err - } - i.logger.Success("Azure CLI ready") - - // Check/refresh token - if err := i.azure.CheckAndRefreshToken(ctx); err != nil { - return err - } - - if err := i.azure.SetSubscription(ctx, i.clusterInfo.SubscriptionID); err != nil { - return err - } + i.logger.Success("Azure SDK client ready") i.logger.Success("Subscription: %s", i.clusterInfo.SubscriptionID) // Get Tenant ID - tenantID, err := i.azure.GetTenantID(ctx) + tenantID, err := i.azureClient.GetTenantID(ctx) if err != nil { return err } i.clusterInfo.TenantID = tenantID i.logger.Verbose("Tenant ID: %s", tenantID) - if !i.azure.AKSClusterExists(ctx, i.clusterInfo.ResourceGroup, i.clusterInfo.ClusterName) { + if 
!i.azureClient.AKSClusterExists(ctx, i.clusterInfo.ResourceGroup, i.clusterInfo.ClusterName) { return fmt.Errorf("AKS cluster '%s' not found", i.clusterInfo.ClusterName) } - clusterInfo, err := i.azure.GetAKSClusterInfo(ctx, i.clusterInfo.ResourceGroup, i.clusterInfo.ClusterName) + clusterInfo, err := i.azureClient.GetAKSClusterInfo(ctx, i.clusterInfo.ResourceGroup, i.clusterInfo.ClusterName) if err != nil { return err } @@ -122,7 +122,7 @@ func (i *Installer) phase1EnvironmentCheck(ctx context.Context) error { i.clusterInfo.PrivateFQDN = clusterInfo.PrivateFQDN i.logger.Success("AKS cluster: %s (AAD/RBAC enabled)", i.clusterInfo.ClusterName) - vnetName, vnetRG, err := i.azure.GetVNetInfo(ctx, i.clusterInfo.NodeResourceGroup) + vnetName, vnetRG, err := i.azureClient.GetVNetInfo(ctx, i.clusterInfo.NodeResourceGroup) if err != nil { return err } @@ -137,7 +137,7 @@ func (i *Installer) phase1EnvironmentCheck(ctx context.Context) error { return fmt.Errorf("failed to install jq: %w", err) } if !CommandExists("kubectl") || !CommandExists("kubelogin") { - if err := i.azure.InstallAKSCLI(ctx); err != nil { + if err := i.toolInstaller.InstallAKSCLI(ctx); err != nil { return fmt.Errorf("failed to install kubectl/kubelogin: %w", err) } } @@ -147,7 +147,7 @@ func (i *Installer) phase1EnvironmentCheck(ctx context.Context) error { if !CommandExists("kubelogin") { return fmt.Errorf("kubelogin installation failed") } - _ = i.azure.InstallConnectedMachineExtension(ctx) + _ = i.toolInstaller.InstallConnectedMachineExtension(ctx) i.logger.Success("Dependencies ready") return nil @@ -156,9 +156,9 @@ func (i *Installer) phase1EnvironmentCheck(ctx context.Context) error { // phase2GatewaySetup sets up the VPN Gateway func (i *Installer) phase2GatewaySetup(ctx context.Context) error { gatewayExists := false - if i.azure.VMExists(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) { + if i.azureClient.VMExists(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) { gatewayExists = true - ip, err := i.azure.GetVMPublicIP(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) + ip, err := i.azureClient.GetVMPublicIP(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) if err != nil { return fmt.Errorf("failed to get Gateway public IP: %w", err) } @@ -174,7 +174,7 @@ func (i *Installer) phase2GatewaySetup(ctx context.Context) error { if err := GenerateSSHKey(i.sshKeyPath); err != nil { return fmt.Errorf("failed to generate SSH key: %w", err) } - if err := i.azure.AddSSHKeyToVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name, i.sshKeyPath); err != nil { + if err := i.azureClient.AddSSHKeyToVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name, i.sshKeyPath); err != nil { return fmt.Errorf("failed to add SSH key to Gateway: %w", err) } @@ -195,27 +195,29 @@ func (i *Installer) phase2GatewaySetup(ctx context.Context) error { func (i *Installer) createGatewayInfrastructure(ctx context.Context) error { nsgName := i.options.Gateway.Name + "-nsg" pipName := i.options.Gateway.Name + "-pip" + location := i.clusterInfo.Location - if err := i.azure.CreateSubnet(ctx, i.clusterInfo.VNetResourceGroup, i.clusterInfo.VNetName, + if err := i.azureClient.CreateSubnet(ctx, i.clusterInfo.VNetResourceGroup, i.clusterInfo.VNetName, i.options.Gateway.SubnetName, i.options.Gateway.SubnetPrefix); err != nil { return fmt.Errorf("failed to create subnet: %w", err) } - if err := i.azure.CreateNSG(ctx, i.clusterInfo.ResourceGroup, nsgName, i.options.Gateway.Port); err != nil { + if err := 
i.azureClient.CreateNSG(ctx, i.clusterInfo.ResourceGroup, nsgName, location, i.options.Gateway.Port); err != nil { return fmt.Errorf("failed to create NSG: %w", err) } - if err := i.azure.CreatePublicIP(ctx, i.clusterInfo.ResourceGroup, pipName); err != nil { + if err := i.azureClient.CreatePublicIP(ctx, i.clusterInfo.ResourceGroup, pipName, location); err != nil { return fmt.Errorf("failed to create public IP: %w", err) } if err := GenerateSSHKey(i.sshKeyPath); err != nil { return fmt.Errorf("failed to generate SSH key: %w", err) } - if err := i.azure.CreateVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name, - i.clusterInfo.VNetName, i.options.Gateway.SubnetName, nsgName, pipName, + if err := i.azureClient.CreateVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name, + location, i.clusterInfo.VNetResourceGroup, i.clusterInfo.VNetName, + i.options.Gateway.SubnetName, nsgName, pipName, i.sshKeyPath, i.options.Gateway.VMSize); err != nil { return fmt.Errorf("failed to create Gateway VM: %w", err) } - ip, err := i.azure.GetPublicIPAddress(ctx, i.clusterInfo.ResourceGroup, pipName) + ip, err := i.azureClient.GetPublicIPAddress(ctx, i.clusterInfo.ResourceGroup, pipName) if err != nil { return fmt.Errorf("failed to get public IP address: %w", err) } @@ -244,7 +246,7 @@ func (i *Installer) waitForVMReady(ctx context.Context, gatewayExists bool) erro if gatewayExists { i.logger.Info("Restarting VM...") - _ = i.azure.RestartVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) + _ = i.azureClient.RestartVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) select { case <-ctx.Done(): return ctx.Err() @@ -353,7 +355,7 @@ func (i *Installer) phase4NodeJoin(ctx context.Context) error { } kubeconfigPath := "/root/.kube/config" - if err := i.azure.GetAKSCredentials(ctx, i.clusterInfo.ResourceGroup, i.clusterInfo.ClusterName, kubeconfigPath); err != nil { + if err := i.azureClient.GetAKSCredentials(ctx, i.clusterInfo.ResourceGroup, i.clusterInfo.ClusterName, kubeconfigPath); err != nil { return fmt.Errorf("failed to get AKS credentials: %w", err) } if _, err := RunCommand(ctx, "kubelogin", "convert-kubeconfig", "-l", "azurecli", "--kubeconfig", kubeconfigPath); err != nil { diff --git a/pkg/privatecluster/privatecluster_test.go b/pkg/privatecluster/privatecluster_test.go index 9fa36db..fd39dbe 100644 --- a/pkg/privatecluster/privatecluster_test.go +++ b/pkg/privatecluster/privatecluster_test.go @@ -126,16 +126,18 @@ func TestInstallerCreation(t *testing.T) { Verbose: true, } - installer := NewInstaller(options) + // NewInstaller requires a credential; pass nil to test creation without Azure calls + installer, err := NewInstaller(options, nil) + // Expected to fail since nil credential can't create Azure clients + if err != nil { + t.Skipf("Skipping: NewInstaller requires valid Azure credential: %v", err) + } if installer == nil { t.Fatal("NewInstaller() should not return nil") } if installer.logger == nil { t.Error("Installer.logger should not be nil") } - if installer.azure == nil { - t.Error("Installer.azure should not be nil") - } } func TestUninstallerCreation(t *testing.T) { @@ -144,7 +146,11 @@ func TestUninstallerCreation(t *testing.T) { AKSResourceID: "", } - uninstaller := NewUninstaller(options) + // NewUninstaller with empty resource ID and nil cred skips Azure client creation + uninstaller, err := NewUninstaller(options, nil) + if err != nil { + t.Fatalf("NewUninstaller() returned error: %v", err) + } if uninstaller == nil { t.Error("NewUninstaller() should not 
return nil") } diff --git a/pkg/privatecluster/scripts.go b/pkg/privatecluster/scripts.go index ab0d710..990c68f 100644 --- a/pkg/privatecluster/scripts.go +++ b/pkg/privatecluster/scripts.go @@ -3,6 +3,8 @@ package privatecluster import ( "context" "fmt" + + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" ) // ScriptRunner provides backward compatibility (Deprecated: use Installer/Uninstaller directly) @@ -21,13 +23,21 @@ func (r *ScriptRunner) RunPrivateInstall(ctx context.Context, aksResourceID stri return fmt.Errorf("AKS resource ID is required") } + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to create Azure CLI credential: %w", err) + } + options := InstallOptions{ AKSResourceID: aksResourceID, Gateway: DefaultGatewayConfig(), Verbose: r.verbose, } - installer := NewInstaller(options) + installer, err := NewInstaller(options, cred) + if err != nil { + return fmt.Errorf("failed to create installer: %w", err) + } return installer.Install(ctx) } @@ -37,11 +47,23 @@ func (r *ScriptRunner) RunPrivateUninstall(ctx context.Context, mode CleanupMode return fmt.Errorf("--aks-resource-id is required for full cleanup mode") } + var cred *azidentity.AzureCLICredential + if aksResourceID != "" { + var err error + cred, err = azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to create Azure CLI credential: %w", err) + } + } + options := UninstallOptions{ Mode: mode, AKSResourceID: aksResourceID, } - uninstaller := NewUninstaller(options) + uninstaller, err := NewUninstaller(options, cred) + if err != nil { + return fmt.Errorf("failed to create uninstaller: %w", err) + } return uninstaller.Uninstall(ctx) } diff --git a/pkg/privatecluster/tool_installer.go b/pkg/privatecluster/tool_installer.go new file mode 100644 index 0000000..aee2beb --- /dev/null +++ b/pkg/privatecluster/tool_installer.go @@ -0,0 +1,45 @@ +package privatecluster + +import ( + "context" + "fmt" +) + +// ToolInstaller handles installation of CLI tools that cannot be replaced by SDK calls. +type ToolInstaller struct { + logger *Logger +} + +// NewToolInstaller creates a new ToolInstaller instance. +func NewToolInstaller(logger *Logger) *ToolInstaller { + return &ToolInstaller{logger: logger} +} + +// InstallAKSCLI installs kubectl and kubelogin via Azure CLI. +func (t *ToolInstaller) InstallAKSCLI(ctx context.Context) error { + _, err := RunCommand(ctx, "az", "aks", "install-cli", + "--install-location", "/usr/local/bin/kubectl", + "--kubelogin-install-location", "/usr/local/bin/kubelogin") + if err != nil { + return fmt.Errorf("failed to install kubectl/kubelogin: %w", err) + } + + _, _ = RunCommand(ctx, "chmod", "+x", "/usr/local/bin/kubectl", "/usr/local/bin/kubelogin") + return nil +} + +// InstallConnectedMachineExtension installs the connectedmachine Azure CLI extension. 
+func (t *ToolInstaller) InstallConnectedMachineExtension(ctx context.Context) error { + // Check if already installed + if RunCommandSilent(ctx, "az", "extension", "show", "--name", "connectedmachine") { + return nil + } + + _, _ = RunCommand(ctx, "az", "config", "set", "extension.dynamic_install_allow_preview=true", "--only-show-errors") + + _, err := RunCommand(ctx, "az", "extension", "add", + "--name", "connectedmachine", + "--allow-preview", "true", + "--only-show-errors") + return err +} diff --git a/pkg/privatecluster/uninstaller.go b/pkg/privatecluster/uninstaller.go index c8b3252..638d542 100644 --- a/pkg/privatecluster/uninstaller.go +++ b/pkg/privatecluster/uninstaller.go @@ -4,13 +4,16 @@ import ( "context" "fmt" "strings" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore" ) // Uninstaller handles private cluster uninstallation type Uninstaller struct { - logger *Logger - azure *AzureCLI - options UninstallOptions + logger *Logger + azureClient *AzureClient + toolInstaller *ToolInstaller + options UninstallOptions // State clusterInfo *AKSClusterInfo @@ -20,16 +23,33 @@ type Uninstaller struct { clientKey string } -// NewUninstaller creates a new Uninstaller instance -func NewUninstaller(options UninstallOptions) *Uninstaller { +// NewUninstaller creates a new Uninstaller instance. +// cred is the Azure credential used for SDK calls. If nil, Azure resource cleanup will be skipped. +func NewUninstaller(options UninstallOptions, cred azcore.TokenCredential) (*Uninstaller, error) { logger := NewLogger(false) - return &Uninstaller{ - logger: logger, - azure: NewAzureCLI(logger), - options: options, - vpnConfig: DefaultVPNConfig(), - sshKeyPath: GetSSHKeyPath(), + + u := &Uninstaller{ + logger: logger, + toolInstaller: NewToolInstaller(logger), + options: options, + vpnConfig: DefaultVPNConfig(), + sshKeyPath: GetSSHKeyPath(), } + + // Only create Azure client if we have a resource ID (needed for full cleanup) + if options.AKSResourceID != "" && cred != nil { + subscriptionID, _, _, err := ParseResourceID(options.AKSResourceID) + if err != nil { + return nil, fmt.Errorf("failed to parse resource ID: %w", err) + } + azureClient, err := NewAzureClient(cred, subscriptionID, logger) + if err != nil { + return nil, fmt.Errorf("failed to create Azure client: %w", err) + } + u.azureClient = azureClient + } + + return u, nil } // Uninstall runs the uninstallation process @@ -52,7 +72,7 @@ func (u *Uninstaller) Uninstall(ctx context.Context) error { u.logger.Info("Cluster: %s/%s (Subscription: %s)", resourceGroup, clusterName, subscriptionID) } - _ = u.azure.InstallConnectedMachineExtension(ctx) + _ = u.toolInstaller.InstallConnectedMachineExtension(ctx) switch u.options.Mode { case CleanupModeLocal: @@ -228,12 +248,12 @@ func (u *Uninstaller) removeArcAgent(ctx context.Context, nodeName string) { } } - if arcRG != "" { + if arcRG != "" && u.azureClient != nil { u.logger.Info("Deleting Arc machine from Azure...") - _ = u.azure.DeleteConnectedMachine(ctx, arcRG, nodeName) + _ = u.azureClient.DeleteConnectedMachine(ctx, arcRG, nodeName) u.logger.Success("Arc machine deleted from Azure") - } else if u.clusterInfo != nil { - _ = u.azure.DeleteConnectedMachine(ctx, u.clusterInfo.ResourceGroup, nodeName) + } else if u.clusterInfo != nil && u.azureClient != nil { + _ = u.azureClient.DeleteConnectedMachine(ctx, u.clusterInfo.ResourceGroup, nodeName) } _, _ = RunCommand(ctx, "azcmagent", "disconnect", "--force-local-only") @@ -308,39 +328,36 @@ func (u *Uninstaller) deleteSSHKeys() { // 
deleteAzureResources deletes all Azure resources created for the Gateway func (u *Uninstaller) deleteAzureResources(ctx context.Context) error { - if u.clusterInfo == nil { - return fmt.Errorf("cluster info not available") + if u.clusterInfo == nil || u.azureClient == nil { + return fmt.Errorf("cluster info or Azure client not available") } u.logger.Info("Deleting Azure resources...") - if err := u.azure.SetSubscription(ctx, u.clusterInfo.SubscriptionID); err != nil { - return err - } gatewayName := "wg-gateway" nicName := gatewayName + "VMNic" pipName := gatewayName + "-pip" nsgName := gatewayName + "-nsg" - if err := u.azure.DeleteVM(ctx, u.clusterInfo.ResourceGroup, gatewayName); err != nil { + if err := u.azureClient.DeleteVM(ctx, u.clusterInfo.ResourceGroup, gatewayName); err != nil { u.logger.Warning("Delete VM: %v", err) } - if err := u.azure.DeleteNIC(ctx, u.clusterInfo.ResourceGroup, nicName); err != nil { + if err := u.azureClient.DeleteNIC(ctx, u.clusterInfo.ResourceGroup, nicName); err != nil { u.logger.Warning("Delete NIC: %v", err) } - if err := u.azure.DeletePublicIP(ctx, u.clusterInfo.ResourceGroup, pipName); err != nil { + if err := u.azureClient.DeletePublicIP(ctx, u.clusterInfo.ResourceGroup, pipName); err != nil { u.logger.Warning("Delete Public IP: %v", err) } - if err := u.azure.DeleteNSG(ctx, u.clusterInfo.ResourceGroup, nsgName); err != nil { + if err := u.azureClient.DeleteNSG(ctx, u.clusterInfo.ResourceGroup, nsgName); err != nil { u.logger.Warning("Delete NSG: %v", err) } - _ = u.azure.DeleteDisks(ctx, u.clusterInfo.ResourceGroup, gatewayName) + _ = u.azureClient.DeleteDisks(ctx, u.clusterInfo.ResourceGroup, gatewayName) - clusterInfo, err := u.azure.GetAKSClusterInfo(ctx, u.clusterInfo.ResourceGroup, u.clusterInfo.ClusterName) + clusterInfo, err := u.azureClient.GetAKSClusterInfo(ctx, u.clusterInfo.ResourceGroup, u.clusterInfo.ClusterName) if err == nil { - vnetName, vnetRG, err := u.azure.GetVNetInfo(ctx, clusterInfo.NodeResourceGroup) + vnetName, vnetRG, err := u.azureClient.GetVNetInfo(ctx, clusterInfo.NodeResourceGroup) if err == nil { - _ = u.azure.DeleteSubnet(ctx, vnetRG, vnetName, "wg-subnet") + _ = u.azureClient.DeleteSubnet(ctx, vnetRG, vnetName, "wg-subnet") } } u.logger.Success("Azure resources deleted") From 940d5c4112728c904fa5884c96567acdbc95e9a3 Mon Sep 17 00:00:00 2001 From: weiliu2 Date: Tue, 10 Feb 2026 20:13:50 +1300 Subject: [PATCH 06/11] comments2 --- commands.go | 4 ++-- pkg/config/structs.go | 2 +- pkg/privatecluster/installer.go | 3 --- pkg/privatecluster/vpn.go | 8 -------- 4 files changed, 3 insertions(+), 14 deletions(-) diff --git a/commands.go b/commands.go index 65cd453..3af5392 100644 --- a/commands.go +++ b/commands.go @@ -89,7 +89,7 @@ func runAgent(ctx context.Context) error { } // For private clusters, run Gateway/VPN setup before bootstrap - if cfg.Azure.TargetCluster != nil && cfg.Azure.TargetCluster.Private { + if cfg.Azure.TargetCluster != nil && cfg.Azure.TargetCluster.IsPrivateCluster { logger.Info("Private cluster detected, running Gateway/VPN setup...") if os.Getuid() != 0 { return fmt.Errorf("private cluster setup requires root privileges, please run with 'sudo'") @@ -134,7 +134,7 @@ func runUnbootstrap(ctx context.Context) error { } // For private clusters, run VPN/Gateway cleanup first - if cfg.Azure.TargetCluster != nil && cfg.Azure.TargetCluster.Private { + if cfg.Azure.TargetCluster != nil && cfg.Azure.TargetCluster.IsPrivateCluster { logger.Info("Private cluster detected, running VPN/Gateway cleanup...") // 
Validate cleanup mode diff --git a/pkg/config/structs.go b/pkg/config/structs.go index 2ddc472..40e22f2 100644 --- a/pkg/config/structs.go +++ b/pkg/config/structs.go @@ -57,7 +57,7 @@ type BootstrapTokenConfig struct { type TargetClusterConfig struct { ResourceID string `json:"resourceId"` // Full resource ID of the target AKS cluster Location string `json:"location"` // Azure region of the cluster (e.g., "eastus", "westus2") - Private bool `json:"private"` // Whether this is a private AKS cluster (requires Gateway/VPN setup) + IsPrivateCluster bool `json:"private"` // Whether this is a private AKS cluster (requires Gateway/VPN setup) Name string // will be populated from ResourceID ResourceGroup string // will be populated from ResourceID SubscriptionID string // will be populated from ResourceID diff --git a/pkg/privatecluster/installer.go b/pkg/privatecluster/installer.go index 0ab835c..bca41d6 100644 --- a/pkg/privatecluster/installer.go +++ b/pkg/privatecluster/installer.go @@ -133,9 +133,6 @@ func (i *Installer) phase1EnvironmentCheck(ctx context.Context) error { if err := InstallVPNTools(ctx, i.logger); err != nil { return fmt.Errorf("failed to install VPN tools: %w", err) } - if err := InstallJQ(ctx, i.logger); err != nil { - return fmt.Errorf("failed to install jq: %w", err) - } if !CommandExists("kubectl") || !CommandExists("kubelogin") { if err := i.toolInstaller.InstallAKSCLI(ctx); err != nil { return fmt.Errorf("failed to install kubectl/kubelogin: %w", err) diff --git a/pkg/privatecluster/vpn.go b/pkg/privatecluster/vpn.go index 7242c12..537a1a7 100644 --- a/pkg/privatecluster/vpn.go +++ b/pkg/privatecluster/vpn.go @@ -257,11 +257,3 @@ func InstallVPNTools(ctx context.Context, logger *Logger) error { return err } -// InstallJQ installs jq locally -func InstallJQ(ctx context.Context, logger *Logger) error { - if CommandExists("jq") { - return nil - } - _, err := RunCommand(ctx, "apt-get", "install", "-y", "jq") - return err -} From eae888c7aaa99d42495d7b70397eb00c79ff7c89 Mon Sep 17 00:00:00 2001 From: weiliu2 Date: Tue, 10 Feb 2026 20:42:13 +1300 Subject: [PATCH 07/11] comments 3 --- commands.go | 15 +- pkg/privatecluster/azure.go | 445 ---------------------------------- pkg/privatecluster/scripts.go | 69 ------ 3 files changed, 13 insertions(+), 516 deletions(-) delete mode 100644 pkg/privatecluster/azure.go delete mode 100644 pkg/privatecluster/scripts.go diff --git a/commands.go b/commands.go index 3af5392..4bff781 100644 --- a/commands.go +++ b/commands.go @@ -94,8 +94,19 @@ func runAgent(ctx context.Context) error { if os.Getuid() != 0 { return fmt.Errorf("private cluster setup requires root privileges, please run with 'sudo'") } - runner := privatecluster.NewScriptRunner("") - if err := runner.RunPrivateInstall(ctx, cfg.Azure.TargetCluster.ResourceID); err != nil { + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to create Azure CLI credential: %w", err) + } + options := privatecluster.InstallOptions{ + AKSResourceID: cfg.Azure.TargetCluster.ResourceID, + Gateway: privatecluster.DefaultGatewayConfig(), + } + installer, err := privatecluster.NewInstaller(options, cred) + if err != nil { + return fmt.Errorf("failed to create private cluster installer: %w", err) + } + if err := installer.Install(ctx); err != nil { return fmt.Errorf("private cluster setup failed: %w", err) } logger.Info("Private cluster setup completed") diff --git a/pkg/privatecluster/azure.go b/pkg/privatecluster/azure.go deleted file mode 100644 index 
c08a84e..0000000 --- a/pkg/privatecluster/azure.go +++ /dev/null @@ -1,445 +0,0 @@ -package privatecluster - -import ( - "context" - "encoding/json" - "fmt" - "strings" -) - -// AzureCLI provides Azure CLI operations -type AzureCLI struct { - logger *Logger -} - -// NewAzureCLI creates a new AzureCLI instance -func NewAzureCLI(logger *Logger) *AzureCLI { - return &AzureCLI{logger: logger} -} - -// CheckInstalled verifies Azure CLI is installed -func (az *AzureCLI) CheckInstalled() error { - if !CommandExists("az") { - return fmt.Errorf("azure CLI not installed, please install: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash") - } - return nil -} - -// CheckLogin verifies Azure CLI is logged in -func (az *AzureCLI) CheckLogin(ctx context.Context) error { - if !RunCommandSilent(ctx, "az", "account", "show") { - return fmt.Errorf("azure CLI not logged in, please run 'az login' first") - } - return nil -} - -// CheckAndRefreshToken checks if token is valid and refreshes if needed -func (az *AzureCLI) CheckAndRefreshToken(ctx context.Context) error { - if !RunCommandSilent(ctx, "az", "account", "get-access-token", "--only-show-errors") { - az.logger.Warning("Azure token expired or invalid, re-authenticating...") - return RunCommandInteractive(ctx, "az", "login") - } - return nil -} - -// SetSubscription sets the active subscription -func (az *AzureCLI) SetSubscription(ctx context.Context, subscriptionID string) error { - _, err := RunCommand(ctx, "az", "account", "set", "--subscription", subscriptionID) - return err -} - -// GetTenantID returns the current tenant ID -func (az *AzureCLI) GetTenantID(ctx context.Context) (string, error) { - return RunCommand(ctx, "az", "account", "show", "--query", "tenantId", "-o", "tsv") -} - -// AKSClusterExists checks if an AKS cluster exists -func (az *AzureCLI) AKSClusterExists(ctx context.Context, resourceGroup, clusterName string) bool { - return RunCommandSilent(ctx, "az", "aks", "show", - "--resource-group", resourceGroup, - "--name", clusterName) -} - -// GetAKSClusterInfo retrieves AKS cluster information -func (az *AzureCLI) GetAKSClusterInfo(ctx context.Context, resourceGroup, clusterName string) (*AKSClusterInfo, error) { - info := &AKSClusterInfo{ - ResourceGroup: resourceGroup, - ClusterName: clusterName, - } - - // Get AAD enabled status - aadEnabled, _ := RunCommand(ctx, "az", "aks", "show", - "--resource-group", resourceGroup, - "--name", clusterName, - "--query", "aadProfile.managed", "-o", "tsv") - - if strings.ToLower(aadEnabled) != "true" { - return nil, fmt.Errorf("AKS cluster AAD not enabled, please enable: az aks update --enable-aad") - } - - // Get RBAC enabled status - rbacEnabled, _ := RunCommand(ctx, "az", "aks", "show", - "--resource-group", resourceGroup, - "--name", clusterName, - "--query", "aadProfile.enableAzureRbac", "-o", "tsv") - - if strings.ToLower(rbacEnabled) != "true" { - return nil, fmt.Errorf("AKS cluster Azure RBAC not enabled, please enable: az aks update --enable-azure-rbac") - } - - // Get location - location, err := RunCommand(ctx, "az", "aks", "show", - "--resource-group", resourceGroup, - "--name", clusterName, - "--query", "location", "-o", "tsv") - if err != nil { - return nil, fmt.Errorf("failed to get cluster location: %w", err) - } - info.Location = location - - // Get node resource group - nodeRG, err := RunCommand(ctx, "az", "aks", "show", - "--resource-group", resourceGroup, - "--name", clusterName, - "--query", "nodeResourceGroup", "-o", "tsv") - if err != nil { - return nil, fmt.Errorf("failed 
to get node resource group: %w", err) - } - info.NodeResourceGroup = nodeRG - - // Get private FQDN - privateFQDN, err := RunCommand(ctx, "az", "aks", "show", - "--resource-group", resourceGroup, - "--name", clusterName, - "--query", "privateFqdn", "-o", "tsv") - if err != nil { - return nil, fmt.Errorf("failed to get private FQDN: %w", err) - } - info.PrivateFQDN = privateFQDN - - return info, nil -} - -// GetVNetInfo retrieves VNet information from AKS VMSS -func (az *AzureCLI) GetVNetInfo(ctx context.Context, nodeResourceGroup string) (vnetName, vnetRG string, err error) { - // Get first VMSS name - vmssName, err := RunCommand(ctx, "az", "vmss", "list", - "--resource-group", nodeResourceGroup, - "--query", "[0].name", "-o", "tsv") - if err != nil || vmssName == "" { - return "", "", fmt.Errorf("cannot find AKS node VMSS in %s", nodeResourceGroup) - } - - // Get subnet ID from VMSS - subnetID, err := RunCommand(ctx, "az", "vmss", "show", - "--resource-group", nodeResourceGroup, - "--name", vmssName, - "--query", "virtualMachineProfile.networkProfile.networkInterfaceConfigurations[0].ipConfigurations[0].subnet.id", - "-o", "tsv") - if err != nil { - return "", "", fmt.Errorf("failed to get subnet ID from VMSS: %w", err) - } - - // Parse VNet name and resource group from subnet ID - // Format: /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.Network/virtualNetworks/{vnet}/subnets/{subnet} - parts := strings.Split(subnetID, "/") - if len(parts) < 9 { - return "", "", fmt.Errorf("invalid subnet ID format: %s", subnetID) - } - - vnetRG = parts[4] - vnetName = parts[8] - - return vnetName, vnetRG, nil -} - -// VMExists checks if a VM exists -func (az *AzureCLI) VMExists(ctx context.Context, resourceGroup, vmName string) bool { - return RunCommandSilent(ctx, "az", "vm", "show", - "--resource-group", resourceGroup, - "--name", vmName) -} - -// GetVMPublicIP retrieves a VM's public IP address -func (az *AzureCLI) GetVMPublicIP(ctx context.Context, resourceGroup, vmName string) (string, error) { - return RunCommand(ctx, "az", "vm", "list-ip-addresses", - "--resource-group", resourceGroup, - "--name", vmName, - "--query", "[0].virtualMachine.network.publicIpAddresses[0].ipAddress", - "-o", "tsv") -} - -// CreateSubnet creates a subnet in a VNet -func (az *AzureCLI) CreateSubnet(ctx context.Context, vnetRG, vnetName, subnetName, addressPrefix string) error { - // Check if subnet exists - if RunCommandSilent(ctx, "az", "network", "vnet", "subnet", "show", - "--resource-group", vnetRG, - "--vnet-name", vnetName, - "--name", subnetName) { - az.logger.Info("Subnet %s already exists", subnetName) - return nil - } - - _, err := RunCommand(ctx, "az", "network", "vnet", "subnet", "create", - "--resource-group", vnetRG, - "--vnet-name", vnetName, - "--name", subnetName, - "--address-prefixes", addressPrefix) - return err -} - -// CreateNSG creates a network security group with rules -func (az *AzureCLI) CreateNSG(ctx context.Context, resourceGroup, nsgName string, vpnPort int) error { - // Check if NSG exists - if RunCommandSilent(ctx, "az", "network", "nsg", "show", - "--resource-group", resourceGroup, - "--name", nsgName) { - az.logger.Info("NSG %s already exists", nsgName) - return nil - } - - // Create NSG - if _, err := RunCommand(ctx, "az", "network", "nsg", "create", - "--resource-group", resourceGroup, - "--name", nsgName); err != nil { - return fmt.Errorf("failed to create NSG: %w", err) - } - - // Add SSH rule (priority 100 to override NRMS-Rule-106) - if _, err := RunCommand(ctx, "az", 
"network", "nsg", "rule", "create", - "--resource-group", resourceGroup, - "--nsg-name", nsgName, - "--name", "allow-ssh", - "--priority", "100", - "--destination-port-ranges", "22", - "--protocol", "Tcp", - "--access", "Allow"); err != nil { - return fmt.Errorf("failed to create SSH rule: %w", err) - } - - // Add VPN rule - if _, err := RunCommand(ctx, "az", "network", "nsg", "rule", "create", - "--resource-group", resourceGroup, - "--nsg-name", nsgName, - "--name", "allow-vpn", - "--priority", "200", - "--destination-port-ranges", fmt.Sprintf("%d", vpnPort), - "--protocol", "Udp", - "--access", "Allow"); err != nil { - return fmt.Errorf("failed to create VPN rule: %w", err) - } - - return nil -} - -// CreatePublicIP creates a static public IP -func (az *AzureCLI) CreatePublicIP(ctx context.Context, resourceGroup, pipName string) error { - // Check if PIP exists - if RunCommandSilent(ctx, "az", "network", "public-ip", "show", - "--resource-group", resourceGroup, - "--name", pipName) { - az.logger.Info("Public IP %s already exists", pipName) - return nil - } - - _, err := RunCommand(ctx, "az", "network", "public-ip", "create", - "--resource-group", resourceGroup, - "--name", pipName, - "--sku", "Standard", - "--allocation-method", "Static") - return err -} - -// GetPublicIPAddress retrieves a public IP address -func (az *AzureCLI) GetPublicIPAddress(ctx context.Context, resourceGroup, pipName string) (string, error) { - return RunCommand(ctx, "az", "network", "public-ip", "show", - "--resource-group", resourceGroup, - "--name", pipName, - "--query", "ipAddress", "-o", "tsv") -} - -// CreateVM creates a VM with specified configuration -func (az *AzureCLI) CreateVM(ctx context.Context, resourceGroup, vmName, vnetName, subnetName, nsgName, pipName, sshKeyPath, vmSize string) error { - _, err := RunCommand(ctx, "az", "vm", "create", - "--resource-group", resourceGroup, - "--name", vmName, - "--image", "Ubuntu2204", - "--size", vmSize, - "--vnet-name", vnetName, - "--subnet", subnetName, - "--nsg", nsgName, - "--public-ip-address", pipName, - "--admin-username", "azureuser", - "--ssh-key-values", sshKeyPath+".pub", - "--zone", "1") - return err -} - -// AddSSHKeyToVM adds an SSH key to a VM -func (az *AzureCLI) AddSSHKeyToVM(ctx context.Context, resourceGroup, vmName, sshKeyPath string) error { - pubKey, err := ReadFileContent(sshKeyPath + ".pub") - if err != nil { - return fmt.Errorf("failed to read SSH public key: %w", err) - } - - _, err = RunCommand(ctx, "az", "vm", "user", "update", - "--resource-group", resourceGroup, - "--name", vmName, - "--username", "azureuser", - "--ssh-key-value", strings.TrimSpace(pubKey), - "--output", "none") - return err -} - -// RestartVM restarts a VM -func (az *AzureCLI) RestartVM(ctx context.Context, resourceGroup, vmName string) error { - _, err := RunCommand(ctx, "az", "vm", "restart", - "--resource-group", resourceGroup, - "--name", vmName, - "--no-wait") - return err -} - -// DeleteVM deletes a VM -func (az *AzureCLI) DeleteVM(ctx context.Context, resourceGroup, vmName string) error { - if !az.VMExists(ctx, resourceGroup, vmName) { - return nil - } - _, err := RunCommand(ctx, "az", "vm", "delete", - "--resource-group", resourceGroup, - "--name", vmName, - "--yes", - "--only-show-errors") - return err -} - -// DeleteNIC deletes a network interface -func (az *AzureCLI) DeleteNIC(ctx context.Context, resourceGroup, nicName string) error { - if !RunCommandSilent(ctx, "az", "network", "nic", "show", - "--resource-group", resourceGroup, - "--name", nicName) { 
- return nil - } - _, err := RunCommand(ctx, "az", "network", "nic", "delete", - "--resource-group", resourceGroup, - "--name", nicName, - "--only-show-errors") - return err -} - -// DeletePublicIP deletes a public IP -func (az *AzureCLI) DeletePublicIP(ctx context.Context, resourceGroup, pipName string) error { - if !RunCommandSilent(ctx, "az", "network", "public-ip", "show", - "--resource-group", resourceGroup, - "--name", pipName) { - return nil - } - _, err := RunCommand(ctx, "az", "network", "public-ip", "delete", - "--resource-group", resourceGroup, - "--name", pipName, - "--only-show-errors") - return err -} - -// DeleteNSG deletes a network security group -func (az *AzureCLI) DeleteNSG(ctx context.Context, resourceGroup, nsgName string) error { - if !RunCommandSilent(ctx, "az", "network", "nsg", "show", - "--resource-group", resourceGroup, - "--name", nsgName) { - return nil - } - _, err := RunCommand(ctx, "az", "network", "nsg", "delete", - "--resource-group", resourceGroup, - "--name", nsgName, - "--only-show-errors") - return err -} - -// DeleteSubnet deletes a subnet -func (az *AzureCLI) DeleteSubnet(ctx context.Context, vnetRG, vnetName, subnetName string) error { - _, _ = RunCommand(ctx, "az", "network", "vnet", "subnet", "delete", - "--resource-group", vnetRG, - "--vnet-name", vnetName, - "--name", subnetName) - return nil // Ignore errors -} - -// DeleteDisks deletes disks matching a pattern -func (az *AzureCLI) DeleteDisks(ctx context.Context, resourceGroup, pattern string) error { - output, err := RunCommand(ctx, "az", "disk", "list", - "--resource-group", resourceGroup, - "--query", fmt.Sprintf("[?contains(name, '%s')].name", pattern), - "-o", "json") - if err != nil { - return nil // Ignore errors - } - - var diskNames []string - if err := json.Unmarshal([]byte(output), &diskNames); err != nil { - return nil - } - - for _, disk := range diskNames { - _, _ = RunCommand(ctx, "az", "disk", "delete", - "--resource-group", resourceGroup, - "--name", disk, - "--yes", - "--only-show-errors") - } - - return nil -} - -// DeleteConnectedMachine deletes an Arc connected machine -func (az *AzureCLI) DeleteConnectedMachine(ctx context.Context, resourceGroup, machineName string) error { - _, _ = RunCommand(ctx, "az", "connectedmachine", "delete", - "--resource-group", resourceGroup, - "--name", machineName, - "--yes") - return nil // Ignore errors -} - -// GetAKSCredentials gets AKS cluster credentials -func (az *AzureCLI) GetAKSCredentials(ctx context.Context, resourceGroup, clusterName, kubeconfigPath string) error { - // Ensure directory exists - if err := EnsureDirectory("/root/.kube"); err != nil { - return err - } - - _, err := RunCommand(ctx, "az", "aks", "get-credentials", - "--resource-group", resourceGroup, - "--name", clusterName, - "--overwrite-existing", - "--file", kubeconfigPath) - return err -} - -// InstallAKSCLI installs kubectl and kubelogin -func (az *AzureCLI) InstallAKSCLI(ctx context.Context) error { - _, err := RunCommand(ctx, "az", "aks", "install-cli", - "--install-location", "/usr/local/bin/kubectl", - "--kubelogin-install-location", "/usr/local/bin/kubelogin") - if err != nil { - return err - } - - _, _ = RunCommand(ctx, "chmod", "+x", "/usr/local/bin/kubectl", "/usr/local/bin/kubelogin") - return nil -} - -// InstallConnectedMachineExtension installs the connectedmachine extension -func (az *AzureCLI) InstallConnectedMachineExtension(ctx context.Context) error { - // Check if already installed - if RunCommandSilent(ctx, "az", "extension", "show", 
"--name", "connectedmachine") { - return nil - } - - _, _ = RunCommand(ctx, "az", "config", "set", "extension.dynamic_install_allow_preview=true", "--only-show-errors") - - // Install extension - _, err := RunCommand(ctx, "az", "extension", "add", - "--name", "connectedmachine", - "--allow-preview", "true", - "--only-show-errors") - return err -} diff --git a/pkg/privatecluster/scripts.go b/pkg/privatecluster/scripts.go deleted file mode 100644 index 990c68f..0000000 --- a/pkg/privatecluster/scripts.go +++ /dev/null @@ -1,69 +0,0 @@ -package privatecluster - -import ( - "context" - "fmt" - - "github.com/Azure/azure-sdk-for-go/sdk/azidentity" -) - -// ScriptRunner provides backward compatibility (Deprecated: use Installer/Uninstaller directly) -type ScriptRunner struct { - verbose bool -} - -// NewScriptRunner creates a new ScriptRunner instance (Deprecated) -func NewScriptRunner(scriptsDir string) *ScriptRunner { - return &ScriptRunner{verbose: false} -} - -// RunPrivateInstall executes the private cluster installation using Go implementation -func (r *ScriptRunner) RunPrivateInstall(ctx context.Context, aksResourceID string) error { - if aksResourceID == "" { - return fmt.Errorf("AKS resource ID is required") - } - - cred, err := azidentity.NewAzureCLICredential(nil) - if err != nil { - return fmt.Errorf("failed to create Azure CLI credential: %w", err) - } - - options := InstallOptions{ - AKSResourceID: aksResourceID, - Gateway: DefaultGatewayConfig(), - Verbose: r.verbose, - } - - installer, err := NewInstaller(options, cred) - if err != nil { - return fmt.Errorf("failed to create installer: %w", err) - } - return installer.Install(ctx) -} - -// RunPrivateUninstall executes the private cluster uninstallation using Go implementation -func (r *ScriptRunner) RunPrivateUninstall(ctx context.Context, mode CleanupMode, aksResourceID string) error { - if mode == CleanupModeFull && aksResourceID == "" { - return fmt.Errorf("--aks-resource-id is required for full cleanup mode") - } - - var cred *azidentity.AzureCLICredential - if aksResourceID != "" { - var err error - cred, err = azidentity.NewAzureCLICredential(nil) - if err != nil { - return fmt.Errorf("failed to create Azure CLI credential: %w", err) - } - } - - options := UninstallOptions{ - Mode: mode, - AKSResourceID: aksResourceID, - } - - uninstaller, err := NewUninstaller(options, cred) - if err != nil { - return fmt.Errorf("failed to create uninstaller: %w", err) - } - return uninstaller.Uninstall(ctx) -} From 4af92aec1de4f07a9751cd7f98846572b51807e4 Mon Sep 17 00:00:00 2001 From: weiliu2 Date: Tue, 10 Feb 2026 21:46:54 +1300 Subject: [PATCH 08/11] comments 4 --- pkg/privatecluster/azure_client.go | 10 ++-- pkg/privatecluster/ssh.go | 9 ---- pkg/privatecluster/uninstaller.go | 80 +++--------------------------- pkg/privatecluster/utils.go | 14 ------ 4 files changed, 9 insertions(+), 104 deletions(-) diff --git a/pkg/privatecluster/azure_client.go b/pkg/privatecluster/azure_client.go index 17b3a9f..f236a89 100644 --- a/pkg/privatecluster/azure_client.go +++ b/pkg/privatecluster/azure_client.go @@ -136,7 +136,9 @@ func (c *AzureClient) GetAKSClusterInfo(ctx context.Context, resourceGroup, clus return info, nil } -// GetVNetInfo retrieves VNet information from AKS VMSS in the node resource group. +// GetVNetInfo discovers VNet name and resource group by inspecting VMSS subnet configuration +// in the node resource group. 
This works for both default and BYO VNet scenarios since the +// VNet resource group is extracted from the VMSS subnet ID, not assumed to be nodeResourceGroup. func (c *AzureClient) GetVNetInfo(ctx context.Context, nodeResourceGroup string) (vnetName, vnetRG string, err error) { pager := c.vmssClient.NewListPager(nodeResourceGroup, nil) for pager.More() { @@ -567,12 +569,6 @@ func (c *AzureClient) DeleteDisks(ctx context.Context, resourceGroup, pattern st return nil } -// DeleteConnectedMachine deletes an Arc connected machine (errors are ignored). -func (c *AzureClient) DeleteConnectedMachine(ctx context.Context, resourceGroup, machineName string) error { - _, _ = c.arcClient.Delete(ctx, resourceGroup, machineName, nil) - return nil -} - // GetAKSCredentials gets AKS cluster credentials and writes the kubeconfig to the specified path. func (c *AzureClient) GetAKSCredentials(ctx context.Context, resourceGroup, clusterName, kubeconfigPath string) error { resp, err := c.aksClient.ListClusterUserCredentials(ctx, resourceGroup, clusterName, nil) diff --git a/pkg/privatecluster/ssh.go b/pkg/privatecluster/ssh.go index 3ab5bc4..fe30150 100644 --- a/pkg/privatecluster/ssh.go +++ b/pkg/privatecluster/ssh.go @@ -105,15 +105,6 @@ func (s *SSHClient) ReadRemoteFile(ctx context.Context, path string) (string, er return s.Execute(ctx, fmt.Sprintf("sudo cat %s 2>/dev/null || echo ''", path)) } -// WriteRemoteFile writes content to a file on the remote host -func (s *SSHClient) WriteRemoteFile(ctx context.Context, path, content string) error { - // Use heredoc to write file - script := fmt.Sprintf(`sudo tee %s > /dev/null << 'EOFCONTENT' -%s -EOFCONTENT`, path, content) - return s.ExecuteScript(ctx, script) -} - // CommandExists checks if a command exists on the remote host func (s *SSHClient) CommandExists(ctx context.Context, command string) bool { return s.ExecuteSilent(ctx, fmt.Sprintf("command -v %s", command)) diff --git a/pkg/privatecluster/uninstaller.go b/pkg/privatecluster/uninstaller.go index 638d542..bf3d9d0 100644 --- a/pkg/privatecluster/uninstaller.go +++ b/pkg/privatecluster/uninstaller.go @@ -3,7 +3,6 @@ package privatecluster import ( "context" "fmt" - "strings" "github.com/Azure/azure-sdk-for-go/sdk/azcore" ) @@ -96,16 +95,13 @@ func (u *Uninstaller) cleanupLocal(ctx context.Context) error { // Get Gateway IP and client key from VPN config (before stopping VPN) u.readVPNConfig() - // Remove node from cluster (while VPN is still connected) + // Remove node from cluster while VPN is still connected. + // This must happen here (not in bootstrapper) because the private cluster API server + // is only reachable through the VPN tunnel, which gets torn down below. u.removeNodeFromCluster(ctx, hostname) - // Stop any running aks-flex-node agent process - u.stopFlexNodeAgent(ctx) - - // Note: main unbootstrap handles kubelet/containerd cleanup - - // Remove Arc Agent - u.removeArcAgent(ctx, hostname) + // Note: stopFlexNodeAgent and removeArcAgent are handled by the bootstrapper's + // services.UnInstaller and arc.UnInstaller steps respectively. 
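// Illustrative sketch (not part of this patch) of the ordering constraint noted above: the
// node object must be deleted while the WireGuard tunnel still routes to the private API
// server, and only afterwards may the tunnel be torn down. The kubectl command mirrors the
// manual-cleanup hint in removeNodeFromCluster; the helper name and the wg0 interface name
// are assumptions for the WireGuard side.
// (imports: context, os/exec)
func teardownOrderSketch(ctx context.Context, nodeName string) {
	// 1. Delete the node first -- kubectl can only reach the private endpoint through the VPN.
	_ = exec.CommandContext(ctx, "kubectl", "delete", "node", nodeName, "--ignore-not-found").Run()

	// 2. Then bring the tunnel down; after this the API server is unreachable from this host.
	_ = exec.CommandContext(ctx, "wg-quick", "down", "wg0").Run()
}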
// Remove client peer from Gateway u.removeClientPeerFromGateway(ctx) @@ -142,17 +138,9 @@ func (u *Uninstaller) cleanupFull(ctx context.Context) error { // Get Gateway IP and client key from VPN config (before stopping VPN) u.readVPNConfig() - // Remove node from cluster (while VPN is still connected) + // Remove node from cluster while VPN is still connected (see comment in cleanupLocal) u.removeNodeFromCluster(ctx, hostname) - // Stop any running aks-flex-node agent process - u.stopFlexNodeAgent(ctx) - - // Note: main unbootstrap handles kubelet/containerd cleanup - - // Remove Arc Agent - u.removeArcAgent(ctx, hostname) - // Remove client peer from Gateway u.removeClientPeerFromGateway(ctx) @@ -218,62 +206,6 @@ func (u *Uninstaller) removeNodeFromCluster(ctx context.Context, nodeName string u.logger.Warning("Failed to remove node from cluster (may need manual cleanup: kubectl delete node %s)", nodeName) } -// stopFlexNodeAgent stops any running aks-flex-node agent process -func (u *Uninstaller) stopFlexNodeAgent(ctx context.Context) { - u.logger.Info("Stopping aks-flex-node agent...") - _, _ = RunCommand(ctx, "pkill", "-f", "aks-flex-node agent") - _, _ = RunCommand(ctx, "sleep", "2") -} - -// removeArcAgent removes Azure Arc agent -func (u *Uninstaller) removeArcAgent(ctx context.Context, nodeName string) { - if !CommandExists("azcmagent") { - u.logger.Info("Arc Agent not found, skipping") - return - } - - u.logger.Info("Removing Arc Agent...") - - // Get Arc resource group - arcRG := "" - output, err := RunCommand(ctx, "azcmagent", "show") - if err == nil { - for _, line := range strings.Split(output, "\n") { - if strings.Contains(line, "Resource Group") { - parts := strings.SplitN(line, ":", 2) - if len(parts) == 2 { - arcRG = strings.TrimSpace(parts[1]) - } - } - } - } - - if arcRG != "" && u.azureClient != nil { - u.logger.Info("Deleting Arc machine from Azure...") - _ = u.azureClient.DeleteConnectedMachine(ctx, arcRG, nodeName) - u.logger.Success("Arc machine deleted from Azure") - } else if u.clusterInfo != nil && u.azureClient != nil { - _ = u.azureClient.DeleteConnectedMachine(ctx, u.clusterInfo.ResourceGroup, nodeName) - } - - _, _ = RunCommand(ctx, "azcmagent", "disconnect", "--force-local-only") - - for _, service := range []string{"himdsd", "extd", "gcad", "arcproxyd"} { - _, _ = RunCommand(ctx, "systemctl", "stop", service) - _, _ = RunCommand(ctx, "systemctl", "disable", service) - } - - if CommandExists("apt") { - _, _ = RunCommand(ctx, "apt", "remove", "azcmagent", "-y") - } else if CommandExists("yum") { - _, _ = RunCommand(ctx, "yum", "remove", "azcmagent", "-y") - } - - _, _ = RunCommand(ctx, "rm", "-rf", "/var/opt/azcmagent", "/opt/azcmagent") - - u.logger.Success("Arc Agent removed") -} - // removeClientPeerFromGateway removes this client's peer from the Gateway func (u *Uninstaller) removeClientPeerFromGateway(ctx context.Context) { if u.gatewayIP == "" || u.clientKey == "" || !FileExists(u.sshKeyPath) { diff --git a/pkg/privatecluster/utils.go b/pkg/privatecluster/utils.go index 43f2f08..49f7a2b 100644 --- a/pkg/privatecluster/utils.go +++ b/pkg/privatecluster/utils.go @@ -79,15 +79,6 @@ func RunCommandSilent(ctx context.Context, name string, args ...string) bool { return cmd.Run() == nil } -// RunCommandInteractive executes a command with stdout/stderr/stdin connected to the terminal -func RunCommandInteractive(ctx context.Context, name string, args ...string) error { - cmd := exec.CommandContext(ctx, name, args...) 
// #nosec G204 -- commands are from trusted internal code - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - cmd.Stdin = os.Stdin - return cmd.Run() -} - // CommandExists checks if a command is available in PATH func CommandExists(name string) bool { _, err := exec.LookPath(name) @@ -252,11 +243,6 @@ func GetHostname() (string, error) { return strings.ToLower(hostname), nil } -// IsRoot checks if the current process is running as root -func IsRoot() bool { - return os.Getuid() == 0 -} - // CleanKubeCache removes kube cache directories func CleanKubeCache() error { paths := []string{ From 543684a7c5f4265a168d70ee0b19d9446bcb1206 Mon Sep 17 00:00:00 2001 From: weiliu2 Date: Wed, 11 Feb 2026 19:07:10 +1300 Subject: [PATCH 09/11] [stage1] add node to private cluster --- go.mod | 6 +++--- go.sum | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 2e3b2ba..03204dc 100644 --- a/go.mod +++ b/go.mod @@ -8,8 +8,11 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/azcore v1.19.1 github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.10.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v3 v3.0.0-beta.2 + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6 v6.4.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v5 v5.0.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute v1.2.0 + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0 + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0 github.com/Azure/go-autorest/autorest/to v0.4.1 github.com/google/uuid v1.6.0 github.com/sirupsen/logrus v1.9.3 @@ -20,9 +23,6 @@ require ( require ( github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect - github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6 v6.4.0 // indirect - github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0 // indirect - github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0 // indirect github.com/Azure/go-autorest v14.2.0+incompatible // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect diff --git a/go.sum b/go.sum index 04064c0..194e423 100644 --- a/go.sum +++ b/go.sum @@ -16,10 +16,12 @@ github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcom github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute v1.2.0/go.mod h1:F2eDq/BGK2LOEoDtoHbBOphaPqcjT0K/Y5Am8vf7+0w= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0 h1:PTFGRSlMKCQelWwxUyYVEUqseBJVemLyqWJjvMyt0do= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0/go.mod h1:LRr2FzBTQlONPPa5HREE5+RjSCTXl7BwOvYOaWTqCaI= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v3 v3.1.0 h1:2qsIIvxVT+uE6yrNldntJKlLRgxGbZ85kgtz5SNBhMw= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v3 v3.1.0/go.mod h1:AW8VEadnhw9xox+VaVd9sP7NjzOAnaZBLRH6Tq3cJ38= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0 h1:HYGD75g0bQ3VO/Omedm54v4LrD3B1cGImuRF3AJ5wLo= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0/go.mod h1:ulHyBFJOI0ONiRL4vcJTmS7rx18jQQlEPmAgo80cRdM= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources 
v1.1.1 h1:7CBQ+Ei8SP2c6ydQTGCCrS35bDxgTMfoP2miAwK++OU= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1/go.mod h1:c/wcGeGx5FUPbM/JltUYHZcKmigwyVLJlDq+4HdtXaw= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0 h1:Dd+RhdJn0OTtVGaeDLZpcumkIVCtA/3/Fo42+eoYvVM= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0/go.mod h1:5kakwfW5CjC9KK+Q4wjXAg+ShuIm2mBMua0ZFj2C8PE= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0 h1:wxQx2Bt4xzPIKvW59WQf1tJNx/ZZKPfN+EhPX3Z6CYY= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0/go.mod h1:TpiwjwnW/khS0LKs4vW5UmmT9OWcxaveS8U7+tlknzo= github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= From 549cdc6dc1498b3155a7353354b9f52c47aa8417 Mon Sep 17 00:00:00 2001 From: weiliu2 Date: Thu, 12 Feb 2026 04:35:12 +1300 Subject: [PATCH 10/11] remove azure cli and use go track2 client --- commands.go | 67 +------ pkg/bootstrapper/bootstrapper.go | 6 +- pkg/config/structs.go | 22 +-- pkg/privatecluster/azure_client.go | 34 ++-- pkg/privatecluster/installer.go | 206 ++++++++++++--------- pkg/privatecluster/privatecluster_test.go | 78 ++++---- pkg/privatecluster/ssh.go | 14 +- pkg/privatecluster/tool_installer.go | 91 ++++++--- pkg/privatecluster/types.go | 13 -- pkg/privatecluster/uninstaller.go | 214 +++++++++++----------- pkg/privatecluster/utils.go | 57 ------ pkg/privatecluster/vpn.go | 18 +- 12 files changed, 376 insertions(+), 444 deletions(-) diff --git a/commands.go b/commands.go index 4bff781..ea9a54d 100644 --- a/commands.go +++ b/commands.go @@ -8,14 +8,12 @@ import ( "path/filepath" "time" - "github.com/Azure/azure-sdk-for-go/sdk/azidentity" "github.com/sirupsen/logrus" "github.com/spf13/cobra" "go.goms.io/aks/AKSFlexNode/pkg/bootstrapper" "go.goms.io/aks/AKSFlexNode/pkg/config" "go.goms.io/aks/AKSFlexNode/pkg/logger" - "go.goms.io/aks/AKSFlexNode/pkg/privatecluster" "go.goms.io/aks/AKSFlexNode/pkg/spec" "go.goms.io/aks/AKSFlexNode/pkg/status" ) @@ -88,30 +86,6 @@ func runAgent(ctx context.Context) error { return fmt.Errorf("failed to load config from %s: %w", configPath, err) } - // For private clusters, run Gateway/VPN setup before bootstrap - if cfg.Azure.TargetCluster != nil && cfg.Azure.TargetCluster.IsPrivateCluster { - logger.Info("Private cluster detected, running Gateway/VPN setup...") - if os.Getuid() != 0 { - return fmt.Errorf("private cluster setup requires root privileges, please run with 'sudo'") - } - cred, err := azidentity.NewAzureCLICredential(nil) - if err != nil { - return fmt.Errorf("failed to create Azure CLI credential: %w", err) - } - options := privatecluster.InstallOptions{ - AKSResourceID: cfg.Azure.TargetCluster.ResourceID, - Gateway: privatecluster.DefaultGatewayConfig(), - } - installer, err := privatecluster.NewInstaller(options, cred) - if err != nil { - return fmt.Errorf("failed to create private cluster installer: %w", err) - } - if err := installer.Install(ctx); err != nil { - return fmt.Errorf("private cluster setup failed: %w", err) - } - logger.Info("Private cluster setup completed") - } - bootstrapExecutor := bootstrapper.New(cfg, logger) result, err := bootstrapExecutor.Bootstrap(ctx) if err != nil { @@ -144,46 +118,11 @@ func runUnbootstrap(ctx context.Context) error { return fmt.Errorf("failed to load config from %s: %w", configPath, err) } - // For private clusters, run VPN/Gateway cleanup first - if 
cfg.Azure.TargetCluster != nil && cfg.Azure.TargetCluster.IsPrivateCluster { - logger.Info("Private cluster detected, running VPN/Gateway cleanup...") - - // Validate cleanup mode - var mode privatecluster.CleanupMode - switch cleanupMode { - case "local": - mode = privatecluster.CleanupModeLocal - case "full": - mode = privatecluster.CleanupModeFull - default: - return fmt.Errorf("invalid cleanup mode: %s (use 'local' or 'full')", cleanupMode) - } - - // Check root privileges for private cluster cleanup - if os.Getuid() != 0 { - return fmt.Errorf("private cluster cleanup requires root privileges, please run with 'sudo'") - } - - options := privatecluster.UninstallOptions{ - Mode: mode, - AKSResourceID: cfg.Azure.TargetCluster.ResourceID, - } - cred, err := azidentity.NewAzureCLICredential(nil) - if err != nil { - return fmt.Errorf("failed to create Azure CLI credential: %w", err) - } - uninstaller, err := privatecluster.NewUninstaller(options, cred) - if err != nil { - return fmt.Errorf("failed to create private cluster uninstaller: %w", err) - } - if err := uninstaller.Uninstall(ctx); err != nil { - logger.Warnf("Private cluster cleanup had errors: %v", err) - // Continue with normal unbootstrap even if private cleanup has issues - } - logger.Info("Private cluster cleanup completed") + // Pass cleanup mode to config so the PrivateClusterUninstall step can read it + if cfg.Azure.TargetCluster != nil { + cfg.Azure.TargetCluster.CleanupMode = cleanupMode } - // Run normal unbootstrap bootstrapExecutor := bootstrapper.New(cfg, logger) result, err := bootstrapExecutor.Unbootstrap(ctx) if err != nil { diff --git a/pkg/bootstrapper/bootstrapper.go b/pkg/bootstrapper/bootstrapper.go index d719a4c..c9b266b 100644 --- a/pkg/bootstrapper/bootstrapper.go +++ b/pkg/bootstrapper/bootstrapper.go @@ -15,6 +15,7 @@ import ( "go.goms.io/aks/AKSFlexNode/pkg/components/services" "go.goms.io/aks/AKSFlexNode/pkg/components/system_configuration" "go.goms.io/aks/AKSFlexNode/pkg/config" + "go.goms.io/aks/AKSFlexNode/pkg/privatecluster" ) // Bootstrapper executes bootstrap steps sequentially @@ -31,8 +32,8 @@ func New(cfg *config.Config, logger *logrus.Logger) *Bootstrapper { // Bootstrap executes all bootstrap steps sequentially func (b *Bootstrapper) Bootstrap(ctx context.Context) (*ExecutionResult, error) { - // Define the bootstrap steps in order - using modules directly steps := []Executor{ + privatecluster.NewInstaller(b.logger), // VPN/Gateway setup (if private cluster) arc.NewInstaller(b.logger), // Setup Arc services.NewUnInstaller(b.logger), // Stop kubelet before setup system_configuration.NewInstaller(b.logger), // Configure system (early) @@ -51,6 +52,7 @@ func (b *Bootstrapper) Bootstrap(ctx context.Context) (*ExecutionResult, error) // Unbootstrap executes all cleanup steps sequentially (in reverse order of bootstrap) func (b *Bootstrapper) Unbootstrap(ctx context.Context) (*ExecutionResult, error) { steps := []Executor{ + privatecluster.NewUninstaller(b.logger), // Node removal + VPN teardown (if private cluster) services.NewUnInstaller(b.logger), // Stop services first npd.NewUnInstaller(b.logger), // Uninstall Node Problem Detector kubelet.NewUnInstaller(b.logger), // Clean kubelet configuration @@ -59,7 +61,7 @@ func (b *Bootstrapper) Unbootstrap(ctx context.Context) (*ExecutionResult, error containerd.NewUnInstaller(b.logger), // Uninstall containerd binary runc.NewUnInstaller(b.logger), // Uninstall runc binary system_configuration.NewUnInstaller(b.logger), // Clean system settings - 
arc.NewUnInstaller(b.logger), // Uninstall Arc (after cleanup) + arc.NewUnInstaller(b.logger), // Uninstall Arc (after cleanup, uses public internet) } return b.ExecuteSteps(ctx, steps, "unbootstrap") diff --git a/pkg/config/structs.go b/pkg/config/structs.go index 40e22f2..de7bce9 100644 --- a/pkg/config/structs.go +++ b/pkg/config/structs.go @@ -3,7 +3,6 @@ package config import "os" // Config represents the complete agent configuration structure. -// It contains Azure-specific settings and agent operational settings. type Config struct { Azure AzureConfig `json:"azure"` Agent AgentConfig `json:"agent"` @@ -15,13 +14,11 @@ type Config struct { Paths PathsConfig `json:"paths"` Npd NPDConfig `json:"npd"` - // Internal field to track if ManagedIdentity was explicitly set in config - // This is necessary because viper unmarshals empty JSON objects {} as nil + // Tracks if ManagedIdentity was explicitly set (viper unmarshals empty {} as nil) isMIExplicitlySet bool `json:"-"` } -// AzureConfig holds Azure-specific configuration required for connecting to Azure services. -// All fields except Cloud are required for proper operation. +// AzureConfig holds Azure-specific configuration for connecting to Azure services. type AzureConfig struct { SubscriptionID string `json:"subscriptionId"` // Azure subscription ID TenantID string `json:"tenantId"` // Azure tenant ID @@ -34,7 +31,6 @@ type AzureConfig struct { } // ServicePrincipalConfig holds Azure service principal authentication configuration. -// When provided, service principal authentication will be used instead of Azure CLI. type ServicePrincipalConfig struct { TenantID string `json:"tenantId"` // Azure AD tenant ID ClientID string `json:"clientId"` // Azure AD application (client) ID @@ -42,22 +38,23 @@ type ServicePrincipalConfig struct { } // ManagedIdentityConfig holds managed identity authentication configuration. -// It can only be used when the agent is running on an Azure VM with a managed identity assigned. type ManagedIdentityConfig struct { ClientID string `json:"clientId,omitempty"` // Client ID of the managed identity (optional, for VMs with multiple identities) } // BootstrapTokenConfig holds Kubernetes bootstrap token authentication configuration. -// Bootstrap tokens provide a lightweight authentication method for node joining. type BootstrapTokenConfig struct { Token string `json:"token"` // Bootstrap token in format: . } // TargetClusterConfig holds configuration for the target AKS cluster the ARC machine will connect to. 
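// Illustrative sketch (not part of this patch): what the new targetCluster settings could
// look like in the agent's JSON config and how they map onto the struct that follows. The
// local mirror struct and helper are illustrative only; the defaults shown
// (Standard_D2s_v3, 51820) are the ones documented on the fields below.
// (imports: encoding/json)
type targetClusterSketch struct {
	ResourceID       string `json:"resourceId"`
	Location         string `json:"location"`
	IsPrivateCluster bool   `json:"private"`
	GatewayVMSize    string `json:"gatewayVMSize,omitempty"`
	GatewayPort      int    `json:"gatewayPort,omitempty"`
}

func exampleTargetClusterConfig() (targetClusterSketch, error) {
	raw := `{
	  "resourceId": "/subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.ContainerService/managedClusters/<name>",
	  "location": "eastus",
	  "private": true,
	  "gatewayVMSize": "Standard_D2s_v3",
	  "gatewayPort": 51820
	}`
	var tc targetClusterSketch
	err := json.Unmarshal([]byte(raw), &tc)
	return tc, err
}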
type TargetClusterConfig struct { - ResourceID string `json:"resourceId"` // Full resource ID of the target AKS cluster - Location string `json:"location"` // Azure region of the cluster (e.g., "eastus", "westus2") - IsPrivateCluster bool `json:"private"` // Whether this is a private AKS cluster (requires Gateway/VPN setup) + ResourceID string `json:"resourceId"` // Full resource ID of the target AKS cluster + Location string `json:"location"` // Azure region of the cluster (e.g., "eastus", "westus2") + IsPrivateCluster bool `json:"private" mapstructure:"private"` // Whether this is a private AKS cluster (requires Gateway/VPN setup) + GatewayVMSize string `json:"gatewayVMSize,omitempty" mapstructure:"gatewayVMSize"` // VPN Gateway VM size (defaults to "Standard_D2s_v3") + GatewayPort int `json:"gatewayPort,omitempty" mapstructure:"gatewayPort"` // VPN Gateway port (defaults to 51820) + CleanupMode string `json:"-"` // Runtime-only, set by CLI flag for unbootstrap Name string // will be populated from ResourceID ResourceGroup string // will be populated from ResourceID SubscriptionID string // will be populated from ResourceID @@ -131,7 +128,7 @@ type KubernetesPathsConfig struct { KubeletDir string `json:"kubeletDir"` } -// CNIPathsConfig holds file system paths related to CNI plugins and configurations. +// CNIConfig holds configuration settings for CNI plugins. type CNIConfig struct { Version string `json:"version"` } @@ -150,7 +147,6 @@ func (cfg *Config) IsSPConfigured() bool { } // IsMIConfigured checks if managed identity configuration is provided in the configuration -// Uses internal flag set during config loading to handle viper's empty object behavior func (cfg *Config) IsMIConfigured() bool { return cfg.isMIExplicitlySet } diff --git a/pkg/privatecluster/azure_client.go b/pkg/privatecluster/azure_client.go index f236a89..fa25151 100644 --- a/pkg/privatecluster/azure_client.go +++ b/pkg/privatecluster/azure_client.go @@ -15,11 +15,12 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions" + "github.com/sirupsen/logrus" ) -// AzureClient provides Azure operations using the Azure SDK for Go (Track 2). +// AzureClient provides Azure operations using the Azure SDK for Go. type AzureClient struct { - logger *Logger + logger *logrus.Logger subscriptionID string vmClient *armcompute.VirtualMachinesClient @@ -35,7 +36,7 @@ type AzureClient struct { } // NewAzureClient creates a new AzureClient with all sub-clients initialized. 
-func NewAzureClient(cred azcore.TokenCredential, subscriptionID string, logger *Logger) (*AzureClient, error) { +func NewAzureClient(cred azcore.TokenCredential, subscriptionID string, logger *logrus.Logger) (*AzureClient, error) { c := &AzureClient{ logger: logger, subscriptionID: subscriptionID, @@ -108,12 +109,10 @@ func (c *AzureClient) GetAKSClusterInfo(ctx context.Context, resourceGroup, clus return nil, fmt.Errorf("AKS cluster properties are nil") } - // Check AAD enabled if props.AADProfile == nil || props.AADProfile.Managed == nil || !*props.AADProfile.Managed { return nil, fmt.Errorf("AKS cluster AAD not enabled, please enable: az aks update --enable-aad") } - // Check Azure RBAC enabled if props.AADProfile.EnableAzureRBAC == nil || !*props.AADProfile.EnableAzureRBAC { return nil, fmt.Errorf("AKS cluster Azure RBAC not enabled, please enable: az aks update --enable-azure-rbac") } @@ -136,9 +135,7 @@ func (c *AzureClient) GetAKSClusterInfo(ctx context.Context, resourceGroup, clus return info, nil } -// GetVNetInfo discovers VNet name and resource group by inspecting VMSS subnet configuration -// in the node resource group. This works for both default and BYO VNet scenarios since the -// VNet resource group is extracted from the VMSS subnet ID, not assumed to be nodeResourceGroup. +// GetVNetInfo discovers VNet name and resource group from VMSS subnet configuration. func (c *AzureClient) GetVNetInfo(ctx context.Context, nodeResourceGroup string) (vnetName, vnetRG string, err error) { pager := c.vmssClient.NewListPager(nodeResourceGroup, nil) for pager.More() { @@ -214,7 +211,7 @@ func (c *AzureClient) GetVMPublicIP(ctx context.Context, resourceGroup, vmName s func (c *AzureClient) CreateSubnet(ctx context.Context, vnetRG, vnetName, subnetName, addressPrefix string) error { _, err := c.subnetClient.Get(ctx, vnetRG, vnetName, subnetName, nil) if err == nil { - c.logger.Info("Subnet %s already exists", subnetName) + c.logger.Infof("Subnet %s already exists", subnetName) return nil } @@ -236,7 +233,7 @@ func (c *AzureClient) CreateSubnet(ctx context.Context, vnetRG, vnetName, subnet func (c *AzureClient) CreateNSG(ctx context.Context, resourceGroup, nsgName, location string, vpnPort int) error { _, err := c.nsgClient.Get(ctx, resourceGroup, nsgName, nil) if err == nil { - c.logger.Info("NSG %s already exists", nsgName) + c.logger.Infof("NSG %s already exists", nsgName) return nil } @@ -288,7 +285,7 @@ func (c *AzureClient) CreateNSG(ctx context.Context, resourceGroup, nsgName, loc func (c *AzureClient) CreatePublicIP(ctx context.Context, resourceGroup, pipName, location string) error { _, err := c.pipClient.Get(ctx, resourceGroup, pipName, nil) if err == nil { - c.logger.Info("Public IP %s already exists", pipName) + c.logger.Infof("Public IP %s already exists", pipName) return nil } @@ -322,17 +319,14 @@ func (c *AzureClient) GetPublicIPAddress(ctx context.Context, resourceGroup, pip return *resp.Properties.IPAddress, nil } -// CreateVM creates a VM with the specified configuration. -// It first creates a NIC, then creates the VM referencing that NIC. +// CreateVM creates a NIC and VM with the specified configuration. 
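// Illustrative sketch (not part of this patch): the SDK "begin + poll" pattern the methods
// in this file use for long-running ARM operations, shown for a Standard/Static public IP
// similar to what CreatePublicIP provisions. Field and constant names are per armnetwork v6;
// pipClient, ctx, resourceGroup, pipName and location are assumed from the surrounding method.
poller, err := pipClient.BeginCreateOrUpdate(ctx, resourceGroup, pipName, armnetwork.PublicIPAddress{
	Location: ptr(location),
	SKU: &armnetwork.PublicIPAddressSKU{
		Name: ptr(armnetwork.PublicIPAddressSKUNameStandard),
	},
	Properties: &armnetwork.PublicIPAddressPropertiesFormat{
		PublicIPAllocationMethod: ptr(armnetwork.IPAllocationMethodStatic),
	},
}, nil)
if err != nil {
	return fmt.Errorf("failed to start public IP creation: %w", err)
}
// PollUntilDone blocks until the ARM operation reaches a terminal state.
if _, err := poller.PollUntilDone(ctx, nil); err != nil {
	return fmt.Errorf("failed to create public IP: %w", err)
}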
func (c *AzureClient) CreateVM(ctx context.Context, resourceGroup, vmName, location, vnetRG, vnetName, subnetName, nsgName, pipName, sshKeyPath, vmSize string) error { - // Read SSH public key pubKeyData, err := ReadFileContent(sshKeyPath + ".pub") if err != nil { return fmt.Errorf("failed to read SSH public key: %w", err) } pubKey := strings.TrimSpace(pubKeyData) - // Build resource IDs subnetID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/%s", c.subscriptionID, vnetRG, vnetName, subnetName) nsgID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/securityGroups/%s", @@ -340,7 +334,6 @@ func (c *AzureClient) CreateVM(ctx context.Context, resourceGroup, vmName, locat pipID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/publicIPAddresses/%s", c.subscriptionID, resourceGroup, pipName) - // Create NIC nicName := vmName + "VMNic" nicPoller, err := c.nicClient.BeginCreateOrUpdate(ctx, resourceGroup, nicName, armnetwork.Interface{ Location: ptr(location), @@ -372,7 +365,6 @@ func (c *AzureClient) CreateVM(ctx context.Context, resourceGroup, vmName, locat return fmt.Errorf("failed to create NIC: %w", err) } - // Create VM vm := armcompute.VirtualMachine{ Location: ptr(location), Zones: []*string{ptr("1")}, @@ -383,9 +375,9 @@ func (c *AzureClient) CreateVM(ctx context.Context, resourceGroup, vmName, locat StorageProfile: &armcompute.StorageProfile{ ImageReference: &armcompute.ImageReference{ Publisher: ptr("Canonical"), - Offer: ptr("0001-com-ubuntu-server-jammy"), - SKU: ptr("22_04-lts-gen2"), - Version: ptr("latest"), + Offer: ptr("0001-com-ubuntu-server-jammy"), + SKU: ptr("22_04-lts-gen2"), + Version: ptr("latest"), }, OSDisk: &armcompute.OSDisk{ CreateOption: ptr(armcompute.DiskCreateOptionTypesFromImage), @@ -588,8 +580,6 @@ func (c *AzureClient) GetAKSCredentials(ctx context.Context, resourceGroup, clus return nil } -// --- Helper functions --- - // ptr returns a pointer to the given value. func ptr[T any](v T) *T { return &v diff --git a/pkg/privatecluster/installer.go b/pkg/privatecluster/installer.go index bca41d6..867c0a3 100644 --- a/pkg/privatecluster/installer.go +++ b/pkg/privatecluster/installer.go @@ -3,112 +3,148 @@ package privatecluster import ( "context" "fmt" + "os" "time" - "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/sirupsen/logrus" + + "go.goms.io/aks/AKSFlexNode/pkg/auth" + "go.goms.io/aks/AKSFlexNode/pkg/config" ) -// Installer handles private cluster installation +// Installer handles private cluster VPN/Gateway setup, implementing bootstrapper.StepExecutor. type Installer struct { - logger *Logger + config *config.Config + logger *logrus.Logger + authProvider *auth.AuthProvider azureClient *AzureClient toolInstaller *ToolInstaller - options InstallOptions - // State collected during installation clusterInfo *AKSClusterInfo vpnConfig VPNConfig sshKeyPath string gatewayIP string } -// NewInstaller creates a new Installer instance. -// cred is the Azure credential used for SDK calls. -func NewInstaller(options InstallOptions, cred azcore.TokenCredential) (*Installer, error) { - logger := NewLogger(options.Verbose) +// NewInstaller creates a new private cluster Installer. 
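// Illustrative sketch (not part of this patch): the shape of the resource-ID parsing that
// Execute below depends on via ParseResourceID. This minimal, hypothetical variant splits
// /subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.ContainerService/managedClusters/<name>
// into its three components; the package's real ParseResourceID may differ in details.
// (imports: fmt, strings)
func parseAKSResourceIDSketch(id string) (subscriptionID, resourceGroup, name string, err error) {
	parts := strings.Split(strings.Trim(id, "/"), "/")
	// Expected: subscriptions/<sub>/resourceGroups/<rg>/providers/<namespace>/managedClusters/<name>
	if len(parts) != 8 || !strings.EqualFold(parts[0], "subscriptions") || !strings.EqualFold(parts[2], "resourceGroups") {
		return "", "", "", fmt.Errorf("unexpected AKS resource ID format: %q", id)
	}
	return parts[1], parts[3], parts[7], nil
}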
+func NewInstaller(logger *logrus.Logger) *Installer { + return &Installer{ + config: config.GetConfig(), + logger: logger, + authProvider: auth.NewAuthProvider(), + toolInstaller: NewToolInstaller(logger), + vpnConfig: DefaultVPNConfig(), + sshKeyPath: GetSSHKeyPath(), + } +} - // Apply defaults - if options.Gateway.Name == "" { - options.Gateway = DefaultGatewayConfig() +// GetName returns the step name. +func (i *Installer) GetName() string { + return "PrivateClusterInstall" +} + +// Validate checks prerequisites for private cluster installation. +func (i *Installer) Validate(ctx context.Context) error { + if !i.isPrivateCluster() { + return nil } + if os.Getuid() != 0 { + return fmt.Errorf("private cluster setup requires root privileges, please run with 'sudo'") + } + return nil +} - subscriptionID, _, _, err := ParseResourceID(options.AKSResourceID) - if err != nil { - return nil, fmt.Errorf("failed to parse resource ID: %w", err) +// IsCompleted returns true for non-private clusters or when VPN is already connected. +func (i *Installer) IsCompleted(ctx context.Context) bool { + if !i.isPrivateCluster() { + return true } + vpnClient := NewVPNClient(i.vpnConfig, i.logger) + return vpnClient.TestConnection(ctx) +} - azureClient, err := NewAzureClient(cred, subscriptionID, logger) - if err != nil { - return nil, fmt.Errorf("failed to create Azure client: %w", err) +// Execute runs the private cluster installation (Gateway/VPN setup). +func (i *Installer) Execute(ctx context.Context) error { + if !i.isPrivateCluster() { + return nil } - return &Installer{ - logger: logger, - azureClient: azureClient, - toolInstaller: NewToolInstaller(logger), - options: options, - vpnConfig: DefaultVPNConfig(), - sshKeyPath: GetSSHKeyPath(), - }, nil -} + i.logger.Infof("========================================") + i.logger.Infof(" Add Edge Node to Private AKS Cluster") + i.logger.Infof("========================================") -// Install runs the complete installation process -func (i *Installer) Install(ctx context.Context) error { - fmt.Printf("%s========================================%s\n", colorGreen, colorReset) - fmt.Printf("%s Add Edge Node to Private AKS Cluster%s\n", colorGreen, colorReset) - fmt.Printf("%s========================================%s\n\n", colorGreen, colorReset) + cred, err := i.authProvider.UserCredential(i.config) + if err != nil { + return fmt.Errorf("failed to get Azure credential: %w", err) + } - // Parse resource ID - subscriptionID, resourceGroup, clusterName, err := ParseResourceID(i.options.AKSResourceID) + subscriptionID, _, _, err := ParseResourceID(i.config.GetTargetClusterID()) if err != nil { - return err + return fmt.Errorf("failed to parse resource ID: %w", err) } + azureClient, err := NewAzureClient(cred, subscriptionID, i.logger) + if err != nil { + return fmt.Errorf("failed to create Azure client: %w", err) + } + i.azureClient = azureClient + + resourceGroup, clusterName := i.config.GetTargetClusterResourceGroup(), i.config.GetTargetClusterName() i.clusterInfo = &AKSClusterInfo{ - ResourceID: i.options.AKSResourceID, + ResourceID: i.config.GetTargetClusterID(), SubscriptionID: subscriptionID, ResourceGroup: resourceGroup, ClusterName: clusterName, } - // Phase 1: Environment Check if err := i.phase1EnvironmentCheck(ctx); err != nil { return fmt.Errorf("environment check failed: %w", err) } - - // Phase 2: Gateway Setup if err := i.phase2GatewaySetup(ctx); err != nil { return fmt.Errorf("gateway setup failed: %w", err) } - - // Phase 3: Client Configuration 
if err := i.phase3ClientSetup(ctx); err != nil { return fmt.Errorf("client setup failed: %w", err) } - - // Phase 4: Node Join Preparation if err := i.phase4NodeJoin(ctx); err != nil { return fmt.Errorf("node join failed: %w", err) } - // Phase 5 (Verification) skipped - node needs bootstrap to become Ready - i.logger.Success("Private cluster setup completed. Bootstrap will continue...") + i.logger.Infof("Private cluster setup completed. Bootstrap will continue...") return nil } +// isPrivateCluster checks if the config indicates a private cluster. +func (i *Installer) isPrivateCluster() bool { + return i.config != nil && + i.config.Azure.TargetCluster != nil && + i.config.Azure.TargetCluster.IsPrivateCluster +} + +// gatewayConfig returns the Gateway configuration, applying any overrides from config. +func (i *Installer) gatewayConfig() GatewayConfig { + gw := DefaultGatewayConfig() + if i.config.Azure.TargetCluster.GatewayVMSize != "" { + gw.VMSize = i.config.Azure.TargetCluster.GatewayVMSize + } + if i.config.Azure.TargetCluster.GatewayPort > 0 { + gw.Port = i.config.Azure.TargetCluster.GatewayPort + } + return gw +} + // phase1EnvironmentCheck checks prerequisites func (i *Installer) phase1EnvironmentCheck(ctx context.Context) error { _ = CleanKubeCache() - i.logger.Success("Azure SDK client ready") - i.logger.Success("Subscription: %s", i.clusterInfo.SubscriptionID) + i.logger.Infof("Azure SDK client ready") + i.logger.Infof("Subscription: %s", i.clusterInfo.SubscriptionID) - // Get Tenant ID tenantID, err := i.azureClient.GetTenantID(ctx) if err != nil { return err } i.clusterInfo.TenantID = tenantID - i.logger.Verbose("Tenant ID: %s", tenantID) + i.logger.Debugf("Tenant ID: %s", tenantID) if !i.azureClient.AKSClusterExists(ctx, i.clusterInfo.ResourceGroup, i.clusterInfo.ClusterName) { return fmt.Errorf("AKS cluster '%s' not found", i.clusterInfo.ClusterName) @@ -120,7 +156,7 @@ func (i *Installer) phase1EnvironmentCheck(ctx context.Context) error { i.clusterInfo.Location = clusterInfo.Location i.clusterInfo.NodeResourceGroup = clusterInfo.NodeResourceGroup i.clusterInfo.PrivateFQDN = clusterInfo.PrivateFQDN - i.logger.Success("AKS cluster: %s (AAD/RBAC enabled)", i.clusterInfo.ClusterName) + i.logger.Infof("AKS cluster: %s (AAD/RBAC enabled)", i.clusterInfo.ClusterName) vnetName, vnetRG, err := i.azureClient.GetVNetInfo(ctx, i.clusterInfo.NodeResourceGroup) if err != nil { @@ -128,41 +164,36 @@ func (i *Installer) phase1EnvironmentCheck(ctx context.Context) error { } i.clusterInfo.VNetName = vnetName i.clusterInfo.VNetResourceGroup = vnetRG - i.logger.Success("VNet: %s/%s", vnetRG, vnetName) + i.logger.Infof("VNet: %s/%s", vnetRG, vnetName) if err := InstallVPNTools(ctx, i.logger); err != nil { return fmt.Errorf("failed to install VPN tools: %w", err) } - if !CommandExists("kubectl") || !CommandExists("kubelogin") { - if err := i.toolInstaller.InstallAKSCLI(ctx); err != nil { - return fmt.Errorf("failed to install kubectl/kubelogin: %w", err) - } - } - if !CommandExists("kubectl") { - return fmt.Errorf("kubectl installation failed") + if err := i.toolInstaller.InstallKubectl(ctx, i.config.GetKubernetesVersion()); err != nil { + return fmt.Errorf("failed to install kubectl: %w", err) } - if !CommandExists("kubelogin") { - return fmt.Errorf("kubelogin installation failed") + if err := i.toolInstaller.InstallKubelogin(ctx); err != nil { + return fmt.Errorf("failed to install kubelogin: %w", err) } - _ = i.toolInstaller.InstallConnectedMachineExtension(ctx) - 
i.logger.Success("Dependencies ready") + i.logger.Infof("Dependencies ready") return nil } // phase2GatewaySetup sets up the VPN Gateway func (i *Installer) phase2GatewaySetup(ctx context.Context) error { + gateway := i.gatewayConfig() gatewayExists := false - if i.azureClient.VMExists(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) { + if i.azureClient.VMExists(ctx, i.clusterInfo.ResourceGroup, gateway.Name) { gatewayExists = true - ip, err := i.azureClient.GetVMPublicIP(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) + ip, err := i.azureClient.GetVMPublicIP(ctx, i.clusterInfo.ResourceGroup, gateway.Name) if err != nil { return fmt.Errorf("failed to get Gateway public IP: %w", err) } i.gatewayIP = ip - i.logger.Success("Gateway exists: %s", i.gatewayIP) + i.logger.Infof("Gateway exists: %s", i.gatewayIP) } else { - i.logger.Info("Creating Gateway...") + i.logger.Infof("Creating Gateway...") if err := i.createGatewayInfrastructure(ctx); err != nil { return err } @@ -171,7 +202,7 @@ func (i *Installer) phase2GatewaySetup(ctx context.Context) error { if err := GenerateSSHKey(i.sshKeyPath); err != nil { return fmt.Errorf("failed to generate SSH key: %w", err) } - if err := i.azureClient.AddSSHKeyToVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name, i.sshKeyPath); err != nil { + if err := i.azureClient.AddSSHKeyToVM(ctx, i.clusterInfo.ResourceGroup, gateway.Name, i.sshKeyPath); err != nil { return fmt.Errorf("failed to add SSH key to Gateway: %w", err) } @@ -190,15 +221,16 @@ func (i *Installer) phase2GatewaySetup(ctx context.Context) error { // createGatewayInfrastructure creates Gateway VM and related resources func (i *Installer) createGatewayInfrastructure(ctx context.Context) error { - nsgName := i.options.Gateway.Name + "-nsg" - pipName := i.options.Gateway.Name + "-pip" + gateway := i.gatewayConfig() + nsgName := gateway.Name + "-nsg" + pipName := gateway.Name + "-pip" location := i.clusterInfo.Location if err := i.azureClient.CreateSubnet(ctx, i.clusterInfo.VNetResourceGroup, i.clusterInfo.VNetName, - i.options.Gateway.SubnetName, i.options.Gateway.SubnetPrefix); err != nil { + gateway.SubnetName, gateway.SubnetPrefix); err != nil { return fmt.Errorf("failed to create subnet: %w", err) } - if err := i.azureClient.CreateNSG(ctx, i.clusterInfo.ResourceGroup, nsgName, location, i.options.Gateway.Port); err != nil { + if err := i.azureClient.CreateNSG(ctx, i.clusterInfo.ResourceGroup, nsgName, location, gateway.Port); err != nil { return fmt.Errorf("failed to create NSG: %w", err) } if err := i.azureClient.CreatePublicIP(ctx, i.clusterInfo.ResourceGroup, pipName, location); err != nil { @@ -207,10 +239,10 @@ func (i *Installer) createGatewayInfrastructure(ctx context.Context) error { if err := GenerateSSHKey(i.sshKeyPath); err != nil { return fmt.Errorf("failed to generate SSH key: %w", err) } - if err := i.azureClient.CreateVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name, + if err := i.azureClient.CreateVM(ctx, i.clusterInfo.ResourceGroup, gateway.Name, location, i.clusterInfo.VNetResourceGroup, i.clusterInfo.VNetName, - i.options.Gateway.SubnetName, nsgName, pipName, - i.sshKeyPath, i.options.Gateway.VMSize); err != nil { + gateway.SubnetName, nsgName, pipName, + i.sshKeyPath, gateway.VMSize); err != nil { return fmt.Errorf("failed to create Gateway VM: %w", err) } @@ -219,9 +251,9 @@ func (i *Installer) createGatewayInfrastructure(ctx context.Context) error { return fmt.Errorf("failed to get public IP address: %w", err) } i.gatewayIP = ip - 
i.logger.Success("Gateway created: %s", i.gatewayIP) + i.logger.Infof("Gateway created: %s", i.gatewayIP) - i.logger.Info("Waiting for VM to boot (120s)...") + i.logger.Infof("Waiting for VM to boot (120s)...") select { case <-ctx.Done(): return ctx.Err() @@ -237,13 +269,14 @@ func (i *Installer) waitForVMReady(ctx context.Context, gatewayExists bool) erro ssh := NewSSHClient(sshConfig, i.logger) if ssh.TestConnection(ctx) { - i.logger.Success("SSH ready") + i.logger.Infof("SSH ready") return nil } if gatewayExists { - i.logger.Info("Restarting VM...") - _ = i.azureClient.RestartVM(ctx, i.clusterInfo.ResourceGroup, i.options.Gateway.Name) + gateway := i.gatewayConfig() + i.logger.Infof("Restarting VM...") + _ = i.azureClient.RestartVM(ctx, i.clusterInfo.ResourceGroup, gateway.Name) select { case <-ctx.Done(): return ctx.Err() @@ -254,7 +287,7 @@ func (i *Installer) waitForVMReady(ctx context.Context, gatewayExists bool) erro if err := ssh.WaitForConnection(ctx, 18, 10*time.Second); err != nil { return fmt.Errorf("VM SSH connection timeout") } - i.logger.Success("SSH ready") + i.logger.Infof("SSH ready") return nil } @@ -265,7 +298,7 @@ func (i *Installer) configureVPNServer(ctx context.Context) error { vpnServer := NewVPNServerManager(ssh, i.logger) if !vpnServer.IsInstalled(ctx) { - i.logger.Info("Installing VPN on Gateway...") + i.logger.Infof("Installing VPN on Gateway...") if err := vpnServer.Install(ctx); err != nil { return fmt.Errorf("failed to install VPN on Gateway: %w", err) } @@ -286,19 +319,20 @@ func (i *Installer) configureVPNServer(ctx context.Context) error { peerCount, _ := vpnServer.GetPeerCount(ctx) i.vpnConfig.ClientVPNIP = fmt.Sprintf("172.16.0.%d", peerCount+2) - i.logger.Success("VPN server ready, client IP: %s", i.vpnConfig.ClientVPNIP) + i.logger.Infof("VPN server ready, client IP: %s", i.vpnConfig.ClientVPNIP) return nil } // phase3ClientSetup configures the local VPN client func (i *Installer) phase3ClientSetup(ctx context.Context) error { + gateway := i.gatewayConfig() vpnClient := NewVPNClient(i.vpnConfig, i.logger) privateKey, publicKey, err := vpnClient.GenerateKeyPair(ctx) if err != nil { return err } - if err := vpnClient.CreateClientConfig(privateKey, i.options.Gateway.Port); err != nil { + if err := vpnClient.CreateClientConfig(privateKey, gateway.Port); err != nil { return err } @@ -322,7 +356,7 @@ func (i *Installer) phase3ClientSetup(ctx context.Context) error { if !vpnClient.TestConnection(ctx) { return fmt.Errorf("VPN connection failed") } - i.logger.Success("VPN connected: %s", i.vpnConfig.GatewayVPNIP) + i.logger.Infof("VPN connected: %s", i.vpnConfig.GatewayVPNIP) return nil } @@ -341,7 +375,7 @@ func (i *Installer) phase4NodeJoin(ctx context.Context) error { if err := AddHostsEntry(apiServerIP, i.clusterInfo.PrivateFQDN); err != nil { return fmt.Errorf("failed to add hosts entry: %w", err) } - i.logger.Success("API Server: %s (%s)", i.clusterInfo.PrivateFQDN, apiServerIP) + i.logger.Infof("API Server: %s (%s)", i.clusterInfo.PrivateFQDN, apiServerIP) _, _ = RunCommand(ctx, "swapoff", "-a") @@ -358,7 +392,7 @@ func (i *Installer) phase4NodeJoin(ctx context.Context) error { if _, err := RunCommand(ctx, "kubelogin", "convert-kubeconfig", "-l", "azurecli", "--kubeconfig", kubeconfigPath); err != nil { return fmt.Errorf("failed to convert kubeconfig: %w", err) } - i.logger.Success("Kubeconfig ready: %s", kubeconfigPath) + i.logger.Infof("Kubeconfig ready: %s", kubeconfigPath) return nil } diff --git a/pkg/privatecluster/privatecluster_test.go 
b/pkg/privatecluster/privatecluster_test.go index fd39dbe..79d055f 100644 --- a/pkg/privatecluster/privatecluster_test.go +++ b/pkg/privatecluster/privatecluster_test.go @@ -1,7 +1,10 @@ package privatecluster import ( + "context" "testing" + + "github.com/sirupsen/logrus" ) func TestParseResourceID(t *testing.T) { @@ -83,19 +86,6 @@ func TestDefaultConfigs(t *testing.T) { } } -func TestLogger(t *testing.T) { - // Just test that logger doesn't panic - logger := NewLogger(false) - logger.Info("test info") - logger.Success("test success") - logger.Warning("test warning") - logger.Error("test error") - logger.Verbose("should not print") // verbose=false - - loggerVerbose := NewLogger(true) - loggerVerbose.Verbose("should print") -} - func TestFileExists(t *testing.T) { // Test with existing file if !FileExists("types.go") { @@ -121,37 +111,55 @@ func TestCommandExists(t *testing.T) { } func TestInstallerCreation(t *testing.T) { - options := InstallOptions{ - AKSResourceID: "/subscriptions/xxx/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/cluster", - Verbose: true, - } - - // NewInstaller requires a credential; pass nil to test creation without Azure calls - installer, err := NewInstaller(options, nil) - // Expected to fail since nil credential can't create Azure clients - if err != nil { - t.Skipf("Skipping: NewInstaller requires valid Azure credential: %v", err) - } + logger := logrus.New() + installer := NewInstaller(logger) if installer == nil { t.Fatal("NewInstaller() should not return nil") } - if installer.logger == nil { - t.Error("Installer.logger should not be nil") + if installer.logger != logger { + t.Error("Installer.logger should match the provided logger") } } -func TestUninstallerCreation(t *testing.T) { - options := UninstallOptions{ - Mode: CleanupModeLocal, - AKSResourceID: "", +func TestInstallerGetName(t *testing.T) { + installer := NewInstaller(logrus.New()) + if name := installer.GetName(); name != "PrivateClusterInstall" { + t.Errorf("GetName() = %v, want PrivateClusterInstall", name) } +} - // NewUninstaller with empty resource ID and nil cred skips Azure client creation - uninstaller, err := NewUninstaller(options, nil) - if err != nil { - t.Fatalf("NewUninstaller() returned error: %v", err) +func TestInstallerIsCompletedNonPrivate(t *testing.T) { + // When config is nil (non-private cluster), IsCompleted should return true + installer := NewInstaller(logrus.New()) + installer.config = nil + if !installer.IsCompleted(context.Background()) { + t.Error("IsCompleted() should return true for non-private cluster") } +} + +func TestUninstallerCreation(t *testing.T) { + logger := logrus.New() + uninstaller := NewUninstaller(logger) if uninstaller == nil { - t.Error("NewUninstaller() should not return nil") + t.Fatal("NewUninstaller() should not return nil") + } + if uninstaller.logger != logger { + t.Error("Uninstaller.logger should match the provided logger") + } +} + +func TestUninstallerGetName(t *testing.T) { + uninstaller := NewUninstaller(logrus.New()) + if name := uninstaller.GetName(); name != "PrivateClusterUninstall" { + t.Errorf("GetName() = %v, want PrivateClusterUninstall", name) + } +} + +func TestUninstallerIsCompletedNonPrivate(t *testing.T) { + // When config is nil (non-private cluster), IsCompleted should return true + uninstaller := NewUninstaller(logrus.New()) + uninstaller.config = nil + if !uninstaller.IsCompleted(context.Background()) { + t.Error("IsCompleted() should return true for non-private cluster") } } diff --git 
a/pkg/privatecluster/ssh.go b/pkg/privatecluster/ssh.go index fe30150..dbc745a 100644 --- a/pkg/privatecluster/ssh.go +++ b/pkg/privatecluster/ssh.go @@ -6,16 +6,18 @@ import ( "os/exec" "strings" "time" + + "github.com/sirupsen/logrus" ) // SSHClient provides SSH operations to a remote host type SSHClient struct { config SSHConfig - logger *Logger + logger *logrus.Logger } // NewSSHClient creates a new SSHClient instance -func NewSSHClient(config SSHConfig, logger *Logger) *SSHClient { +func NewSSHClient(config SSHConfig, logger *logrus.Logger) *SSHClient { return &SSHClient{ config: config, logger: logger, @@ -76,12 +78,11 @@ func (s *SSHClient) TestConnection(ctx context.Context) bool { // WaitForConnection waits for SSH connection to be ready with retries func (s *SSHClient) WaitForConnection(ctx context.Context, maxAttempts int, interval time.Duration) error { - // Quick first check if s.TestConnection(ctx) { return nil } - s.logger.Info("Waiting for SSH connection to be ready...") + s.logger.Infof("Waiting for SSH connection to be ready...") for attempt := 1; attempt <= maxAttempts; attempt++ { select { @@ -94,7 +95,7 @@ func (s *SSHClient) WaitForConnection(ctx context.Context, maxAttempts int, inte return nil } - s.logger.Verbose("Waiting for SSH... (%d/%d)", attempt, maxAttempts) + s.logger.Debugf("Waiting for SSH... (%d/%d)", attempt, maxAttempts) } return fmt.Errorf("SSH connection timeout after %d attempts", maxAttempts) @@ -112,12 +113,10 @@ func (s *SSHClient) CommandExists(ctx context.Context, command string) bool { // GenerateSSHKey generates an SSH key pair func GenerateSSHKey(keyPath string) error { - // Check if key already exists if FileExists(keyPath) { return nil } - // Ensure directory exists if err := EnsureDirectory(GetRealHome() + "/.ssh"); err != nil { return err } @@ -127,7 +126,6 @@ func GenerateSSHKey(keyPath string) error { return fmt.Errorf("failed to generate SSH key: %w\nOutput: %s", err, string(output)) } - // Fix ownership if running with sudo return FixSSHKeyOwnership(keyPath) } diff --git a/pkg/privatecluster/tool_installer.go b/pkg/privatecluster/tool_installer.go index aee2beb..de85d06 100644 --- a/pkg/privatecluster/tool_installer.go +++ b/pkg/privatecluster/tool_installer.go @@ -3,43 +3,92 @@ package privatecluster import ( "context" "fmt" + "os" + "runtime" + "strings" + + "github.com/sirupsen/logrus" +) + +const ( + kubeloginVersion = "0.1.6" + kubeloginURLPattern = "https://github.com/Azure/kubelogin/releases/download/v%s/kubelogin-linux-%s.zip" + kubectlURLPattern = "https://acs-mirror.azureedge.net/kubernetes/v%s/bin/linux/%s/kubectl" ) -// ToolInstaller handles installation of CLI tools that cannot be replaced by SDK calls. +// ToolInstaller handles installation of CLI tools via direct downloads. type ToolInstaller struct { - logger *Logger + logger *logrus.Logger } // NewToolInstaller creates a new ToolInstaller instance. -func NewToolInstaller(logger *Logger) *ToolInstaller { +func NewToolInstaller(logger *logrus.Logger) *ToolInstaller { return &ToolInstaller{logger: logger} } -// InstallAKSCLI installs kubectl and kubelogin via Azure CLI. -func (t *ToolInstaller) InstallAKSCLI(ctx context.Context) error { - _, err := RunCommand(ctx, "az", "aks", "install-cli", - "--install-location", "/usr/local/bin/kubectl", - "--kubelogin-install-location", "/usr/local/bin/kubelogin") - if err != nil { - return fmt.Errorf("failed to install kubectl/kubelogin: %w", err) +// InstallKubelogin downloads and installs kubelogin binary. 
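For reference, the two URL patterns above are expanded with the pinned kubelogin version and the Kubernetes version taken from the node config. A standalone sketch of the resulting URLs (the kubectl version shown is only an example value; the agent uses `config.GetKubernetesVersion()`):

```go
// Standalone sketch: how the download URLs are composed. The constants
// mirror the ones defined above.
package main

import "fmt"

func main() {
	const (
		kubeloginVersion    = "0.1.6"
		kubeloginURLPattern = "https://github.com/Azure/kubelogin/releases/download/v%s/kubelogin-linux-%s.zip"
		kubectlURLPattern   = "https://acs-mirror.azureedge.net/kubernetes/v%s/bin/linux/%s/kubectl"
	)
	arch := "amd64" // goArch() returns runtime.GOARCH on the node
	fmt.Printf(kubeloginURLPattern+"\n", kubeloginVersion, arch)
	fmt.Printf(kubectlURLPattern+"\n", "1.29.2", arch) // "1.29.2" is illustrative only
}
```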
+func (t *ToolInstaller) InstallKubelogin(ctx context.Context) error { + if CommandExists("kubelogin") { + return nil } - _, _ = RunCommand(ctx, "chmod", "+x", "/usr/local/bin/kubectl", "/usr/local/bin/kubelogin") + arch := goArch() + url := fmt.Sprintf(kubeloginURLPattern, kubeloginVersion, arch) + zipPath := "/tmp/kubelogin.zip" + + t.logger.Infof("Downloading kubelogin v%s...", kubeloginVersion) + + if _, err := RunCommand(ctx, "curl", "-L", "-o", zipPath, url); err != nil { + return fmt.Errorf("failed to download kubelogin: %w", err) + } + defer func() { _ = os.Remove(zipPath) }() + + extractDir := "/tmp/kubelogin-extract" + _ = os.RemoveAll(extractDir) + if _, err := RunCommand(ctx, "unzip", "-o", zipPath, "-d", extractDir); err != nil { + return fmt.Errorf("failed to extract kubelogin: %w", err) + } + defer func() { _ = os.RemoveAll(extractDir) }() + + binaryPath := fmt.Sprintf("%s/bin/linux_%s/kubelogin", extractDir, arch) + if !FileExists(binaryPath) { + return fmt.Errorf("kubelogin binary not found at %s", binaryPath) + } + + if _, err := RunCommand(ctx, "cp", binaryPath, "/usr/local/bin/kubelogin"); err != nil { + return fmt.Errorf("failed to install kubelogin: %w", err) + } + _ = os.Chmod("/usr/local/bin/kubelogin", 0755) + + t.logger.Infof("kubelogin v%s installed", kubeloginVersion) return nil } -// InstallConnectedMachineExtension installs the connectedmachine Azure CLI extension. -func (t *ToolInstaller) InstallConnectedMachineExtension(ctx context.Context) error { - // Check if already installed - if RunCommandSilent(ctx, "az", "extension", "show", "--name", "connectedmachine") { +// InstallKubectl downloads and installs kubectl binary. +func (t *ToolInstaller) InstallKubectl(ctx context.Context, kubernetesVersion string) error { + if CommandExists("kubectl") { return nil } - _, _ = RunCommand(ctx, "az", "config", "set", "extension.dynamic_install_allow_preview=true", "--only-show-errors") + arch := goArch() + url := fmt.Sprintf(kubectlURLPattern, kubernetesVersion, arch) + + t.logger.Infof("Downloading kubectl v%s...", kubernetesVersion) + + if _, err := RunCommand(ctx, "curl", "-L", "-o", "/usr/local/bin/kubectl", url); err != nil { + return fmt.Errorf("failed to download kubectl: %w", err) + } + _ = os.Chmod("/usr/local/bin/kubectl", 0755) + + t.logger.Infof("kubectl v%s installed", kubernetesVersion) + return nil +} - _, err := RunCommand(ctx, "az", "extension", "add", - "--name", "connectedmachine", - "--allow-preview", "true", - "--only-show-errors") - return err +// goArch returns the Go-style architecture string. 
+func goArch() string { + arch := runtime.GOARCH + if arch == "" { + arch = "amd64" + } + return strings.TrimSpace(arch) } diff --git a/pkg/privatecluster/types.go b/pkg/privatecluster/types.go index 0706db8..04e7fcd 100644 --- a/pkg/privatecluster/types.go +++ b/pkg/privatecluster/types.go @@ -51,19 +51,6 @@ type SSHConfig struct { Timeout int } -// InstallOptions holds options for the install operation -type InstallOptions struct { - AKSResourceID string - Gateway GatewayConfig - Verbose bool -} - -// UninstallOptions holds options for the uninstall operation -type UninstallOptions struct { - Mode CleanupMode - AKSResourceID string -} - // DefaultGatewayConfig returns the default Gateway configuration func DefaultGatewayConfig() GatewayConfig { return GatewayConfig{ diff --git a/pkg/privatecluster/uninstaller.go b/pkg/privatecluster/uninstaller.go index bf3d9d0..953ecc7 100644 --- a/pkg/privatecluster/uninstaller.go +++ b/pkg/privatecluster/uninstaller.go @@ -4,17 +4,19 @@ import ( "context" "fmt" - "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/sirupsen/logrus" + + "go.goms.io/aks/AKSFlexNode/pkg/auth" + "go.goms.io/aks/AKSFlexNode/pkg/config" ) -// Uninstaller handles private cluster uninstallation +// Uninstaller handles private cluster VPN/Gateway teardown, implementing bootstrapper.Executor. type Uninstaller struct { - logger *Logger - azureClient *AzureClient - toolInstaller *ToolInstaller - options UninstallOptions + config *config.Config + logger *logrus.Logger + authProvider *auth.AuthProvider + azureClient *AzureClient - // State clusterInfo *AKSClusterInfo vpnConfig VPNConfig sshKeyPath string @@ -22,152 +24,144 @@ type Uninstaller struct { clientKey string } -// NewUninstaller creates a new Uninstaller instance. -// cred is the Azure credential used for SDK calls. If nil, Azure resource cleanup will be skipped. -func NewUninstaller(options UninstallOptions, cred azcore.TokenCredential) (*Uninstaller, error) { - logger := NewLogger(false) - - u := &Uninstaller{ - logger: logger, - toolInstaller: NewToolInstaller(logger), - options: options, - vpnConfig: DefaultVPNConfig(), - sshKeyPath: GetSSHKeyPath(), +// NewUninstaller creates a new private cluster Uninstaller. +func NewUninstaller(logger *logrus.Logger) *Uninstaller { + return &Uninstaller{ + config: config.GetConfig(), + logger: logger, + authProvider: auth.NewAuthProvider(), + vpnConfig: DefaultVPNConfig(), + sshKeyPath: GetSSHKeyPath(), } +} - // Only create Azure client if we have a resource ID (needed for full cleanup) - if options.AKSResourceID != "" && cred != nil { - subscriptionID, _, _, err := ParseResourceID(options.AKSResourceID) - if err != nil { - return nil, fmt.Errorf("failed to parse resource ID: %w", err) - } - azureClient, err := NewAzureClient(cred, subscriptionID, logger) - if err != nil { - return nil, fmt.Errorf("failed to create Azure client: %w", err) - } - u.azureClient = azureClient - } +// GetName returns the step name. +func (u *Uninstaller) GetName() string { + return "PrivateClusterUninstall" +} - return u, nil +// IsCompleted returns true for non-private clusters; always false for private clusters. 
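Both the installer and the uninstaller below derive the subscription, resource group, and cluster name from the configured AKS resource ID via `ParseResourceID` (see utils.go later in this patch). A worked example of that split, using placeholder values:

```go
// Standalone sketch of the ParseResourceID logic: components sit at fixed
// positions after splitting the resource ID on "/".
package main

import (
	"fmt"
	"strings"
)

func main() {
	id := "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/my-rg/providers/Microsoft.ContainerService/managedClusters/my-private-aks"
	parts := strings.Split(id, "/")
	// parts[2] = subscription ID, parts[4] = resource group, parts[8] = resource name
	fmt.Println(parts[2], parts[4], parts[8])
}
```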
+func (u *Uninstaller) IsCompleted(ctx context.Context) bool { + if !u.isPrivateCluster() { + return true + } + return false // Always attempt cleanup for private clusters } -// Uninstall runs the uninstallation process -func (u *Uninstaller) Uninstall(ctx context.Context) error { - fmt.Printf("%sRemove Edge Node from Private AKS Cluster%s\n", colorYellow, colorReset) - fmt.Printf("%s=====================================%s\n\n", colorYellow, colorReset) +// Execute runs the private cluster uninstallation. +func (u *Uninstaller) Execute(ctx context.Context) error { + if !u.isPrivateCluster() { + return nil + } + + u.logger.Infof("Remove Edge Node from Private AKS Cluster") + u.logger.Infof("=====================================") + + cleanupMode := u.config.Azure.TargetCluster.CleanupMode + var mode CleanupMode + switch cleanupMode { + case "local", "": + mode = CleanupModeLocal + case "full": + mode = CleanupModeFull + default: + return fmt.Errorf("invalid cleanup mode: %s (use 'local' or 'full')", cleanupMode) + } - // Parse resource ID if provided - if u.options.AKSResourceID != "" { - subscriptionID, resourceGroup, clusterName, err := ParseResourceID(u.options.AKSResourceID) + resourceID := u.config.GetTargetClusterID() + if resourceID != "" { + subscriptionID, resourceGroup, clusterName, err := ParseResourceID(resourceID) if err != nil { return err } u.clusterInfo = &AKSClusterInfo{ - ResourceID: u.options.AKSResourceID, + ResourceID: resourceID, SubscriptionID: subscriptionID, ResourceGroup: resourceGroup, ClusterName: clusterName, } - u.logger.Info("Cluster: %s/%s (Subscription: %s)", resourceGroup, clusterName, subscriptionID) + u.logger.Infof("Cluster: %s/%s (Subscription: %s)", resourceGroup, clusterName, subscriptionID) + + if mode == CleanupModeFull { + cred, err := u.authProvider.UserCredential(u.config) + if err != nil { + u.logger.Warnf("Failed to get Azure credential: %v", err) + } else { + azureClient, err := NewAzureClient(cred, subscriptionID, u.logger) + if err != nil { + u.logger.Warnf("Failed to create Azure client: %v", err) + } else { + u.azureClient = azureClient + } + } + } } - _ = u.toolInstaller.InstallConnectedMachineExtension(ctx) - - switch u.options.Mode { + switch mode { case CleanupModeLocal: return u.cleanupLocal(ctx) case CleanupModeFull: return u.cleanupFull(ctx) default: - return fmt.Errorf("invalid cleanup mode: %s", u.options.Mode) + return fmt.Errorf("invalid cleanup mode: %s", mode) } } +// isPrivateCluster checks if the config indicates a private cluster. +func (u *Uninstaller) isPrivateCluster() bool { + return u.config != nil && + u.config.Azure.TargetCluster != nil && + u.config.Azure.TargetCluster.IsPrivateCluster +} + // cleanupLocal performs local cleanup (keeps Gateway) func (u *Uninstaller) cleanupLocal(ctx context.Context) error { - u.logger.Info("Performing local cleanup (keeping Gateway)...") + u.logger.Infof("Performing local cleanup (keeping Gateway)...") hostname, err := GetHostname() if err != nil { return err } - // Get Gateway IP and client key from VPN config (before stopping VPN) u.readVPNConfig() - - // Remove node from cluster while VPN is still connected. - // This must happen here (not in bootstrapper) because the private cluster API server - // is only reachable through the VPN tunnel, which gets torn down below. - u.removeNodeFromCluster(ctx, hostname) - - // Note: stopFlexNodeAgent and removeArcAgent are handled by the bootstrapper's - // services.UnInstaller and arc.UnInstaller steps respectively. 
- - // Remove client peer from Gateway + u.removeNodeFromCluster(ctx, hostname) // Must happen while VPN is still up u.removeClientPeerFromGateway(ctx) - - // Stop VPN u.stopVPN(ctx) - - // Delete VPN client configuration u.deleteVPNConfig() - - // Clean up hosts entries u.cleanupHostsEntries() - // Note: config.json is preserved for potential re-use - - fmt.Println() - u.logger.Success("Local cleanup completed!") - fmt.Println() - fmt.Println("To rejoin cluster, run:") - fmt.Println(" sudo ./aks-flex-node agent --config config.json # with private: true") + u.logger.Infof("Local cleanup completed!") + u.logger.Infof("To rejoin cluster, run:") + u.logger.Infof(" sudo ./aks-flex-node agent --config config.json # with private: true") return nil } // cleanupFull performs full cleanup (removes all Azure resources) func (u *Uninstaller) cleanupFull(ctx context.Context) error { - u.logger.Info("Performing full cleanup...") + u.logger.Infof("Performing full cleanup...") hostname, err := GetHostname() if err != nil { return err } - // Get Gateway IP and client key from VPN config (before stopping VPN) u.readVPNConfig() - - // Remove node from cluster while VPN is still connected (see comment in cleanupLocal) - u.removeNodeFromCluster(ctx, hostname) - - // Remove client peer from Gateway + u.removeNodeFromCluster(ctx, hostname) // Must happen while VPN is still up u.removeClientPeerFromGateway(ctx) - - // Stop VPN u.stopVPN(ctx) - - // Delete VPN client configuration u.deleteVPNConfig() - - // Clean up hosts entries u.cleanupHostsEntries() - // Delete Azure resources if err := u.deleteAzureResources(ctx); err != nil { - u.logger.Warning("Failed to delete some Azure resources: %v", err) + u.logger.Warnf("Failed to delete some Azure resources: %v", err) } - // Delete SSH keys u.deleteSSHKeys() - // Note: config.json is preserved for potential re-use - - fmt.Println() - u.logger.Success("Full cleanup completed!") - fmt.Println() - fmt.Println("All components and Azure resources have been removed.") - fmt.Println("The local machine is now clean.") + u.logger.Infof("Full cleanup completed!") + u.logger.Infof("All components and Azure resources have been removed.") + u.logger.Infof("The local machine is now clean.") return nil } @@ -188,22 +182,20 @@ func (u *Uninstaller) removeNodeFromCluster(ctx context.Context, nodeName string return } - u.logger.Info("Removing node %s from cluster...", nodeName) + u.logger.Infof("Removing node %s from cluster...", nodeName) - // Try root kubeconfig first if _, err := RunCommand(ctx, "kubectl", "--kubeconfig", "/root/.kube/config", "delete", "node", nodeName, "--ignore-not-found"); err == nil { - u.logger.Success("Node removed from cluster") + u.logger.Infof("Node removed from cluster") return } - // Try default kubeconfig if _, err := RunCommand(ctx, "kubectl", "delete", "node", nodeName, "--ignore-not-found"); err == nil { - u.logger.Success("Node removed from cluster") + u.logger.Infof("Node removed from cluster") return } - u.logger.Warning("Failed to remove node from cluster (may need manual cleanup: kubectl delete node %s)", nodeName) + u.logger.Warnf("Failed to remove node from cluster (may need manual cleanup: kubectl delete node %s)", nodeName) } // removeClientPeerFromGateway removes this client's peer from the Gateway @@ -212,50 +204,48 @@ func (u *Uninstaller) removeClientPeerFromGateway(ctx context.Context) { return } - u.logger.Info("Removing client peer from Gateway...") + u.logger.Infof("Removing client peer from Gateway...") - // Get public key from 
private key vpnClient := NewVPNClient(u.vpnConfig, u.logger) clientPubKey, err := vpnClient.GetPublicKeyFromPrivate(ctx, u.clientKey) if err != nil || clientPubKey == "" { return } - // Connect to Gateway and remove peer sshConfig := DefaultSSHConfig(u.sshKeyPath, u.gatewayIP) sshConfig.Timeout = 10 ssh := NewSSHClient(sshConfig, u.logger) vpnServer := NewVPNServerManager(ssh, u.logger) _ = vpnServer.RemovePeer(ctx, clientPubKey) - u.logger.Success("Client peer removed from Gateway") + u.logger.Infof("Client peer removed from Gateway") } // stopVPN stops the VPN connection func (u *Uninstaller) stopVPN(ctx context.Context) { vpnClient := NewVPNClient(u.vpnConfig, u.logger) _ = vpnClient.Stop(ctx) - u.logger.Success("VPN connection stopped") + u.logger.Infof("VPN connection stopped") } // deleteVPNConfig deletes the VPN client configuration func (u *Uninstaller) deleteVPNConfig() { vpnClient := NewVPNClient(u.vpnConfig, u.logger) _ = vpnClient.RemoveClientConfig() - u.logger.Success("VPN config deleted") + u.logger.Infof("VPN config deleted") } // cleanupHostsEntries removes AKS-related entries from /etc/hosts func (u *Uninstaller) cleanupHostsEntries() { _ = RemoveHostsEntries("privatelink") _ = RemoveHostsEntries("azmk8s.io") - u.logger.Success("Hosts entries cleaned") + u.logger.Infof("Hosts entries cleaned") } // deleteSSHKeys deletes the Gateway SSH keys func (u *Uninstaller) deleteSSHKeys() { _ = RemoveSSHKeys(u.sshKeyPath) - u.logger.Success("SSH keys deleted") + u.logger.Infof("SSH keys deleted") } // deleteAzureResources deletes all Azure resources created for the Gateway @@ -264,7 +254,7 @@ func (u *Uninstaller) deleteAzureResources(ctx context.Context) error { return fmt.Errorf("cluster info or Azure client not available") } - u.logger.Info("Deleting Azure resources...") + u.logger.Infof("Deleting Azure resources...") gatewayName := "wg-gateway" nicName := gatewayName + "VMNic" @@ -272,16 +262,16 @@ func (u *Uninstaller) deleteAzureResources(ctx context.Context) error { nsgName := gatewayName + "-nsg" if err := u.azureClient.DeleteVM(ctx, u.clusterInfo.ResourceGroup, gatewayName); err != nil { - u.logger.Warning("Delete VM: %v", err) + u.logger.Warnf("Delete VM: %v", err) } if err := u.azureClient.DeleteNIC(ctx, u.clusterInfo.ResourceGroup, nicName); err != nil { - u.logger.Warning("Delete NIC: %v", err) + u.logger.Warnf("Delete NIC: %v", err) } if err := u.azureClient.DeletePublicIP(ctx, u.clusterInfo.ResourceGroup, pipName); err != nil { - u.logger.Warning("Delete Public IP: %v", err) + u.logger.Warnf("Delete Public IP: %v", err) } if err := u.azureClient.DeleteNSG(ctx, u.clusterInfo.ResourceGroup, nsgName); err != nil { - u.logger.Warning("Delete NSG: %v", err) + u.logger.Warnf("Delete NSG: %v", err) } _ = u.azureClient.DeleteDisks(ctx, u.clusterInfo.ResourceGroup, gatewayName) @@ -292,7 +282,7 @@ func (u *Uninstaller) deleteAzureResources(ctx context.Context) error { _ = u.azureClient.DeleteSubnet(ctx, vnetRG, vnetName, "wg-subnet") } } - u.logger.Success("Azure resources deleted") + u.logger.Infof("Azure resources deleted") return nil } diff --git a/pkg/privatecluster/utils.go b/pkg/privatecluster/utils.go index 49f7a2b..e7f20c2 100644 --- a/pkg/privatecluster/utils.go +++ b/pkg/privatecluster/utils.go @@ -11,57 +11,6 @@ import ( "strings" ) -// Color codes for terminal output -const ( - colorRed = "\033[0;31m" - colorGreen = "\033[0;32m" - colorYellow = "\033[1;33m" - colorBlue = "\033[0;34m" - colorReset = "\033[0m" -) - -// Logger provides colored logging for the private 
cluster operations -type Logger struct { - verbose bool -} - -// NewLogger creates a new Logger instance -func NewLogger(verbose bool) *Logger { - return &Logger{verbose: verbose} -} - -// Info logs an info message -func (l *Logger) Info(format string, args ...interface{}) { - msg := fmt.Sprintf(format, args...) - fmt.Printf("%sINFO:%s %s\n", colorBlue, colorReset, msg) -} - -// Success logs a success message -func (l *Logger) Success(format string, args ...interface{}) { - msg := fmt.Sprintf(format, args...) - fmt.Printf("%sSUCCESS:%s %s\n", colorGreen, colorReset, msg) -} - -// Warning logs a warning message -func (l *Logger) Warning(format string, args ...interface{}) { - msg := fmt.Sprintf(format, args...) - fmt.Printf("%sWARNING:%s %s\n", colorYellow, colorReset, msg) -} - -// Error logs an error message -func (l *Logger) Error(format string, args ...interface{}) { - msg := fmt.Sprintf(format, args...) - fmt.Printf("%sERROR:%s %s\n", colorRed, colorReset, msg) -} - -// Verbose logs a verbose message (only if verbose mode is enabled) -func (l *Logger) Verbose(format string, args ...interface{}) { - if l.verbose { - msg := fmt.Sprintf(format, args...) - fmt.Printf("%sVERBOSE:%s %s\n", colorBlue, colorReset, msg) - } -} - // RunCommand executes a command and returns its output func RunCommand(ctx context.Context, name string, args ...string) (string, error) { cmd := exec.CommandContext(ctx, name, args...) // #nosec G204 -- commands are from trusted internal code @@ -87,13 +36,11 @@ func CommandExists(name string) bool { // GetRealHome returns the real user's home directory (handles sudo) func GetRealHome() string { - // Check if running with sudo if sudoUser := os.Getenv("SUDO_USER"); sudoUser != "" { if u, err := user.Lookup(sudoUser); err == nil { return u.HomeDir } } - // Fallback to current user's home if home := os.Getenv("HOME"); home != "" { return home } @@ -137,7 +84,6 @@ func WriteFileContent(path, content string, perm os.FileMode) error { func AddHostsEntry(ip, hostname string) error { hostsPath := "/etc/hosts" - // Check if entry already exists content, err := ReadFileContent(hostsPath) if err != nil { return fmt.Errorf("failed to read hosts file: %w", err) @@ -189,7 +135,6 @@ func RemoveHostsEntries(pattern string) error { // ParseResourceID parses an Azure resource ID and returns its components func ParseResourceID(resourceID string) (subscriptionID, resourceGroup, resourceName string, err error) { - // Normalize: Azure CLI sometimes returns lowercase 'resourcegroups' resourceID = strings.Replace(resourceID, "/resourcegroups/", "/resourceGroups/", 1) parts := strings.Split(resourceID, "/") @@ -197,7 +142,6 @@ func ParseResourceID(resourceID string) (subscriptionID, resourceGroup, resource return "", "", "", fmt.Errorf("invalid resource ID format: %s", resourceID) } - // Format: /subscriptions/{sub}/resourceGroups/{rg}/providers/{provider}/{type}/{name} subscriptionID = parts[2] resourceGroup = parts[4] resourceName = parts[8] @@ -221,7 +165,6 @@ func FixSSHKeyOwnership(keyPath string) error { return fmt.Errorf("failed to lookup user %s: %w", sudoUser, err) } - // Change ownership of both private and public keys for _, path := range []string{keyPath, keyPath + ".pub"} { if FileExists(path) { cmd := exec.Command("chown", fmt.Sprintf("%s:%s", u.Uid, u.Gid), path) // #nosec G204 -- chown with uid/gid diff --git a/pkg/privatecluster/vpn.go b/pkg/privatecluster/vpn.go index 537a1a7..6b7a103 100644 --- a/pkg/privatecluster/vpn.go +++ b/pkg/privatecluster/vpn.go @@ -6,16 +6,18 @@ 
import ( "os/exec" "strconv" "strings" + + "github.com/sirupsen/logrus" ) // VPNClient provides VPN (WireGuard) operations type VPNClient struct { config VPNConfig - logger *Logger + logger *logrus.Logger } // NewVPNClient creates a new VPNClient instance -func NewVPNClient(config VPNConfig, logger *Logger) *VPNClient { +func NewVPNClient(config VPNConfig, logger *logrus.Logger) *VPNClient { return &VPNClient{ config: config, logger: logger, @@ -24,7 +26,6 @@ func NewVPNClient(config VPNConfig, logger *Logger) *VPNClient { // GenerateKeyPair generates a WireGuard key pair and returns (privateKey, publicKey) func (v *VPNClient) GenerateKeyPair(ctx context.Context) (string, string, error) { - // Generate private key privateKey, err := RunCommand(ctx, "wg", "genkey") if err != nil { return "", "", fmt.Errorf("failed to generate VPN private key: %w", err) @@ -104,14 +105,12 @@ func (v *VPNClient) GetClientConfigInfo() (gatewayIP, privateKey string, err err return "", "", fmt.Errorf("failed to read VPN config: %w", err) } - // Parse Endpoint to get Gateway IP for _, line := range strings.Split(content, "\n") { line = strings.TrimSpace(line) if strings.HasPrefix(line, "Endpoint") { parts := strings.SplitN(line, "=", 2) if len(parts) == 2 { endpoint := strings.TrimSpace(parts[1]) - // Remove port gatewayIP = strings.Split(endpoint, ":")[0] } } @@ -140,11 +139,11 @@ func (v *VPNClient) GetPublicKeyFromPrivate(ctx context.Context, privateKey stri // VPNServerManager manages VPN server on the Gateway type VPNServerManager struct { ssh *SSHClient - logger *Logger + logger *logrus.Logger } // NewVPNServerManager creates a new VPNServerManager instance -func NewVPNServerManager(ssh *SSHClient, logger *Logger) *VPNServerManager { +func NewVPNServerManager(ssh *SSHClient, logger *logrus.Logger) *VPNServerManager { return &VPNServerManager{ ssh: ssh, logger: logger, @@ -214,13 +213,11 @@ func (m *VPNServerManager) GetPeerCount(ctx context.Context) (int, error) { // AddPeer adds a client peer to the server func (m *VPNServerManager) AddPeer(ctx context.Context, clientPublicKey, clientIP string) error { - // Add peer cmd := fmt.Sprintf("sudo wg set wg0 peer '%s' allowed-ips %s/32", clientPublicKey, clientIP) if _, err := m.ssh.Execute(ctx, cmd); err != nil { return fmt.Errorf("failed to add peer: %w", err) } - // Persist configuration if _, err := m.ssh.Execute(ctx, "sudo wg-quick save wg0"); err != nil { return fmt.Errorf("failed to save VPN config: %w", err) } @@ -246,7 +243,7 @@ func (m *VPNServerManager) ResolveDNS(ctx context.Context, hostname string) (str } // InstallVPNTools installs VPN tools locally -func InstallVPNTools(ctx context.Context, logger *Logger) error { +func InstallVPNTools(ctx context.Context, logger *logrus.Logger) error { if CommandExists("wg") { return nil } @@ -256,4 +253,3 @@ func InstallVPNTools(ctx context.Context, logger *Logger) error { _, err := RunCommand(ctx, "apt-get", "install", "-y", "wireguard-tools") return err } - From b304460c1fcc1689d07adfcf01497439c67a5bda Mon Sep 17 00:00:00 2001 From: weiliu2 Date: Thu, 12 Feb 2026 04:47:26 +1300 Subject: [PATCH 11/11] update docs --- pkg/privatecluster/README.md | 15 +++++++++------ pkg/privatecluster/create_private_cluster.md | 4 ++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pkg/privatecluster/README.md b/pkg/privatecluster/README.md index eba1fd8..9b39518 100644 --- a/pkg/privatecluster/README.md +++ b/pkg/privatecluster/README.md @@ -2,12 +2,14 @@ ## Prerequisites -### 1. 
Login to Azure CLI
+### 1. Login to Azure CLI
 
 ```bash
-sudo az login
+az login
 ```
 
+> **Note:** When running the agent with `sudo`, use `sudo -E` to preserve your Azure CLI token. Alternatively, run `sudo az login` to log in as root directly.
+
 ### 2. Create a Private AKS Cluster
 
 Create a Private AKS cluster with AAD and Azure RBAC enabled, and assign the required roles to your user.
@@ -29,6 +31,7 @@ Create a `config.json` with `"private": true` in the `targetCluster` section:
     "private": true
   },
   "arc": {
+    "enabled": true,
     "resourceGroup": "",
     "location": "eastus2"
   }
@@ -60,7 +63,7 @@ go build -o aks-flex-node .
 When the config has `"private": true`, the `agent` command automatically sets up the Gateway/VPN before bootstrapping:
 
 ```bash
-sudo ./aks-flex-node agent --config config.json
+sudo -E ./aks-flex-node agent --config config.json
 ```
 
 This will:
@@ -72,7 +75,7 @@ This will:
 ### 3. Verify
 
 ```bash
-sudo kubectl get nodes
+kubectl get nodes
 ```
 
 ## Leave Private AKS Cluster
@@ -87,8 +90,8 @@ sudo ./aks-flex-node unbootstrap --config config.json [--cleanup-mode