diff --git a/pmm/v3/pmm3-ha-eks-cleanup.groovy b/pmm/v3/pmm3-ha-eks-cleanup.groovy
index cada78b616..942407450c 100644
--- a/pmm/v3/pmm3-ha-eks-cleanup.groovy
+++ b/pmm/v3/pmm3-ha-eks-cleanup.groovy
@@ -1,33 +1,60 @@
+/**
+ * PMM HA EKS Cleanup Pipeline
+ *
+ * Manages cleanup of PMM HA test clusters. Supports manual and scheduled runs.
+ * Deletes Route53 records, ALB ingress, and EKS clusters.
+ *
+ * Actions:
+ * - LIST_ONLY: List all test clusters with age
+ * - DELETE_CLUSTER: Delete a specific cluster
+ * - DELETE_ALL: Delete all test clusters (respects SKIP_NEWEST and retention tags)
+ * - DELETE_OLD (cron): Delete expired/untagged clusters + cleanup orphaned resources
+ * - CLEANUP_ORPHANS: Delete orphaned VPCs and failed CF stacks
+ *
+ * Related:
+ * - Create: pmm3-ha-eks.groovy
+ * - Shared library: vars/pmmHaEks.groovy
+ */
+library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retriever: modernSCM([
+ $class: 'GitSCMSource',
+ remote: 'https://github.com/Percona-Lab/jenkins-pipelines'
+])
+
pipeline {
agent {
- label 'agent-amd64-ol9'
+ label 'cli'
}
triggers {
- cron('H 0,12 * * *') // Runs twice daily at 00:00 & 12:00
+ cron('H 0,12 * * *')
}
parameters {
choice(
name: 'ACTION',
- choices: ['LIST_ONLY', 'DELETE_CLUSTER', 'DELETE_ALL'],
+ choices: ['LIST_ONLY', 'DELETE_CLUSTER', 'DELETE_ALL', 'CLEANUP_ORPHANS'],
description: '''
LIST_ONLY - list all test clusters
DELETE_CLUSTER - delete a specific cluster (requires CLUSTER_NAME)
- DELETE_ALL - delete all test clusters
+ DELETE_ALL - delete all test clusters
+ CLEANUP_ORPHANS - delete orphaned VPCs and failed CF stacks
-            Note: Daily cron automatically deletes clusters older than 1 day.
+            Note: The scheduled cron run deletes clusters whose delete-after retention tag has expired (or is missing), always keeping the newest cluster.
'''
)
string(name: 'CLUSTER_NAME', defaultValue: '', description: 'Required only for DELETE_CLUSTER')
+ booleanParam(name: 'SKIP_NEWEST', defaultValue: true, description: 'Skip the most recent cluster (protects in-progress builds)')
}
options {
buildDiscarder(logRotator(numToKeepStr: '30'))
+ disableConcurrentBuilds()
+ timeout(time: 60, unit: 'MINUTES')
}
environment {
- REGION = "us-east-2"
- CLUSTER_PREFIX = "pmm-ha-test-"
+ REGION = 'us-east-2'
+ CLUSTER_PREFIX = "${pmmHaEks.CLUSTER_PREFIX}"
+ R53_ZONE_NAME = 'cd.percona.com'
}
stages {
@@ -36,14 +63,14 @@ pipeline {
script {
if (currentBuild.getBuildCauses('hudson.triggers.TimerTrigger$TimerTriggerCause')) {
env.ACTION = 'DELETE_OLD'
- echo "Triggered by cron - will delete clusters older than 1 day."
+                        echo 'Triggered by cron - deleting clusters with expired or missing delete-after retention tags and cleaning up orphaned resources.'
} else {
env.ACTION = params.ACTION
echo "Manual run with ACTION=${params.ACTION}"
}
if (env.ACTION == 'DELETE_CLUSTER' && !params.CLUSTER_NAME) {
- error("CLUSTER_NAME is required for DELETE_CLUSTER.")
+ error('CLUSTER_NAME is required for DELETE_CLUSTER.')
}
if (params.CLUSTER_NAME && !params.CLUSTER_NAME.startsWith(env.CLUSTER_PREFIX)) {
error("Cluster name must start with ${env.CLUSTER_PREFIX}")
@@ -56,29 +83,30 @@ pipeline {
when { expression { env.ACTION == 'LIST_ONLY' } }
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- set +x
-
- CLUSTERS=$(aws eks list-clusters --region "$REGION" \
- --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" \
- --output text)
-
- if [ -z "$CLUSTERS" ]; then
- echo "No clusters found with prefix '${CLUSTER_PREFIX}'."
- exit 0
- fi
-
- for c in $CLUSTERS; do
- CREATED=$(aws eks describe-cluster \
- --name "$c" --region "$REGION" \
- --query "cluster.createdAt" --output text)
-
- CREATED_EPOCH=$(date -d "$CREATED" +%s)
- AGE_HOURS=$(( ( $(date +%s) - CREATED_EPOCH ) / 3600 ))
-
- echo "• $c | Created: $CREATED | Age: ${AGE_HOURS}h"
- done
- '''
+ script {
+ def clusters = pmmHaEks.listClusters(env.REGION)
+
+ if (!clusters) {
+ echo "No clusters found with prefix '${env.CLUSTER_PREFIX}'."
+ return
+ }
+
+ echo "Found ${clusters.size()} cluster(s):"
+ clusters.each { clusterName ->
+ def info = sh(
+ script: """
+ CREATED=\$(aws eks describe-cluster --name ${clusterName} --region ${env.REGION} \
+ --query 'cluster.createdAt' --output text)
+ CREATED_EPOCH=\$(date -d "\${CREATED}" +%s)
+ AGE_HOURS=\$(( ( \$(date +%s) - CREATED_EPOCH ) / 3600 ))
+ echo "\${CREATED}|\${AGE_HOURS}"
+ """,
+ returnStdout: true
+ ).trim()
+ def parts = info.split('\\|')
+ echo "* ${clusterName} | Created: ${parts[0]} | Age: ${parts[1]}h"
+ }
+ }
}
}
}
@@ -87,15 +115,22 @@ pipeline {
when { expression { env.ACTION == 'DELETE_CLUSTER' } }
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- if ! aws eks describe-cluster --region "${REGION}" --name "${CLUSTER_NAME}" >/dev/null 2>&1; then
- echo "Cluster '${CLUSTER_NAME}' not found in region '${REGION}'."
- exit 0
- fi
-
- eksctl delete cluster --region "${REGION}" --name "${CLUSTER_NAME}" \
- --disable-nodegroup-eviction --wait
- '''
+ script {
+ def clusterExists = sh(
+ script: "aws eks describe-cluster --region ${REGION} --name ${params.CLUSTER_NAME} >/dev/null 2>&1",
+ returnStatus: true
+ ) == 0
+
+ if (clusterExists) {
+ pmmHaEks.deleteCluster(
+ clusterName: params.CLUSTER_NAME,
+ region: env.REGION,
+ r53ZoneName: env.R53_ZONE_NAME
+ )
+ } else {
+ echo "Cluster '${params.CLUSTER_NAME}' not found in region '${REGION}'."
+ }
+ }
}
}
}
@@ -104,20 +139,14 @@ pipeline {
when { expression { env.ACTION == 'DELETE_ALL' } }
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- CLUSTERS=$(aws eks list-clusters --region "$REGION" \
- --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" --output text)
-
- if [ -z "$CLUSTERS" ]; then
- echo "No clusters found with prefix '${CLUSTER_PREFIX}'."
- exit 0
- fi
-
- for c in $CLUSTERS; do
- eksctl delete cluster --region "$REGION" --name "$c" \
- --disable-nodegroup-eviction --wait
- done
- '''
+ script {
+ pmmHaEks.deleteAllClusters(
+ region: env.REGION,
+ r53ZoneName: env.R53_ZONE_NAME,
+ skipNewest: params.SKIP_NEWEST,
+                            respectRetention: true // delete-after retention tags are still respected
+ )
+ }
}
}
}
@@ -126,36 +155,26 @@ pipeline {
when { expression { env.ACTION == 'DELETE_OLD' } }
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- CLUSTERS=$(aws eks list-clusters --region "$REGION" \
- --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" --output text)
-
- if [ -z "$CLUSTERS" ]; then
- echo "No clusters found with prefix '${CLUSTER_PREFIX}'."
- exit 0
- fi
-
- CUTOFF=$(date -d "1 day ago" +%s)
-
- for c in $CLUSTERS; do
- CREATED=$(aws eks describe-cluster --name "$c" --region "$REGION" \
- --query "cluster.createdAt" --output text 2>/dev/null || true)
-
- if [ -z "$CREATED" ] || [ "$CREATED" == "None" ]; then
- echo "Unable to fetch creation time for $c — skipping."
- continue
- fi
-
- CREATED_EPOCH=$(date -d "$CREATED" +%s)
+ script {
+ pmmHaEks.deleteAllClusters(
+ region: env.REGION,
+ r53ZoneName: env.R53_ZONE_NAME,
+ skipNewest: true // Always protect newest during cron
+ )
+ // Also clean up orphaned resources during cron
+ pmmHaEks.cleanupOrphans(region: env.REGION)
+ }
+ }
+ }
+ }
- if [ "$CREATED_EPOCH" -lt "$CUTOFF" ]; then
- eksctl delete cluster --region "$REGION" --name "$c" \
- --disable-nodegroup-eviction --wait
- else
- echo "Skipping recent cluster: $c (created within last 24h)"
- fi
- done
- '''
+ stage('Cleanup Orphan Resources') {
+ when { expression { env.ACTION == 'CLEANUP_ORPHANS' } }
+ steps {
+ withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
+ script {
+ pmmHaEks.cleanupOrphans(region: env.REGION)
+ }
}
}
}
diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy
index 580bec6446..21e2f1d69d 100644
--- a/pmm/v3/pmm3-ha-eks.groovy
+++ b/pmm/v3/pmm3-ha-eks.groovy
@@ -1,26 +1,86 @@
+/**
+ * PMM HA EKS Test Pipeline
+ *
+ * Creates an EKS cluster with PMM High Availability deployment for testing.
+ * Includes ALB ingress with ACM certificate and Route53 DNS.
+ *
+ * Related:
+ * - Cleanup: pmm3-ha-eks-cleanup.groovy
+ * - Shared library: vars/pmmHaEks.groovy
+ */
+library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retriever: modernSCM([
+ $class: 'GitSCMSource',
+ remote: 'https://github.com/Percona-Lab/jenkins-pipelines'
+])
+
pipeline {
agent {
- label 'agent-amd64-ol9'
+ label 'cli'
+ }
+
+ options {
+ disableConcurrentBuilds()
+ timeout(time: 90, unit: 'MINUTES')
}
parameters {
choice(
name: 'K8S_VERSION',
- choices: ['1.32', '1.31', '1.30', '1.29', '1.28'],
+ choices: ['1.32', '1.33', '1.31', '1.30', '1.29'],
description: 'Select Kubernetes cluster version'
)
+ // PMM HA charts are not yet merged to percona/percona-helm-charts main branch.
+ // theTibi/PMM-14420 contains both pmm-ha and pmm-ha-dependencies charts.
+ // Once merged to percona main, update default to 'main' and swap repo priority.
+ string(
+ name: 'HELM_CHART_BRANCH',
+ defaultValue: 'PMM-14420',
+ description: 'Branch of percona-helm-charts repo (theTibi/PMM-14420 has both pmm-ha and pmm-ha-dependencies)'
+ )
+ string(
+ name: 'PMM_IMAGE_TAG',
+ defaultValue: '',
+ description: 'PMM Server image tag (leave empty for chart default)'
+ )
+ string(
+ name: 'RETENTION_DAYS',
+ defaultValue: '1',
+ description: 'Days to retain cluster before auto-deletion by cleanup job (1-7, default: 1)'
+ )
+ password(
+ name: 'PMM_ADMIN_PASSWORD',
+ defaultValue: '',
+ description: 'PMM admin password (leave empty for auto-generated 16-char password)'
+ )
}
- environment {
- CLUSTER_NAME = "pmm-ha-test-${BUILD_NUMBER}"
- REGION = "us-east-2"
+ environment {
+ CLUSTER_NAME = "${pmmHaEks.CLUSTER_PREFIX}${BUILD_NUMBER}"
+ REGION = 'us-east-2'
KUBECONFIG = "${WORKSPACE}/kubeconfig/config"
+ PMM_NAMESPACE = 'pmm'
+ R53_ZONE_NAME = 'cd.percona.com'
+ PMM_DOMAIN = "${pmmHaEks.CLUSTER_PREFIX}${BUILD_NUMBER}.${R53_ZONE_NAME}"
}
stages {
stage('Write Cluster Config') {
steps {
- sh '''
+ withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
+ script {
+ env.VALIDATED_RETENTION_DAYS = pmmHaEks.validateRetentionDays(params.RETENTION_DAYS)
+ echo "Retention: ${env.VALIDATED_RETENTION_DAYS} days"
+ }
+ sh '''
+ # Calculate delete-after timestamp (epoch seconds)
+ DELETE_AFTER_EPOCH=$(($(date +%s) + (VALIDATED_RETENTION_DAYS * 24 * 60 * 60)))
+ echo "Delete after: $(date -d @${DELETE_AFTER_EPOCH} 2>/dev/null || echo ${DELETE_AFTER_EPOCH})"
+
+ # Discover available AZs dynamically
+ AZS=$(aws ec2 describe-availability-zones --region "${REGION}" \
+ --query 'AvailabilityZones[?State==`available`].ZoneName' \
+ --output json)
+
cat > cluster-config.yaml <= pmmHaEks.MAX_CLUSTERS) {
+ error("Maximum limit of ${pmmHaEks.MAX_CLUSTERS} test clusters reached.")
+ }
- if [ "$EXISTING_COUNT" -ge 5 ]; then
- echo "ERROR: Maximum limit of 5 test clusters reached."
- exit 1
- fi
+ echo "Cluster count: ${count} / ${pmmHaEks.MAX_CLUSTERS}"
+ }
+ }
+ }
+ }
- echo "Existing clusters: $EXISTING_COUNT / 5"
- '''
+ stage('Validate Helm Chart') {
+ steps {
+ script {
+ pmmHaEks.validateHelmChart(params.HELM_CHART_BRANCH)
}
}
}
@@ -98,23 +168,29 @@ EOF
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
sh '''
eksctl create cluster -f cluster-config.yaml --timeout=40m --verbose=4
-
- # Map EKSAdminRole for IAM users
- eksctl create iamidentitymapping \
- --cluster "${CLUSTER_NAME}" \
- --region "${REGION}" \
- --arn arn:aws:iam::119175775298:role/EKSAdminRole \
- --username eks-admin \
- --group system:masters
'''
}
}
}
+ stage('Configure Cluster Access') {
+ steps {
+ withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
+ script {
+ pmmHaEks.configureAccess(
+ clusterName: env.CLUSTER_NAME,
+ region: env.REGION
+ )
+ }
+ }
+ }
+ }
+
stage('Export kubeconfig') {
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
sh '''
+ rm -rf kubeconfig
mkdir -p kubeconfig
aws eks update-kubeconfig \
@@ -129,50 +205,54 @@ EOF
}
}
- stage('Configure GP3 Storage Class') {
+ stage('Setup Infrastructure') {
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'
-
-                        cat <<EOF | kubectl apply -f -
+                    def chartRepo = sh(script: "cat .chart-repo-source 2>/dev/null || echo 'unknown'", returnStdout: true).trim()
+
+ currentBuild.description = "https://${PMM_DOMAIN} | admin / ${creds.pmm} | ${chartRepo}/${HELM_CHART_BRANCH}"
+ }
+ }
}
failure {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- if eksctl get cluster \
- --region "${REGION}" \
- --name "${CLUSTER_NAME}" >/dev/null 2>&1
- then
- eksctl delete cluster \
- --region "${REGION}" \
- --name "${CLUSTER_NAME}" \
- --disable-nodegroup-eviction \
- --wait
- fi
- '''
+ script {
+ def clusterExists = sh(
+ script: "eksctl get cluster --region ${REGION} --name ${CLUSTER_NAME} >/dev/null 2>&1",
+ returnStatus: true
+ ) == 0
+
+ if (clusterExists) {
+ pmmHaEks.deleteCluster(
+ clusterName: env.CLUSTER_NAME,
+ region: env.REGION,
+ r53ZoneName: env.R53_ZONE_NAME
+ )
+ } else {
+ echo "Cluster ${CLUSTER_NAME} not found, nothing to clean up."
+ }
+ }
}
}
}
diff --git a/vars/pmmHaEks.groovy b/vars/pmmHaEks.groovy
new file mode 100644
index 0000000000..67f739297d
--- /dev/null
+++ b/vars/pmmHaEks.groovy
@@ -0,0 +1,957 @@
+/**
+ * PMM HA EKS Shared Library
+ *
+ * Reusable functions for PMM High Availability testing on EKS clusters.
+ *
+ * Sections:
+ * 1. Constants
+ * 2. Validation Helpers
+ * 3. AWS Resource Resolution
+ * 4. Credential Management
+ * 5. EKS Cluster Setup
+ * 6. PMM Installation
+ * 7. Cluster Lifecycle (list, delete, cleanup)
+ *
+ * Related:
+ * - Create pipeline: pmm/v3/pmm3-ha-eks.groovy
+ * - Cleanup pipeline: pmm/v3/pmm3-ha-eks-cleanup.groovy
+ */
+
+import groovy.transform.Field
+
+// ============================================
+// 1. CONSTANTS
+// ============================================
+
+@Field static final String CLUSTER_PREFIX = 'pmm-ha-test-'
+@Field static final int MAX_CLUSTERS = 5
+@Field static final int DEFAULT_RETENTION_HOURS = 24
+@Field static final int MAX_RETENTION_DAYS = 7
+@Field static final int ALB_WAIT_ATTEMPTS = 30
+@Field static final int ALB_WAIT_INTERVAL_SEC = 10
+
+// ============================================
+// 2. VALIDATION HELPERS
+// ============================================
+
+/**
+ * Validate and normalize retention days parameter.
+ *
+ * @param retentionDays Input value (String or Integer)
+ * @return Integer between 1 and MAX_RETENTION_DAYS
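+ *
+ * Illustrative examples (derived from the clamping logic below):
+ *   validateRetentionDays('3')   // -> 3
+ *   validateRetentionDays('99')  // -> 7 (clamped to MAX_RETENTION_DAYS)
+ *   validateRetentionDays('abc') // -> 1 (falls back to the default)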
+ */
+def validateRetentionDays(def retentionDays) {
+ def days = 1
+ try {
+ days = retentionDays ? (retentionDays.toString() as int) : 1
+ } catch (Exception e) {
+ echo "WARNING: Invalid RETENTION_DAYS '${retentionDays}', using default 1"
+ days = 1
+ }
+ if (days < 1) {
+ days = 1
+ }
+ if (days > MAX_RETENTION_DAYS) {
+ days = MAX_RETENTION_DAYS
+ }
+ return days
+}
+
+/**
+ * Validate Helm chart branch exists and contains required charts.
+ *
+ * @param chartBranch Branch name to validate
+ * @return String name of repo source ('theTibi' or 'percona')
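+ *
+ * Illustrative usage (the step fails the build if the branch exists in neither repo):
+ *   def repoSource = pmmHaEks.validateHelmChart(params.HELM_CHART_BRANCH)
+ *   echo "Charts will be pulled from ${repoSource}"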
+ */
+def validateHelmChart(String chartBranch) {
+ def repoSource = sh(
+ script: """
+ set -e
+ rm -rf charts-repo-check
+
+ # Try theTibi fork first (has PMM-14420), then percona repo
+ if git clone --depth 1 --branch "${chartBranch}" "https://github.com/theTibi/percona-helm-charts.git" charts-repo-check 2>/dev/null; then
+ echo "theTibi"
+ elif git clone --depth 1 --branch "${chartBranch}" "https://github.com/percona/percona-helm-charts.git" charts-repo-check 2>/dev/null; then
+ echo "percona"
+ else
+ echo "ERROR: Branch '${chartBranch}' not found in theTibi or percona helm chart repos" >&2
+ exit 1
+ fi
+
+ # Check required charts exist
+ if [ ! -d "charts-repo-check/charts/pmm-ha" ]; then
+ echo "ERROR: pmm-ha chart not found in branch '${chartBranch}'" >&2
+ ls -la charts-repo-check/charts/ >&2 || true
+ rm -rf charts-repo-check
+ exit 1
+ fi
+
+ if [ ! -d "charts-repo-check/charts/pmm-ha-dependencies" ]; then
+ echo "ERROR: pmm-ha-dependencies chart not found in branch '${chartBranch}'" >&2
+ ls -la charts-repo-check/charts/ >&2 || true
+ rm -rf charts-repo-check
+ exit 1
+ fi
+
+ rm -rf charts-repo-check
+ """,
+ returnStdout: true
+ ).trim()
+
+ echo "Helm charts validated: ${repoSource}/${chartBranch}"
+ return repoSource
+}
+
+// ============================================
+// 3. AWS RESOURCE RESOLUTION
+// ============================================
+
+/**
+ * Resolve Route53 hosted zone ID from zone name.
+ *
+ * @param zoneName Route53 zone name (e.g., cd.percona.com)
+ * @param region AWS region
+ * @return Zone ID or empty string if not found
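+ *
+ * Illustrative usage (assumes AWS credentials are already bound via withCredentials):
+ *   def zoneId = pmmHaEks.resolveR53ZoneId('cd.percona.com', 'us-east-2')
+ *   if (!zoneId) { echo 'Skipping DNS setup' }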
+ */
+def resolveR53ZoneId(String zoneName, String region = 'us-east-2') {
+ def zoneId = sh(
+ script: """
+ R53_ZONE_ID=\$(aws route53 list-hosted-zones-by-name \\
+ --dns-name "${zoneName}" \\
+ --query 'HostedZones[?Config.PrivateZone==`false` && Name==`'"${zoneName}"'.`].Id' \\
+ --output text | sed 's|/hostedzone/||g')
+
+ zone_count=\$(echo "\${R53_ZONE_ID}" | wc -w | tr -d ' ')
+ if [ "\${zone_count}" -eq 1 ] && [ -n "\${R53_ZONE_ID}" ] && [ "\${R53_ZONE_ID}" != "None" ]; then
+ echo "\${R53_ZONE_ID}"
+ else
+ echo ""
+ fi
+ """,
+ returnStdout: true
+ ).trim()
+
+ if (zoneId) {
+ echo "Resolved Route53 zone ID: ${zoneId}"
+ } else {
+ echo "WARNING: Could not resolve Route53 zone for ${zoneName}"
+ }
+ return zoneId
+}
+
+/**
+ * Resolve ACM wildcard certificate ARN for a given zone.
+ *
+ * Finds the first ISSUED wildcard certificate (*.zoneName) in the specified region.
+ *
+ * @param zoneName Route53 zone name (e.g., cd.percona.com)
+ * @param region AWS region (default: us-east-2)
+ * @return Certificate ARN or empty string if not found
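+ *
+ * Illustrative usage (assumes AWS credentials are already bound via withCredentials):
+ *   def certArn = pmmHaEks.resolveAcmCertificate(env.R53_ZONE_NAME, env.REGION)
+ *   if (!certArn) { error('No wildcard certificate available for the ALB HTTPS listener') }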
+ */
+def resolveAcmCertificate(String zoneName, String region = 'us-east-2') {
+ def wildcardDomain = "*.${zoneName}"
+ def certArn = sh(
+ script: 'aws acm list-certificates --region "' + region + '" ' +
+ '--certificate-statuses ISSUED ' +
+ '--query "CertificateSummaryList[?DomainName==\\`' + wildcardDomain + '\\`].CertificateArn | [0]" ' +
+ '--output text',
+ returnStdout: true
+ ).trim()
+
+ if (certArn && certArn != 'None') {
+ echo "Resolved ACM certificate for ${wildcardDomain}: ${certArn}"
+ return certArn
+ }
+
+ echo "WARNING: No valid ACM wildcard certificate found for ${wildcardDomain}"
+ return ''
+}
+
+// ============================================
+// 4. CREDENTIAL MANAGEMENT
+// ============================================
+
+/**
+ * Get all credentials from pmm-secret.
+ *
+ * @param namespace Kubernetes namespace (default: pmm)
+ * @return Map with pmm, pg, ch_user, ch, vm_user, vm passwords
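+ *
+ * Illustrative usage (assumes KUBECONFIG already points at the target cluster):
+ *   def creds = pmmHaEks.getCredentials('pmm')
+ *   echo "ClickHouse user: ${creds.ch_user}"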
+ */
+def getCredentials(String namespace = 'pmm') {
+ // Single kubectl call with go-template to decode all secrets
+ def output = sh(
+ script: """
+ kubectl get secret pmm-secret -n ${namespace} -o go-template='pmm={{index .data "PMM_ADMIN_PASSWORD" | base64decode}}
+pg={{index .data "PG_PASSWORD" | base64decode}}
+ch_user={{index .data "PMM_CLICKHOUSE_USER" | base64decode}}
+ch={{index .data "PMM_CLICKHOUSE_PASSWORD" | base64decode}}
+vm_user={{index .data "VMAGENT_remoteWrite_basicAuth_username" | base64decode}}
+vm={{index .data "VMAGENT_remoteWrite_basicAuth_password" | base64decode}}'
+ """,
+ returnStdout: true
+ ).trim()
+
+ def creds = [:]
+ output.split('\n').each { line ->
+ def parts = line.split('=', 2)
+ if (parts.size() == 2) {
+ creds[parts[0]] = parts[1]
+ }
+ }
+ return creds
+}
+
+/**
+ * Write access-info.txt artifact with all credentials.
+ *
+ * @param clusterName EKS cluster name
+ * @param buildNumber Jenkins build number
+ * @param region AWS region
+ * @param domain PMM domain (FQDN)
+ * @param namespace Kubernetes namespace (default: pmm)
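+ * @return Map with 'creds' (credential map) and 'albHostname'
+ *
+ * Illustrative usage (archiving the generated file is up to the caller):
+ *   def result = pmmHaEks.writeAccessInfo(
+ *       clusterName: env.CLUSTER_NAME,
+ *       buildNumber: env.BUILD_NUMBER,
+ *       region: env.REGION,
+ *       domain: env.PMM_DOMAIN
+ *   )
+ *   archiveArtifacts artifacts: 'pmm-credentials/access-info.txt'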
+ */
+def writeAccessInfo(Map config) {
+ def clusterName = config.clusterName ?: error('clusterName is required')
+ def buildNumber = config.buildNumber ?: error('buildNumber is required')
+ def region = config.region ?: 'us-east-2'
+ def domain = config.domain ?: error('domain is required')
+ def namespace = config.namespace ?: 'pmm'
+
+ def creds = getCredentials(namespace)
+ def albHostname = sh(
+ script: "kubectl get ingress pmm-ha-alb -n ${namespace} -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo 'pending'",
+ returnStdout: true
+ ).trim()
+
+ sh 'mkdir -p pmm-credentials'
+ writeFile file: 'pmm-credentials/access-info.txt', text: """PMM HA Access Information
+=========================
+Cluster: ${clusterName}
+Build: ${buildNumber}
+Region: ${region}
+
+PMM URL: https://${domain}
+ALB: ${albHostname}
+
+PMM/Grafana Login:
+ Username: admin
+ Password: ${creds.pmm}
+
+PostgreSQL:
+ Password: ${creds.pg}
+
+ClickHouse:
+ Username: ${creds.ch_user}
+ Password: ${creds.ch}
+
+VictoriaMetrics:
+ Username: ${creds.vm_user}
+ Password: ${creds.vm}
+
+kubectl access:
+ aws eks update-kubeconfig --name ${clusterName} --region ${region}
+ kubectl get pods -n ${namespace}
+"""
+ return [creds: creds, albHostname: albHostname]
+}
+
+// ============================================
+// 5. EKS CLUSTER SETUP
+// ============================================
+
+/**
+ * Configure EKS Access Entries for cluster authentication.
+ *
+ * Grants cluster admin access to:
+ * - EKSAdminRole (for automation)
+ * - Members of pmm-eks-admins IAM group (dynamically resolved)
+ * - SSO AdministratorAccess role (for console users)
+ *
+ * @param clusterName EKS cluster name (required)
+ * @param region AWS region (default: us-east-2)
+ * @param adminGroupName IAM group for admin access (default: pmm-eks-admins)
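+ *
+ * Illustrative usage:
+ *   pmmHaEks.configureAccess(clusterName: env.CLUSTER_NAME, region: env.REGION)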
+ */
+def configureAccess(Map config) {
+ def clusterName = config.clusterName ?: error('clusterName is required')
+ def region = config.region ?: 'us-east-2'
+ def adminGroupName = config.adminGroupName ?: 'pmm-eks-admins'
+
+ sh """
+ set -euo pipefail
+
+ CLUSTER_NAME="${clusterName}"
+ REGION="${region}"
+
+ # Helper function to grant cluster admin access to a principal
+ grant_cluster_admin() {
+ local principal_arn="\$1"
+ aws eks create-access-entry \\
+ --cluster-name "\${CLUSTER_NAME}" \\
+ --region "\${REGION}" \\
+ --principal-arn "\${principal_arn}" || true
+
+ aws eks associate-access-policy \\
+ --cluster-name "\${CLUSTER_NAME}" \\
+ --region "\${REGION}" \\
+ --principal-arn "\${principal_arn}" \\
+ --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\
+ --access-scope type=cluster || true
+ }
+
+ ACCOUNT_ID=\$(aws sts get-caller-identity --query Account --output text)
+ echo "AWS Account ID: \${ACCOUNT_ID}"
+
+ # Add EKSAdminRole with cluster admin access
+ echo "Adding EKSAdminRole..."
+ grant_cluster_admin "arn:aws:iam::\${ACCOUNT_ID}:role/EKSAdminRole"
+
+ # Add IAM group members dynamically
+ USERS=\$(aws iam get-group --group-name ${adminGroupName} --query 'Users[].Arn' --output text 2>/dev/null || echo "")
+ for USER_ARN in \${USERS}; do
+ echo "Adding access for \${USER_ARN}..."
+ grant_cluster_admin "\${USER_ARN}"
+ done
+
+ # Add SSO AdministratorAccess role (discover dynamically)
+ SSO_ROLE_ARN=\$(aws iam list-roles \\
+ --query "Roles[?contains(RoleName, 'AWSReservedSSO_AdministratorAccess')].Arn | [0]" \\
+ --output text 2>/dev/null | head -1 | tr -d '[:space:]')
+
+ if [ -n "\${SSO_ROLE_ARN}" ] && [ "\${SSO_ROLE_ARN}" != "None" ]; then
+ echo "Adding SSO role: \${SSO_ROLE_ARN}"
+ grant_cluster_admin "\${SSO_ROLE_ARN}"
+ else
+ echo "No SSO AdministratorAccess role found, skipping"
+ fi
+
+ echo "Access entries configured:"
+ aws eks list-access-entries --cluster-name "\${CLUSTER_NAME}" --region "\${REGION}"
+ """
+}
+
+/**
+ * Setup EKS infrastructure components for PMM HA.
+ *
+ * Installs and configures:
+ * - GP3 storage class (encrypted, default)
+ * - AWS Node Termination Handler (for spot instance draining)
+ * - AWS Load Balancer Controller (for ALB ingress)
+ *
+ * @param clusterName EKS cluster name (required)
+ * @param region AWS region (default: us-east-2)
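+ *
+ * Illustrative usage (expects KUBECONFIG to point at the freshly created cluster):
+ *   pmmHaEks.setupInfrastructure(clusterName: env.CLUSTER_NAME, region: env.REGION)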
+ */
+def setupInfrastructure(Map config) {
+ def clusterName = config.clusterName ?: error('clusterName is required')
+ def region = config.region ?: 'us-east-2'
+
+ sh """
+ set -euo pipefail
+
+ CLUSTER_NAME="${clusterName}"
+ REGION="${region}"
+
+ ACCOUNT_ID=\$(aws sts get-caller-identity --query Account --output text)
+ echo "AWS Account ID: \${ACCOUNT_ID}"
+
+ # Configure GP3 as default storage class
+ kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' || true
+
+        cat <<EOF | kubectl apply -f -
+        if git clone --depth 1 --branch "\${HELM_CHART_BRANCH}" "\${TIBI_REPO}" charts-repo 2>/dev/null; then
+ echo "Cloned from: \${TIBI_REPO}"
+ echo "theTibi" > .chart-repo-source
+ elif git clone --depth 1 --branch "\${HELM_CHART_BRANCH}" "\${PERCONA_REPO}" charts-repo 2>/dev/null; then
+ echo "Cloned from: \${PERCONA_REPO}"
+ echo "percona" > .chart-repo-source
+ else
+ echo "ERROR: Branch \${HELM_CHART_BRANCH} not found in either repository"
+ exit 1
+ fi
+
+ # Add required Helm repos
+ helm repo add percona https://percona.github.io/percona-helm-charts/ || true
+ helm repo add vm https://victoriametrics.github.io/helm-charts/ || true
+ helm repo add altinity https://docs.altinity.com/helm-charts/ || true
+ helm repo update
+
+ # Install PMM HA dependencies (operators)
+ helm dependency update charts-repo/charts/pmm-ha-dependencies
+ helm upgrade --install pmm-operators charts-repo/charts/pmm-ha-dependencies \\
+ --namespace "\${PMM_NAMESPACE}" \\
+ --create-namespace \\
+ --wait \\
+ --timeout 10m
+
+ echo "Waiting for operators to be ready..."
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=victoria-metrics-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=altinity-clickhouse-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=pg-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true
+
+ # Generate passwords (use provided admin password if set)
+ if [ -n "\${PMM_ADMIN_PASSWORD_INPUT}" ]; then
+ PMM_ADMIN_PASSWORD="\${PMM_ADMIN_PASSWORD_INPUT}"
+ echo "Using user-provided PMM admin password"
+ else
+ PMM_ADMIN_PASSWORD=\$(openssl rand -base64 16 | tr -dc 'a-zA-Z0-9' | head -c 16)
+ echo "Generated PMM admin password"
+ fi
+ PG_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24)
+ GF_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24)
+ CH_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24)
+ VM_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24)
+
+ # Pre-create pmm-secret before helm install
+ # The chart's pg-user-credentials-secrets.yaml uses lookup() at template time
+ # GF_SECURITY_ADMIN_PASSWORD is needed because with secret.create=false,
+ # the chart doesn't explicitly set this env var (only secretRef is used)
+ kubectl create secret generic pmm-secret \\
+ --namespace "\${PMM_NAMESPACE}" \\
+ --from-literal=PMM_ADMIN_PASSWORD="\${PMM_ADMIN_PASSWORD}" \\
+ --from-literal=GF_SECURITY_ADMIN_PASSWORD="\${PMM_ADMIN_PASSWORD}" \\
+ --from-literal=PG_PASSWORD="\${PG_PASSWORD}" \\
+ --from-literal=GF_PASSWORD="\${GF_PASSWORD}" \\
+ --from-literal=PMM_CLICKHOUSE_USER="clickhouse_pmm" \\
+ --from-literal=PMM_CLICKHOUSE_PASSWORD="\${CH_PASSWORD}" \\
+ --from-literal=VMAGENT_remoteWrite_basicAuth_username="victoriametrics_pmm" \\
+ --from-literal=VMAGENT_remoteWrite_basicAuth_password="\${VM_PASSWORD}" \\
+ --dry-run=client -o yaml | kubectl apply -f -
+
+ helm dependency update charts-repo/charts/pmm-ha
+
+ HELM_CMD="helm upgrade --install pmm-ha charts-repo/charts/pmm-ha"
+ HELM_CMD="\${HELM_CMD} --namespace \${PMM_NAMESPACE}"
+ HELM_CMD="\${HELM_CMD} --set secret.create=false"
+ HELM_CMD="\${HELM_CMD} --set secret.name=pmm-secret"
+ # Increase ClickHouse memory for merge operations (default 4Gi is insufficient)
+ HELM_CMD="\${HELM_CMD} --set clickhouse.resources.requests.memory=4Gi"
+ HELM_CMD="\${HELM_CMD} --set clickhouse.resources.limits.memory=10Gi"
+ if [ -n "\${PMM_IMAGE_TAG}" ]; then
+ HELM_CMD="\${HELM_CMD} --set image.tag=\${PMM_IMAGE_TAG}"
+ fi
+ HELM_CMD="\${HELM_CMD} --wait --timeout 15m"
+
+ eval "\${HELM_CMD}"
+
+ echo "Waiting for PMM HA components..."
+ kubectl rollout status statefulset/pmm-ha -n "\${PMM_NAMESPACE}" --timeout=600s || true
+ kubectl wait --for=condition=ready pod -l clickhouse.altinity.com/chi=pmm-ha -n "\${PMM_NAMESPACE}" --timeout=600s || true
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=vmselect -n "\${PMM_NAMESPACE}" --timeout=300s || true
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=vmstorage -n "\${PMM_NAMESPACE}" --timeout=300s || true
+
+ echo "PMM HA installed"
+ kubectl get pods -n "\${PMM_NAMESPACE}"
+ """
+}
+
+/**
+ * Create ALB Ingress and Route53 DNS record for PMM HA.
+ *
+ * Creates:
+ * - ALB Ingress with ACM certificate (HTTPS)
+ * - Route53 alias record pointing to ALB
+ *
+ * Waits up to 5 minutes for ALB provisioning.
+ *
+ * @param namespace Kubernetes namespace (default: pmm)
+ * @param domain FQDN for PMM access (required)
+ * @param certArn ACM certificate ARN for TLS (required)
+ * @param r53ZoneName Route53 hosted zone name (required, e.g., cd.percona.com)
+ * @param region AWS region (default: us-east-2)
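+ *
+ * Illustrative usage (certArn would normally come from resolveAcmCertificate()):
+ *   pmmHaEks.createIngress(
+ *       domain: env.PMM_DOMAIN,
+ *       certArn: pmmHaEks.resolveAcmCertificate(env.R53_ZONE_NAME, env.REGION),
+ *       r53ZoneName: env.R53_ZONE_NAME
+ *   )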
+ */
+def createIngress(Map config) {
+ def namespace = config.namespace ?: 'pmm'
+ def domain = config.domain ?: error('domain is required')
+ def certArn = config.certArn ?: error('certArn is required')
+ def r53ZoneName = config.r53ZoneName ?: error('r53ZoneName is required')
+ def region = config.region ?: 'us-east-2'
+
+ def r53ZoneId = resolveR53ZoneId(r53ZoneName, region)
+ if (!r53ZoneId) {
+ error("No public Route53 zone found for ${r53ZoneName}")
+ }
+
+ sh """
+ set -euo pipefail
+
+ PMM_NAMESPACE="${namespace}"
+ PMM_DOMAIN="${domain}"
+ ACM_CERT_ARN="${certArn}"
+ R53_ZONE_ID="${r53ZoneId}"
+ REGION="${region}"
+
+ # Create ALB Ingress
+        cat <<EOF | kubectl apply -f -
+            ALB_HOSTNAME=\$(kubectl get ingress pmm-ha-alb -n "\${PMM_NAMESPACE}" -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo "")
+ if [ -n "\${ALB_HOSTNAME}" ]; then
+ echo "ALB provisioned: \${ALB_HOSTNAME}"
+ break
+ fi
+ echo "Waiting for ALB... (\${attempt}/30)"
+ sleep 10
+ done
+
+ if [ -z "\${ALB_HOSTNAME}" ]; then
+ echo "WARNING: ALB not provisioned within timeout"
+ kubectl describe ingress pmm-ha-alb -n "\${PMM_NAMESPACE}"
+ exit 1
+ fi
+
+ ALB_ZONE_ID=\$(aws elbv2 describe-load-balancers --region "\${REGION}" \\
+ --query "LoadBalancers[?DNSName=='\${ALB_HOSTNAME}'].CanonicalHostedZoneId" \\
+ --output text)
+
+ if [ -n "\${ALB_ZONE_ID}" ]; then
+ aws route53 change-resource-record-sets \\
+ --hosted-zone-id "\${R53_ZONE_ID}" \\
+ --change-batch '{
+ "Changes": [{
+ "Action": "UPSERT",
+ "ResourceRecordSet": {
+ "Name": "'"\${PMM_DOMAIN}"'",
+ "Type": "A",
+ "AliasTarget": {
+ "HostedZoneId": "'"\${ALB_ZONE_ID}"'",
+ "DNSName": "'"\${ALB_HOSTNAME}"'",
+ "EvaluateTargetHealth": true
+ }
+ }
+ }]
+ }'
+ echo "Route53 record created: \${PMM_DOMAIN} -> \${ALB_HOSTNAME}"
+ else
+ echo "WARNING: Could not get ALB zone ID, skipping Route53 record"
+ fi
+ """
+}
+
+// ============================================
+// 7. CLUSTER LIFECYCLE
+// ============================================
+
+/**
+ * List PMM HA test clusters sorted by creation time (newest first).
+ *
+ * @param region AWS region (default: us-east-2)
+ * @return List of cluster names sorted newest first, empty list if none found
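+ *
+ * Illustrative usage:
+ *   def clusters = pmmHaEks.listClusters(env.REGION)
+ *   if (clusters) { echo "Newest cluster: ${clusters[0]}" }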
+ */
+def listClusters(String region = 'us-east-2') {
+ def output = sh(
+ script: """
+ aws eks list-clusters --region ${region} --output json 2>/dev/null | \\
+ jq -r '.clusters[] | select(startswith("${CLUSTER_PREFIX}"))' | \\
+ while read cluster; do
+ CREATED=\$(aws eks describe-cluster --name "\$cluster" --region ${region} \\
+ --query 'cluster.createdAt' --output text 2>/dev/null)
+ [ -n "\$CREATED" ] && [ "\$CREATED" != "None" ] && echo "\$CREATED|\$cluster"
+ done | sort -r | cut -d'|' -f2
+ """,
+ returnStdout: true
+ ).trim()
+
+ if (!output) {
+ return []
+ }
+
+ return output.split('\n').findAll { it }
+}
+
+/**
+ * Delete PMM HA EKS cluster and all associated AWS resources.
+ *
+ * Cleanup order (to avoid dependency errors):
+ * 1. Route53 alias record
+ * 2. ALB Ingress (triggers ALB deletion)
+ * 3. EKS cluster via eksctl
+ *
+ * @param clusterName EKS cluster name (required)
+ * @param region AWS region (default: us-east-2)
+ * @param r53ZoneName Route53 hosted zone name (default: cd.percona.com)
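+ *
+ * Illustrative usage (cluster name shown is a placeholder):
+ *   pmmHaEks.deleteCluster(clusterName: 'pmm-ha-test-123', region: 'us-east-2')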
+ */
+def deleteCluster(Map config) {
+ def clusterName = config.clusterName ?: error('clusterName is required')
+ def region = config.region ?: 'us-east-2'
+ def r53ZoneName = config.r53ZoneName ?: 'cd.percona.com'
+
+ def r53ZoneId = resolveR53ZoneId(r53ZoneName, region)
+
+ sh """
+ set -euo pipefail
+
+ cluster_name="${clusterName}"
+ REGION="${region}"
+ R53_ZONE_ID="${r53ZoneId}"
+ R53_ZONE_NAME="${r53ZoneName}"
+
+ echo "============================================"
+ echo "Cleaning up cluster: \${cluster_name}"
+ echo "============================================"
+
+ # Delete Route53 record (if zone was resolved)
+ domain_name="\${cluster_name}.\${R53_ZONE_NAME}"
+ if [ -n "\${R53_ZONE_ID}" ]; then
+ echo "Deleting Route53 record for \${domain_name}..."
+ record=\$(aws route53 list-resource-record-sets \\
+ --hosted-zone-id "\${R53_ZONE_ID}" \\
+ --query "ResourceRecordSets[?Name=='\${domain_name}.']" \\
+ --output json 2>/dev/null || echo "[]")
+
+ if [ "\${record}" != "[]" ] && [ -n "\${record}" ]; then
+ record_type=\$(echo "\${record}" | jq -r '.[0].Type')
+ if [ "\${record_type}" = "A" ]; then
+ alias_target=\$(echo "\${record}" | jq -r '.[0].AliasTarget')
+ aws route53 change-resource-record-sets \\
+ --hosted-zone-id "\${R53_ZONE_ID}" \\
+ --change-batch '{
+ "Changes": [{
+ "Action": "DELETE",
+ "ResourceRecordSet": {
+ "Name": "'"\${domain_name}"'",
+ "Type": "A",
+ "AliasTarget": '"\${alias_target}"'
+ }
+ }]
+ }' && echo "Route53 record deleted" || echo "Warning: Failed to delete Route53 record"
+ fi
+ else
+ echo "No Route53 record found for \${domain_name}"
+ fi
+ else
+ echo "Skipping Route53 record deletion (zone not resolved)"
+ fi
+
+ # Delete ALB ingress (triggers ALB deletion)
+ # Use per-cluster kubeconfig to avoid race conditions during parallel deletions
+ echo "Deleting ALB ingress..."
+ TEMP_KUBECONFIG="\$(mktemp)"
+ if aws eks update-kubeconfig --name "\${cluster_name}" --region "\${REGION}" --kubeconfig "\${TEMP_KUBECONFIG}" 2>/dev/null; then
+ KUBECONFIG="\${TEMP_KUBECONFIG}" kubectl delete ingress pmm-ha-alb -n pmm --ignore-not-found=true || true
+ fi
+ rm -f "\${TEMP_KUBECONFIG}"
+
+ # Wait for ALB cleanup
+ echo "Waiting for ALB cleanup..."
+ sleep 30
+
+ # Disable termination protection on all CloudFormation stacks for this cluster
+ echo "Disabling termination protection on CloudFormation stacks..."
+ for stack_name in \$(aws cloudformation list-stacks --region "\${REGION}" \\
+ --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE \\
+ --query "StackSummaries[?starts_with(StackName, 'eksctl-\${cluster_name}')].StackName" \\
+ --output text 2>/dev/null); do
+ echo " Disabling protection: \${stack_name}"
+ aws cloudformation update-termination-protection \\
+ --region "\${REGION}" \\
+ --stack-name "\${stack_name}" \\
+ --no-enable-termination-protection 2>/dev/null || true
+ done
+
+ echo "Deleting EKS cluster \${cluster_name}..."
+ eksctl delete cluster --region "\${REGION}" --name "\${cluster_name}" \\
+ --disable-nodegroup-eviction --wait
+ """
+}
+
+/**
+ * Delete multiple clusters with optional SKIP_NEWEST and retention-aware filtering.
+ *
+ * Deletion Rules:
+ * Cluster Type | Action
+ * ------------------------------|------------------------------
+ * Unexpired delete-after tag | KEEP (unless respectRetention=false)
+ * Expired delete-after tag | DELETE
+ * No delete-after tag | DELETE
+ * Newest (skipNewest=true) | SKIP
+ *
+ * @param region AWS region (default: us-east-2)
+ * @param r53ZoneName Route53 hosted zone name (default: cd.percona.com)
+ * @param skipNewest Skip the most recent cluster (default: true)
+ * @param respectRetention Respect delete-after tags; false = force delete (default: true)
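+ *
+ * Illustrative usage (mirrors the DELETE_ALL stage of the cleanup pipeline):
+ *   pmmHaEks.deleteAllClusters(
+ *       region: env.REGION,
+ *       r53ZoneName: env.R53_ZONE_NAME,
+ *       skipNewest: params.SKIP_NEWEST
+ *   )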
+ */
+def deleteAllClusters(Map config = [:]) {
+ def region = config.region ?: 'us-east-2'
+ def r53ZoneName = config.r53ZoneName ?: 'cd.percona.com'
+ def skipNewest = config.skipNewest != null ? config.skipNewest : true
+ def respectRetention = config.respectRetention != null ? config.respectRetention : true
+
+ def clusterList = listClusters(region)
+
+ if (!clusterList) {
+ echo "No clusters found with prefix '${CLUSTER_PREFIX}'."
+ return
+ }
+
+ def clustersToDelete = clusterList
+ if (skipNewest) {
+ clustersToDelete = clusterList.drop(1)
+ echo "Skipping newest cluster: ${clusterList[0]} (SKIP_NEWEST=true)"
+ }
+
+ // Filter by retention tags
+ if (respectRetention) {
+ def nowEpoch = (long)(System.currentTimeMillis() / 1000)
+ def filtered = []
+
+ clustersToDelete.each { clusterName ->
+ def deleteAfterTag = sh(
+ script: """
+ aws eks describe-cluster --name ${clusterName} --region ${region} \\
+ --query 'cluster.tags."delete-after"' --output text 2>/dev/null || echo ''
+ """,
+ returnStdout: true
+ ).trim()
+
+ if (deleteAfterTag && deleteAfterTag != 'None' && deleteAfterTag != 'null' && deleteAfterTag != '') {
+ // Cluster has retention tag - check if expired
+ def deleteAfterEpoch = deleteAfterTag as long
+ if (nowEpoch > deleteAfterEpoch) {
+ echo "DELETE: ${clusterName} - retention expired (delete-after: ${deleteAfterEpoch})"
+ filtered.add(clusterName)
+                } else {
+                    // Already inside the respectRetention guard, so an unexpired tag always means keep
+                    def hoursLeft = (int)((deleteAfterEpoch - nowEpoch) / 3600)
+                    echo "KEEP: ${clusterName} - ${hoursLeft}h retention remaining"
+                }
+ } else {
+ // No delete-after tag - delete immediately
+ echo "DELETE: ${clusterName} - no retention tag"
+ filtered.add(clusterName)
+ }
+ }
+ clustersToDelete = filtered
+ }
+
+ if (!clustersToDelete) {
+ echo 'No clusters to delete after applying filters.'
+ return
+ }
+
+ // Delete clusters in parallel
+ def parallelStages = [:]
+ clustersToDelete.each { clusterName ->
+ parallelStages["Delete ${clusterName}"] = {
+ deleteCluster(
+ clusterName: clusterName,
+ region: region,
+ r53ZoneName: r53ZoneName
+ )
+ }
+ }
+ parallel parallelStages
+}
+
+/**
+ * Clean up orphaned VPCs and failed CloudFormation stacks.
+ *
+ * Finds:
+ * - VPCs tagged with purpose=pmm-ha-testing but no matching EKS cluster
+ * - CloudFormation stacks in DELETE_FAILED or ROLLBACK_COMPLETE state
+ *
+ * @param region AWS region (default: us-east-2)
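+ *
+ * Illustrative usage:
+ *   pmmHaEks.cleanupOrphans(region: env.REGION)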
+ */
+def cleanupOrphans(Map config = [:]) {
+ def region = config.region ?: 'us-east-2'
+
+ // Get list of active EKS clusters
+ def activeClusters = sh(
+ script: """
+ aws eks list-clusters --region ${region} \\
+ --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" \\
+ --output text 2>/dev/null || echo ''
+ """,
+ returnStdout: true
+ ).trim().split(/\s+/).findAll { it }
+
+ echo "Active EKS clusters: ${activeClusters}"
+
+ // Find orphaned VPCs by tag (more reliable than name pattern matching).
+ // VPCs inherit tags from eksctl cluster config. When the EKS cluster is deleted
+ // via AWS console or eksctl fails midway, the VPC and CF stacks remain.
+ // We extract cluster name from the Name tag and use eksctl to clean up.
+ def orphanedVpcs = sh(
+ script: """
+ aws ec2 describe-vpcs --region ${region} \\
+ --filters "Name=tag:purpose,Values=pmm-ha-testing" \\
+ --query 'Vpcs[*].[VpcId,Tags[?Key==`Name`].Value|[0]]' \\
+ --output text 2>/dev/null || echo ''
+ """,
+ returnStdout: true
+ ).trim()
+
+ if (orphanedVpcs) {
+ orphanedVpcs.split('\n').each { line ->
+ def parts = line.split('\t')
+ if (parts.size() >= 2) {
+ def vpcId = parts[0]
+ def vpcName = parts[1] ?: ''
+ // Extract cluster name from VPC name (eksctl-pmm-ha-test-XX-cluster/VPC)
+ def matcher = vpcName =~ /eksctl-(${CLUSTER_PREFIX}\d+)-cluster/
+ if (matcher) {
+ def clusterName = matcher[0][1]
+ if (!activeClusters.contains(clusterName)) {
+ echo "Found orphaned VPC: ${vpcId} (${vpcName}) - cluster ${clusterName} does not exist"
+ sh """
+ eksctl delete cluster --name ${clusterName} --region ${region} --wait=false 2>/dev/null || true
+ """
+ }
+ }
+ }
+ }
+ } else {
+ echo 'No orphaned VPCs found.'
+ }
+
+ // Find and delete failed CloudFormation stacks
+ def failedStacks = sh(
+ script: """
+ aws cloudformation list-stacks --region ${region} \\
+ --stack-status-filter DELETE_FAILED ROLLBACK_COMPLETE \\
+ --query "StackSummaries[?contains(StackName, '${CLUSTER_PREFIX}')].StackName" \\
+ --output text 2>/dev/null || echo ''
+ """,
+ returnStdout: true
+ ).trim()
+
+ if (failedStacks) {
+ failedStacks.split(/\s+/).each { stackName ->
+ echo "Deleting failed stack: ${stackName}"
+ sh "aws cloudformation delete-stack --region ${region} --stack-name ${stackName} || true"
+ }
+ } else {
+ echo 'No failed CloudFormation stacks found.'
+ }
+}