Skip to content

Commit dae12ee

Browse files
authored
[EKS Scalability Testing] Implement workflows to execute large scale performance test (#1794)
1 parent 7215c51 commit dae12ee

File tree

3 files changed

+251
-12
lines changed

3 files changed

+251
-12
lines changed

.github/workflows/eks-performance-cluster-addon-install.yml

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ env:
6767

6868
# Github repository environment variables
6969
OPERATOR_GITHUB_REPO_NAME: "aws/amazon-cloudwatch-agent-operator"
70+
CWA_GITHUB_TEST_REPO_NAME: "aws/amazon-cloudwatch-agent-test"
71+
CWA_GITHUB_TEST_REPO_BRANCH: "main"
7072

7173
jobs:
7274
# Check if this workflow should run
@@ -171,11 +173,27 @@ jobs:
171173
run: |
172174
aws eks update-kubeconfig --name $CLUSTER_NAME --region $AWS_REGION
173175
176+
# TODO: Revert to using main helm branch when changes from leader-election are merged in
174177
- name: Clone Helm Charts Repository
175178
run: |
176179
rm -rf ./helm-charts
177-
git clone -b ${{ inputs.helm-charts-branch || 'main' }} https://github.com/aws-observability/helm-charts.git ./helm-charts
180+
git clone -b ${{ inputs.helm-charts-branch || 'sky333999/leader-election' }} https://github.com/aws-observability/helm-charts.git ./helm-charts
178181
182+
- name: Clone Test Repo
183+
uses: actions/checkout@v4
184+
with:
185+
repository: ${{ env.CWA_GITHUB_TEST_REPO_NAME }}
186+
ref: ${{ env.CWA_GITHUB_TEST_REPO_BRANCH }}
187+
path: ./test-repo
188+
189+
- name: Replace hostname in override files
190+
run: |
191+
HOSTNAME=$(kubectl get nodes -l eks.amazonaws.com/nodegroup=$CLUSTER_NAME-leader-node -o jsonpath='{.items[0].metadata.name}')
192+
for file in ./test-repo/test/performance/eks/resources/leader_election_overrides/*; do
193+
sed -i "s/<hostname>/$HOSTNAME/g" "$file"
194+
done
195+
196+
# TODO: Revert to using workflow built agent image once required changes are made on main branch
179197
- name: Check node count and manage Helm chart
180198
run: |
181199
NODE_COUNT=$(kubectl get nodes --no-headers | wc -l)
@@ -196,18 +214,19 @@ jobs:
196214
echo "MANAGER_TAG: ${{ inputs.cloudwatch_agent_operator_tag || needs.GetLatestOperatorCommitSHA.outputs.operator_commit_sha }}"
197215
echo "MANAGER_REPOSITORY_DOMAIN: ${{ steps.login-ecr.outputs.registry }}"
198216
199-
helm upgrade --install amazon-cloudwatch-observability \
217+
helm upgrade --install --wait amazon-cloudwatch-observability \
200218
./helm-charts/charts/amazon-cloudwatch-observability \
201219
--namespace amazon-cloudwatch \
202220
--create-namespace \
203221
--set clusterName=${{ inputs.cluster_name || env.CLUSTER_NAME }} \
204222
--set region=${{ inputs.region || env.AWS_REGION }} \
205-
--set agent.image.repository=${{ inputs.cloudwatch_agent_repository || env.AGENT_ECR_TEST_REPO }} \
206-
--set agent.image.tag=${{ inputs.cloudwatch_agent_tag || github.sha }} \
207-
--set agent.image.repositoryDomainMap.public=${{ steps.login-ecr.outputs.registry }} \
223+
--set agent.image.repository="cloudwatch-agent" \
224+
--set agent.image.tag="latest" \
225+
--set agent.image.repositoryDomainMap.public="public.ecr.aws/q4e2d9n7" \
208226
--set manager.image.repository=${{ inputs.cloudwatch_agent_operator_repository || env.OPERATOR_ECR_TEST_REPO }} \
209227
--set manager.image.tag=${{ inputs.cloudwatch_agent_operator_tag || needs.GetLatestOperatorCommitSHA.outputs.operator_commit_sha }} \
210-
--set manager.image.repositoryDomainMap.public=${{ steps.login-ecr.outputs.registry }}
228+
--set manager.image.repositoryDomainMap.public=${{ steps.login-ecr.outputs.registry }} \
229+
--values ./test-repo/test/performance/eks/resources/leader_election_overrides/base-overrides.yml
211230
fi
212231
213232
cleanup-on-failure:

.github/workflows/eks-performance-cluster-scaling.yml

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,19 @@ on:
2727
description: 'Count of node groups'
2828
type: number
2929
default: 10
30+
leader_node_desired_capacity:
31+
description: 'Desired capacity for leader node group (manual execution only)'
32+
type: number
33+
default: 1
3034

3135
env:
3236
AWS_REGION: ${{ inputs.region || 'us-west-2' }}
3337
CLUSTER_NAME: ${{ inputs.cluster_name || 'eks-performance' }}
3438
NODE_GROUP_COUNT: ${{ inputs.node_group_count || 10 }}
3539
DESIRED_CAPACITY_PER_NODEGROUP: ${{ inputs.desired_capacity_per_nodegroup || 500 }}
40+
LEADER_NODE_DESIRED_CAPACITY: ${{ inputs.leader_node_desired_capacity || 1 }}
3641
TERRAFORM_AWS_ASSUME_ROLE: ${{ vars.TERRAFORM_AWS_ASSUME_ROLE }}
37-
TERRAFORM_AWS_ASSUME_ROLE_DURATION: 3600 # 1 hour duration
42+
TERRAFORM_AWS_ASSUME_ROLE_DURATION: 14400 # 4 hour duration
3843
CWA_GITHUB_TEST_REPO_NAME: "aws/amazon-cloudwatch-agent-test"
3944
CWA_GITHUB_TEST_REPO_URL: "https://github.com/aws/amazon-cloudwatch-agent-test.git"
4045
CWA_GITHUB_TEST_REPO_BRANCH: "main"
@@ -54,7 +59,7 @@ jobs:
5459
- name: Configure AWS Credentials
5560
uses: aws-actions/configure-aws-credentials@v4
5661
with:
57-
role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE}}
62+
role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
5863
aws-region: ${{ inputs.region || 'us-west-2' }}
5964
role-duration-seconds: ${{ env.TERRAFORM_AWS_ASSUME_ROLE_DURATION }}
6065

@@ -72,6 +77,17 @@ jobs:
7277
run: |
7378
echo "Starting scale UP operation with desired capacity: $DESIRED_CAPACITY_PER_NODEGROUP"
7479
80+
# Scale leader node to 1
81+
echo "Scaling leader node group: $CLUSTER_NAME-leader-node to 1"
82+
aws eks update-nodegroup-config \
83+
--cluster-name $CLUSTER_NAME \
84+
--nodegroup-name $CLUSTER_NAME-leader-node \
85+
--region $AWS_REGION \
86+
--scaling-config desiredSize=1
87+
88+
echo "Waiting 1 minute before scaling regular node groups..."
89+
sleep 60
90+
7591
for i in $(seq 1 $NODE_GROUP_COUNT); do
7692
echo "Scaling node group: $CLUSTER_NAME-node-${i} to $DESIRED_CAPACITY_PER_NODEGROUP"
7793
aws eks update-nodegroup-config \
@@ -89,6 +105,17 @@ jobs:
89105
run: |
90106
echo "Starting scale DOWN operation with desired capacity: 0"
91107
108+
# Scale leader node to 0
109+
echo "Scaling leader node group: $CLUSTER_NAME-leader-node to 0"
110+
aws eks update-nodegroup-config \
111+
--cluster-name $CLUSTER_NAME \
112+
--nodegroup-name $CLUSTER_NAME-leader-node \
113+
--region $AWS_REGION \
114+
--scaling-config desiredSize=0
115+
116+
echo "Waiting 1 minute before scaling regular node groups..."
117+
sleep 60
118+
92119
for i in $(seq 1 $NODE_GROUP_COUNT); do
93120
echo "Scaling node group: $CLUSTER_NAME-node-${i} to 0"
94121
aws eks update-nodegroup-config \
@@ -105,6 +132,18 @@ jobs:
105132
if: github.event_name == 'workflow_dispatch'
106133
run: |
107134
echo "Starting manual scaling operation with desired capacity: $DESIRED_CAPACITY_PER_NODEGROUP"
135+
echo "Leader node desired capacity: $LEADER_NODE_DESIRED_CAPACITY"
136+
137+
# Scale leader node to specified capacity
138+
echo "Scaling leader node group: $CLUSTER_NAME-leader-node to $LEADER_NODE_DESIRED_CAPACITY"
139+
aws eks update-nodegroup-config \
140+
--cluster-name $CLUSTER_NAME \
141+
--nodegroup-name $CLUSTER_NAME-leader-node \
142+
--region $AWS_REGION \
143+
--scaling-config desiredSize=$LEADER_NODE_DESIRED_CAPACITY
144+
145+
echo "Waiting 1 minute before scaling regular node groups..."
146+
sleep 60
108147
109148
for i in $(seq 1 $NODE_GROUP_COUNT); do
110149
echo "Scaling node group: $CLUSTER_NAME-node-${i} to $DESIRED_CAPACITY_PER_NODEGROUP"
@@ -118,17 +157,17 @@ jobs:
118157
119158
- name: Validate total node count
120159
run: |
121-
echo "Waiting 20 minutes for scaling operations to complete..."
122-
sleep 1200
160+
echo "Waiting 30 minutes for scaling operations to complete and stabilize..."
161+
sleep 1800
123162
124163
echo "Validating total number of nodes in the cluster..."
125164
ACTUAL_NODE_COUNT=$(kubectl get nodes --no-headers | wc -l)
126165
127166
# Determine expected count based on trigger type
128167
if [ "${{ github.event.schedule }}" = "0 21 * * 1" ]; then
129-
EXPECTED_NODE_COUNT=$(($NODE_GROUP_COUNT * 0))
168+
EXPECTED_NODE_COUNT=0
130169
else
131-
EXPECTED_NODE_COUNT=$(($NODE_GROUP_COUNT * $DESIRED_CAPACITY_PER_NODEGROUP))
170+
EXPECTED_NODE_COUNT=$(($NODE_GROUP_COUNT * $DESIRED_CAPACITY_PER_NODEGROUP + $LEADER_NODE_DESIRED_CAPACITY))
132171
fi
133172
134173
echo "Expected total nodes: $EXPECTED_NODE_COUNT"
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: MIT
3+
name: "EKS Performance Cluster Tests"
4+
on:
5+
# Use workflow_run to trigger this workflow after the scaling workflow completes
6+
workflow_run:
7+
workflows: [ "EKS Performance Test Run" ]
8+
types:
9+
- completed
10+
branches:
11+
- main # Adjust this if your default branch is different
12+
13+
# Keep the manual trigger option
14+
workflow_dispatch:
15+
inputs:
16+
# Required Core Settings
17+
cluster_name:
18+
description: 'EKS Cluster Name'
19+
required: true
20+
type: string
21+
default: 'eks-performance'
22+
region:
23+
description: 'AWS Region'
24+
required: true
25+
type: string
26+
default: 'us-west-2'
27+
metric_map:
28+
description: 'Map containing metrics to validate'
29+
type: string
30+
31+
# Optional Settings
32+
terraform_assume_role:
33+
description: 'AWS IAM Role to assume'
34+
type: string
35+
test_repo_name:
36+
description: 'Agent test repo'
37+
type: string
38+
test_repo_branch:
39+
description: 'Agent test repo branch'
40+
type: string
41+
test_dir:
42+
description: 'Agent test directory'
43+
type: string
44+
45+
46+
concurrency:
47+
group: ${{ github.workflow }}-${{ github.ref }}
48+
cancel-in-progress: true
49+
50+
env:
51+
# Cluster environment variables
52+
AWS_REGION: ${{ inputs.region || 'us-west-2' }}
53+
CLUSTER_NAME: ${{ inputs.cluster_name || 'eks-performance' }}
54+
TERRAFORM_AWS_ASSUME_ROLE: ${{ inputs.terraform_assume_role || vars.TERRAFORM_AWS_ASSUME_ROLE }}
55+
TERRAFORM_AWS_ASSUME_ROLE_DURATION: 14400 # 4 hour duration
56+
57+
# Agent test repo environment variables
58+
CWA_GITHUB_TEST_REPO_NAME: ${{ inputs.test_repo_name || 'aws/amazon-cloudwatch-agent-test' }}
59+
CWA_GITHUB_TEST_REPO_BRANCH: ${{ inputs.test_repo_branch || 'main' }}
60+
CWA_TEST_DIRECTORY: ${{ inputs.test_dir || './test/performance/eks' }}
61+
62+
jobs:
63+
# Check if this workflow should run; tests are skipped when no nodes exist
64+
check-trigger:
65+
runs-on: ubuntu-latest
66+
if: ${{ github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.event == 'schedule') }}
67+
permissions:
68+
id-token: write
69+
contents: read
70+
steps:
71+
- name: Check trigger type
72+
id: check-trigger
73+
run: |
74+
if [ "${{ github.event_name }}" == "workflow_run" ]; then
75+
echo "Triggered by workflow_run from a scheduled event"
76+
else
77+
echo "Triggered manually via workflow_dispatch"
78+
fi
79+
80+
- name: Configure AWS Credentials
81+
uses: aws-actions/configure-aws-credentials@v4
82+
with:
83+
role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
84+
aws-region: ${{ env.AWS_REGION }}
85+
role-duration-seconds: ${{ env.TERRAFORM_AWS_ASSUME_ROLE_DURATION }}
86+
87+
- name: Install kubectl
88+
uses: azure/setup-kubectl@v3
89+
with:
90+
version: 'latest'
91+
92+
- name: Update kubeconfig
93+
run: |
94+
aws eks update-kubeconfig --name $CLUSTER_NAME --region $AWS_REGION
95+
96+
- name: Override should_continue based on node count
97+
id: final-check
98+
run: |
99+
NODE_COUNT=$(kubectl get nodes --no-headers | wc -l)
100+
echo "Node count: $NODE_COUNT"
101+
102+
if [ "$NODE_COUNT" -eq 0 ]; then
103+
echo "No nodes available, setting should_continue to false"
104+
echo "should_continue=false" >> "$GITHUB_OUTPUT"
105+
else
106+
echo "Nodes available, setting should_continue as true"
107+
echo "should_continue=true" >> "$GITHUB_OUTPUT"
108+
fi
109+
110+
outputs:
111+
should_continue: ${{ steps.final-check.outputs.should_continue }}
112+
113+
EKSPerformanceBaseTest:
114+
name: EKSPerformanceBaseTest
115+
needs: [ check-trigger ]
116+
if: ${{ needs.check-trigger.outputs.should_continue == 'true' }}
117+
runs-on: ubuntu-latest
118+
permissions:
119+
id-token: write
120+
contents: read
121+
steps:
122+
- name: Set up Go 1.x
123+
uses: actions/setup-go@v4
124+
with:
125+
go-version: ~1.24.4
126+
127+
- uses: actions/checkout@v4
128+
with:
129+
repository: ${{ env.CWA_GITHUB_TEST_REPO_NAME }}
130+
ref: ${{ env.CWA_GITHUB_TEST_REPO_BRANCH }}
131+
132+
- name: Configure AWS Credentials
133+
uses: aws-actions/configure-aws-credentials@v4
134+
with:
135+
role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
136+
aws-region: ${{ env.AWS_REGION }}
137+
role-duration-seconds: ${{ env.TERRAFORM_AWS_ASSUME_ROLE_DURATION }}
138+
139+
- name: Login ECR
140+
id: login-ecr
141+
uses: aws-actions/amazon-ecr-login@v2
142+
143+
- name: Install kubectl
144+
uses: azure/setup-kubectl@v3
145+
with:
146+
version: 'latest'
147+
148+
- name: Update kubeconfig
149+
run: |
150+
aws eks update-kubeconfig --name $CLUSTER_NAME --region $AWS_REGION
151+
152+
- name: Install Sample Application
153+
uses: nick-fields/retry@v2
154+
with:
155+
max_attempts: 2
156+
timeout_minutes: 20
157+
command: |
158+
cd test/performance/eks/resources
159+
kubectl apply -f petclinic-sample-app
160+
echo "Waiting 15 minutes for the application to initialize..."
161+
sleep 900
162+
163+
- name: Run Performance Test
164+
uses: nick-fields/retry@v2
165+
with:
166+
max_attempts: 2
167+
timeout_minutes: 20
168+
command: |
169+
go test -timeout 30m -v $CWA_TEST_DIRECTORY \
170+
-computeType=EKS \
171+
-eksClusterName=$CLUSTER_NAME \
172+
-performanceMetricMapName=${{ inputs.metric_map || 'base-performance-metrics-map.json' }} \
173+
-performanceTestName=EKSPerformanceBaseTest
174+
175+
- name: Cleanup Sample Application
176+
if: always()
177+
run: |
178+
cd test/performance/eks/resources
179+
kubectl delete -f petclinic-sample-app
180+
echo "Sample application resources have been deleted"
181+

0 commit comments

Comments
 (0)