Skip to content

Commit dae12ee

Browse files
authored
[EKS Scalability Testing] Implement workflows to execute large scale performance test (#1794)
1 parent 7215c51 commit dae12ee

File tree

3 files changed

+251
-12
lines changed

3 files changed

+251
-12
lines changed

.github/workflows/eks-performance-cluster-addon-install.yml

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ env:
6767

6868
# Github repository environment variables
6969
OPERATOR_GITHUB_REPO_NAME: "aws/amazon-cloudwatch-agent-operator"
70+
CWA_GITHUB_TEST_REPO_NAME: "aws/amazon-cloudwatch-agent-test"
71+
CWA_GITHUB_TEST_REPO_BRANCH: "main"
7072

7173
jobs:
7274
# Check if this workflow should run
@@ -171,11 +173,27 @@ jobs:
171173
run: |
172174
aws eks update-kubeconfig --name $CLUSTER_NAME --region $AWS_REGION
173175
176+
# TODO: Revert to using main helm branch when changes from leader-election are merged in
174177
- name: Clone Helm Charts Repository
175178
run: |
176179
rm -rf ./helm-charts
177-
git clone -b ${{ inputs.helm-charts-branch || 'main' }} https://github.com/aws-observability/helm-charts.git ./helm-charts
180+
git clone -b ${{ inputs.helm-charts-branch || 'sky333999/leader-election' }} https://github.com/aws-observability/helm-charts.git ./helm-charts
178181
182+
- name: Clone Test Repo
183+
uses: actions/checkout@v4
184+
with:
185+
repository: ${{ env.CWA_GITHUB_TEST_REPO_NAME }}
186+
ref: ${{ env.CWA_GITHUB_TEST_REPO_BRANCH }}
187+
path: ./test-repo
188+
189+
- name: Replace hostname in override files
190+
run: |
191+
HOSTNAME=$(kubectl get nodes -l eks.amazonaws.com/nodegroup=$CLUSTER_NAME-leader-node -o jsonpath='{.items[0].metadata.name}')
192+
for file in ./test-repo/test/performance/eks/resources/leader_election_overrides/*; do
193+
sed -i "s/<hostname>/$HOSTNAME/g" "$file"
194+
done
195+
196+
# TODO: Revert to using workflow built agent image once required changes are made on main branch
179197
- name: Check node count and manage Helm chart
180198
run: |
181199
NODE_COUNT=$(kubectl get nodes --no-headers | wc -l)
@@ -196,18 +214,19 @@ jobs:
196214
echo "MANAGER_TAG: ${{ inputs.cloudwatch_agent_operator_tag || needs.GetLatestOperatorCommitSHA.outputs.operator_commit_sha }}"
197215
echo "MANAGER_REPOSITORY_DOMAIN: ${{ steps.login-ecr.outputs.registry }}"
198216
199-
helm upgrade --install amazon-cloudwatch-observability \
217+
helm upgrade --install --wait amazon-cloudwatch-observability \
200218
./helm-charts/charts/amazon-cloudwatch-observability \
201219
--namespace amazon-cloudwatch \
202220
--create-namespace \
203221
--set clusterName=${{ inputs.cluster_name || env.CLUSTER_NAME }} \
204222
--set region=${{ inputs.region || env.AWS_REGION }} \
205-
--set agent.image.repository=${{ inputs.cloudwatch_agent_repository || env.AGENT_ECR_TEST_REPO }} \
206-
--set agent.image.tag=${{ inputs.cloudwatch_agent_tag || github.sha }} \
207-
--set agent.image.repositoryDomainMap.public=${{ steps.login-ecr.outputs.registry }} \
223+
--set agent.image.repository="cloudwatch-agent" \
224+
--set agent.image.tag="latest" \
225+
--set agent.image.repositoryDomainMap.public="public.ecr.aws/q4e2d9n7" \
208226
--set manager.image.repository=${{ inputs.cloudwatch_agent_operator_repository || env.OPERATOR_ECR_TEST_REPO }} \
209227
--set manager.image.tag=${{ inputs.cloudwatch_agent_operator_tag || needs.GetLatestOperatorCommitSHA.outputs.operator_commit_sha }} \
210-
--set manager.image.repositoryDomainMap.public=${{ steps.login-ecr.outputs.registry }}
228+
--set manager.image.repositoryDomainMap.public=${{ steps.login-ecr.outputs.registry }} \
229+
--values ./test-repo/test/performance/eks/resources/leader_election_overrides/base-overrides.yml
211230
fi
212231
213232
cleanup-on-failure:

.github/workflows/eks-performance-cluster-scaling.yml

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,19 @@ on:
2727
description: 'Count of node groups'
2828
type: number
2929
default: 10
30+
leader_node_desired_capacity:
31+
description: 'Desired capacity for leader node group (manual execution only)'
32+
type: number
33+
default: 1
3034

3135
env:
3236
AWS_REGION: ${{ inputs.region || 'us-west-2' }}
3337
CLUSTER_NAME: ${{ inputs.cluster_name || 'eks-performance' }}
3438
NODE_GROUP_COUNT: ${{ inputs.node_group_count || 10 }}
3539
DESIRED_CAPACITY_PER_NODEGROUP: ${{ inputs.desired_capacity_per_nodegroup || 500 }}
40+
LEADER_NODE_DESIRED_CAPACITY: ${{ inputs.leader_node_desired_capacity || 1 }}
3641
TERRAFORM_AWS_ASSUME_ROLE: ${{ vars.TERRAFORM_AWS_ASSUME_ROLE }}
37-
TERRAFORM_AWS_ASSUME_ROLE_DURATION: 3600 # 1 hour duration
42+
TERRAFORM_AWS_ASSUME_ROLE_DURATION: 14400 # 4 hour duration
3843
CWA_GITHUB_TEST_REPO_NAME: "aws/amazon-cloudwatch-agent-test"
3944
CWA_GITHUB_TEST_REPO_URL: "https://github.com/aws/amazon-cloudwatch-agent-test.git"
4045
CWA_GITHUB_TEST_REPO_BRANCH: "main"
@@ -54,7 +59,7 @@ jobs:
5459
- name: Configure AWS Credentials
5560
uses: aws-actions/configure-aws-credentials@v4
5661
with:
57-
role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE}}
62+
role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
5863
aws-region: ${{ inputs.region || 'us-west-2' }}
5964
role-duration-seconds: ${{ env.TERRAFORM_AWS_ASSUME_ROLE_DURATION }}
6065

@@ -72,6 +77,17 @@ jobs:
7277
run: |
7378
echo "Starting scale UP operation with desired capacity: $DESIRED_CAPACITY_PER_NODEGROUP"
7479
80+
# Scale leader node to 1
81+
echo "Scaling leader node group: $CLUSTER_NAME-leader-node to 1"
82+
aws eks update-nodegroup-config \
83+
--cluster-name $CLUSTER_NAME \
84+
--nodegroup-name $CLUSTER_NAME-leader-node \
85+
--region $AWS_REGION \
86+
--scaling-config desiredSize=1
87+
88+
echo "Waiting 1 minute before scaling regular node groups..."
89+
sleep 60
90+
7591
for i in $(seq 1 $NODE_GROUP_COUNT); do
7692
echo "Scaling node group: $CLUSTER_NAME-node-${i} to $DESIRED_CAPACITY_PER_NODEGROUP"
7793
aws eks update-nodegroup-config \
@@ -89,6 +105,17 @@ jobs:
89105
run: |
90106
echo "Starting scale DOWN operation with desired capacity: 0"
91107
108+
# Scale leader node to 0
109+
echo "Scaling leader node group: $CLUSTER_NAME-leader-node to 0"
110+
aws eks update-nodegroup-config \
111+
--cluster-name $CLUSTER_NAME \
112+
--nodegroup-name $CLUSTER_NAME-leader-node \
113+
--region $AWS_REGION \
114+
--scaling-config desiredSize=0
115+
116+
echo "Waiting 1 minute before scaling regular node groups..."
117+
sleep 60
118+
92119
for i in $(seq 1 $NODE_GROUP_COUNT); do
93120
echo "Scaling node group: $CLUSTER_NAME-node-${i} to 0"
94121
aws eks update-nodegroup-config \
@@ -105,6 +132,18 @@ jobs:
105132
if: github.event_name == 'workflow_dispatch'
106133
run: |
107134
echo "Starting manual scaling operation with desired capacity: $DESIRED_CAPACITY_PER_NODEGROUP"
135+
echo "Leader node desired capacity: $LEADER_NODE_DESIRED_CAPACITY"
136+
137+
# Scale leader node to specified capacity
138+
echo "Scaling leader node group: $CLUSTER_NAME-leader-node to $LEADER_NODE_DESIRED_CAPACITY"
139+
aws eks update-nodegroup-config \
140+
--cluster-name $CLUSTER_NAME \
141+
--nodegroup-name $CLUSTER_NAME-leader-node \
142+
--region $AWS_REGION \
143+
--scaling-config desiredSize=$LEADER_NODE_DESIRED_CAPACITY
144+
145+
echo "Waiting 1 minute before scaling regular node groups..."
146+
sleep 60
108147
109148
for i in $(seq 1 $NODE_GROUP_COUNT); do
110149
echo "Scaling node group: $CLUSTER_NAME-node-${i} to $DESIRED_CAPACITY_PER_NODEGROUP"
@@ -118,17 +157,17 @@ jobs:
118157
119158
- name: Validate total node count
120159
run: |
121-
echo "Waiting 20 minutes for scaling operations to complete..."
122-
sleep 1200
160+
echo "Waiting 30 minutes for scaling operations to complete and stabilize..."
161+
sleep 1800
123162
124163
echo "Validating total number of nodes in the cluster..."
125164
ACTUAL_NODE_COUNT=$(kubectl get nodes --no-headers | wc -l)
126165
127166
# Determine expected count based on trigger type
128167
if [ "${{ github.event.schedule }}" = "0 21 * * 1" ]; then
129-
EXPECTED_NODE_COUNT=$(($NODE_GROUP_COUNT * 0))
168+
EXPECTED_NODE_COUNT=0
130169
else
131-
EXPECTED_NODE_COUNT=$(($NODE_GROUP_COUNT * $DESIRED_CAPACITY_PER_NODEGROUP))
170+
EXPECTED_NODE_COUNT=$(($NODE_GROUP_COUNT * $DESIRED_CAPACITY_PER_NODEGROUP + $LEADER_NODE_DESIRED_CAPACITY))
132171
fi
133172
134173
echo "Expected total nodes: $EXPECTED_NODE_COUNT"
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: MIT
3+
name: "EKS Performance Cluster Tests"
4+
on:
5+
# Use workflow_run to trigger this workflow after the scaling workflow completes
6+
workflow_run:
7+
workflows: [ "EKS Performance Test Run" ]
8+
types:
9+
- completed
10+
branches:
11+
- main # Adjust this if your default branch is different
12+
13+
# Keep the manual trigger option
14+
workflow_dispatch:
15+
inputs:
16+
# Required Core Settings
17+
cluster_name:
18+
description: 'EKS Cluster Name'
19+
required: true
20+
type: string
21+
default: 'eks-performance'
22+
region:
23+
description: 'AWS Region'
24+
required: true
25+
type: string
26+
default: 'us-west-2'
27+
metric_map:
28+
description: 'Map containing metrics to validate'
29+
type: string
30+
31+
# Optional Settings
32+
terraform_assume_role:
33+
description: 'AWS IAM Role to assume'
34+
type: string
35+
test_repo_name:
36+
description: 'Agent test repo'
37+
type: string
38+
test_repo_branch:
39+
description: 'Agent test repo branch'
40+
type: string
41+
test_dir:
42+
description: 'Agent test directory'
43+
type: string
44+
45+
46+
concurrency:
47+
group: ${{ github.workflow }}-${{ github.ref }}
48+
cancel-in-progress: true
49+
50+
env:
51+
# Cluster environment variables
52+
AWS_REGION: ${{ inputs.region || 'us-west-2' }}
53+
CLUSTER_NAME: ${{ inputs.cluster_name || 'eks-performance' }}
54+
TERRAFORM_AWS_ASSUME_ROLE: ${{ inputs.terraform_assume_role || vars.TERRAFORM_AWS_ASSUME_ROLE }}
55+
TERRAFORM_AWS_ASSUME_ROLE_DURATION: 14400 # 4 hour duration
56+
57+
# Agent test repo environment variables
58+
CWA_GITHUB_TEST_REPO_NAME: ${{ inputs.test_repo_name || 'aws/amazon-cloudwatch-agent-test' }}
59+
CWA_GITHUB_TEST_REPO_BRANCH: ${{ inputs.test_repo_branch || 'main' }}
60+
CWA_TEST_DIRECTORY: ${{ inputs.test_dir || './test/performance/eks' }}
61+
62+
jobs:
63+
# Check if this workflow should run; tests are skipped when no nodes exist
64+
check-trigger:
65+
runs-on: ubuntu-latest
66+
if: ${{ github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.event == 'schedule') }}
67+
permissions:
68+
id-token: write
69+
contents: read
70+
steps:
71+
- name: Check trigger type
72+
id: check-trigger
73+
run: |
74+
if [ "${{ github.event_name }}" == "workflow_run" ]; then
75+
echo "Triggered by workflow_run from a scheduled event"
76+
else
77+
echo "Triggered manually via workflow_dispatch"
78+
fi
79+
80+
- name: Configure AWS Credentials
81+
uses: aws-actions/configure-aws-credentials@v4
82+
with:
83+
role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
84+
aws-region: ${{ env.AWS_REGION }}
85+
role-duration-seconds: ${{ env.TERRAFORM_AWS_ASSUME_ROLE_DURATION }}
86+
87+
- name: Install kubectl
88+
uses: azure/setup-kubectl@v3
89+
with:
90+
version: 'latest'
91+
92+
- name: Update kubeconfig
93+
run: |
94+
aws eks update-kubeconfig --name $CLUSTER_NAME --region $AWS_REGION
95+
96+
- name: Override should_continue based on node count
97+
id: final-check
98+
run: |
99+
NODE_COUNT=$(kubectl get nodes --no-headers | wc -l)
100+
echo "Node count: $NODE_COUNT"
101+
102+
if [ "$NODE_COUNT" -eq 0 ]; then
103+
echo "No nodes available, setting should_continue to false"
104+
echo "should_continue=false" >> "$GITHUB_OUTPUT"
105+
else
106+
echo "Nodes available, setting should_continue as true"
107+
echo "should_continue=true" >> "$GITHUB_OUTPUT"
108+
fi
109+
110+
outputs:
111+
should_continue: ${{ steps.final-check.outputs.should_continue }}
112+
113+
EKSPerformanceBaseTest:
114+
name: EKSPerformanceBaseTest
115+
needs: [ check-trigger ]
116+
if: ${{ needs.check-trigger.outputs.should_continue == 'true' }}
117+
runs-on: ubuntu-latest
118+
permissions:
119+
id-token: write
120+
contents: read
121+
steps:
122+
- name: Set up Go 1.x
123+
uses: actions/setup-go@v4
124+
with:
125+
go-version: ~1.24.4
126+
127+
- uses: actions/checkout@v4
128+
with:
129+
repository: ${{ env.CWA_GITHUB_TEST_REPO_NAME }}
130+
ref: ${{ env.CWA_GITHUB_TEST_REPO_BRANCH }}
131+
132+
- name: Configure AWS Credentials
133+
uses: aws-actions/configure-aws-credentials@v4
134+
with:
135+
role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
136+
aws-region: ${{ env.AWS_REGION }}
137+
role-duration-seconds: ${{ env.TERRAFORM_AWS_ASSUME_ROLE_DURATION }}
138+
139+
- name: Login ECR
140+
id: login-ecr
141+
uses: aws-actions/amazon-ecr-login@v2
142+
143+
- name: Install kubectl
144+
uses: azure/setup-kubectl@v3
145+
with:
146+
version: 'latest'
147+
148+
- name: Update kubeconfig
149+
run: |
150+
aws eks update-kubeconfig --name $CLUSTER_NAME --region $AWS_REGION
151+
152+
- name: Install Sample Application
153+
uses: nick-fields/retry@v2
154+
with:
155+
max_attempts: 2
156+
timeout_minutes: 20
157+
command: |
158+
cd test/performance/eks/resources
159+
kubectl apply -f petclinic-sample-app
160+
echo "Waiting 15 minutes for the application to initialize..."
161+
sleep 900
162+
163+
- name: Run Performance Test
164+
uses: nick-fields/retry@v2
165+
with:
166+
max_attempts: 2
167+
timeout_minutes: 20
168+
command: |
169+
go test -timeout 30m -v $CWA_TEST_DIRECTORY \
170+
-computeType=EKS \
171+
-eksClusterName=$CLUSTER_NAME \
172+
-performanceMetricMapName=${{ inputs.metric_map || 'base-performance-metrics-map.json' }} \
173+
-performanceTestName=EKSPerformanceBaseTest
174+
175+
- name: Cleanup Sample Application
176+
if: always()
177+
run: |
178+
cd test/performance/eks/resources
179+
kubectl delete -f petclinic-sample-app
180+
echo "Sample application resources have been deleted"
181+

0 commit comments

Comments
 (0)