Skip to content

Commit 3c5dfb8

Browse files
Add Task to scale down replicas for addons in order to fix the aiml-load Pipeline (#576)
Description / Motivation: Addons such as coredns and ebs-csi-controller have PodDisruptionBudgets (PDBs) set, which prevents Karpenter from scaling down the nodepools. This change adds a Task that scales these addons' replicas to 0 before the nodepools are scaled down in the aiml-load pipeline. To stop the Karpenter logs Task once the nodepools have been scaled down, the Task also includes a step that sets the Karpenter replicas to zero. This forces the Karpenter log stream to stop; otherwise it keeps running even after the job is done and prevents the Teardown step from running. Desktop Testing: Tested by triggering a Tekton test run. Co-authored-by: Chithresh Azad <[email protected]>
1 parent 3ec214d commit 3c5dfb8

File tree

2 files changed

+117
-9
lines changed

2 files changed

+117
-9
lines changed

tests/tekton-resources/pipelines/eks/awscli-eks-aiml-load.yaml

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,6 @@ spec:
4848
- default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_node_role.json
4949
name: node-role-cfn-url
5050
type: string
51-
- name: manifest-id
52-
type: string
5351
- default: ""
5452
name: eksadm-s3-path
5553
type: string
@@ -194,8 +192,6 @@ spec:
194192
value: $(params.endpoint)
195193
- name: vpc-stack-name
196194
value: $(params.cluster-name)
197-
- name: manifest-id
198-
value: $(params.manifest-id)
199195
- name: eksadm-s3-path
200196
value: $(params.eksadm-s3-path)
201197
- name: kubernetes-version
@@ -602,6 +598,24 @@ spec:
602598
taskRef:
603599
kind: Task
604600
name: load-aiml-multiple-fine-tuning
601+
# Scales the CoreDNS and EBS CSI controller addons to zero replicas after the
# load test and before any nodepool is scaled down. Per the commit description,
# these addons have PDBs that otherwise keep Karpenter from scaling the
# nodepools down.
- name: scale-down-addons
  params:
    - name: cluster-name
      value: $(params.cluster-name)
    # Replica counts are quoted: the scale-addons Task declares them as string
    # params (default ""), and the stop-karpenter-logs entry passes "0" too.
    - name: coredns-replicas
      value: "0"
    - name: ebs-csi-replicas
      value: "0"
    - name: endpoint
      value: $(params.endpoint)
  runAfter:
    - load-aiml-multiple-fine-tuning
  taskRef:
    kind: Task
    name: scale-addons
  workspaces:
    # The Task writes and reads its kubeconfig inside this workspace.
    - name: config
      workspace: config
605619
- name: scale-down-training
606620
params:
607621
- name: cluster-name
@@ -613,7 +627,7 @@ spec:
613627
- name: replicas
614628
value: 0
615629
runAfter:
616-
- load-aiml-multiple-fine-tuning
630+
- scale-down-addons
617631
taskRef:
618632
kind: Task
619633
name: scale-nodepool
@@ -643,7 +657,7 @@ spec:
643657
- name: replicas
644658
value: 0
645659
runAfter:
646-
- load-aiml-multiple-fine-tuning
660+
- scale-down-addons
647661
taskRef:
648662
kind: Task
649663
name: scale-nodepool
@@ -673,7 +687,7 @@ spec:
673687
- name: replicas
674688
value: 0
675689
runAfter:
676-
- load-aiml-multiple-fine-tuning
690+
- scale-down-addons
677691
taskRef:
678692
kind: Task
679693
name: scale-nodepool
@@ -703,7 +717,7 @@ spec:
703717
- name: replicas
704718
value: 0
705719
runAfter:
706-
- load-aiml-multiple-fine-tuning
720+
- scale-down-addons
707721
taskRef:
708722
kind: Task
709723
name: scale-nodepool
@@ -733,7 +747,7 @@ spec:
733747
- name: replicas
734748
value: 0
735749
runAfter:
736-
- load-aiml-multiple-fine-tuning
750+
- scale-down-addons
737751
taskRef:
738752
kind: Task
739753
name: scale-nodepool
@@ -752,6 +766,26 @@ spec:
752766
taskRef:
753767
kind: Task
754768
name: nodepool-replicas-wait
769+
# Reuses the scale-addons Task to scale only Karpenter to zero once every
# nodepool scale-down wait has finished. Per the commit description, this
# force-stops the Karpenter log stream so the teardown step can run.
- name: stop-karpenter-logs
  taskRef:
    kind: Task
    name: scale-addons
  runAfter:
    - wait-for-scale-down-training
    - wait-for-scale-down-inference
    - wait-for-scale-down-operator
    - wait-for-scale-down-monitoring
    - wait-for-scale-down-titan-pool
  params:
    - name: cluster-name
      value: $(params.cluster-name)
    - name: endpoint
      value: $(params.endpoint)
    # coredns-replicas / ebs-csi-replicas are left at their empty defaults,
    # so the Task skips those two steps.
    - name: karpenter-replicas
      value: "0"
  workspaces:
    - name: config
      workspace: config
755789
finally:
756790
- name: teardown
757791
retries: 10 # To deal with throttling during deletion
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
apiVersion: tekton.dev/v1beta1
kind: Task
metadata:
  name: scale-addons
  namespace: scalability
spec:
  description: |
    Optionally scales the CoreDNS, EBS CSI Controller, and Karpenter deployments to the specified number of replicas.
    This task configures kubectl access to the EKS cluster and scales only the deployments for which replica counts are provided.
    Waits for the CoreDNS and EBS CSI Controller rollouts to complete; the Karpenter scale request is issued without waiting.
  params:
    - name: coredns-replicas
      description: Number of replicas to scale CoreDNS to (optional, skipped if empty)
      default: ""
    - name: ebs-csi-replicas
      description: Number of replicas to scale EBS CSI Controller to (optional, skipped if empty)
      default: ""
    - name: karpenter-replicas
      description: Number of replicas to scale Karpenter to (optional, skipped if empty)
      default: ""
    - name: cluster-name
      description: The name of the EKS cluster
    - name: endpoint
      description: EKS cluster endpoint URL (optional)
      default: ""
    - name: aws-region
      description: AWS region where the cluster is located
      default: "us-west-2"
    - name: timeout
      description: Timeout for rollout status in seconds
      default: "1800"
  workspaces:
    - name: config
      mountPath: /config/
  stepTemplate:
    env:
      # Every step shares the kubeconfig stored in the config workspace.
      - name: KUBECONFIG
        value: /config/kubeconfig
  steps:
    # Writes the cluster's kubeconfig to $KUBECONFIG for the later steps.
    # ENDPOINT_FLAG is deliberately expanded unquoted so it splits into
    # "--endpoint <url>" or vanishes when no endpoint is given; the other
    # parameter substitutions are quoted so each stays a single shell word.
    - name: update-kubeconfig
      image: alpine/k8s:1.35.0
      script: |
        ENDPOINT_FLAG=""
        if [ -n "$(params.endpoint)" ]; then
          ENDPOINT_FLAG="--endpoint $(params.endpoint)"
        fi
        aws eks $ENDPOINT_FLAG update-kubeconfig --name "$(params.cluster-name)" --region "$(params.aws-region)"
    # Scales CoreDNS and waits for the rollout; skipped when no count is given.
    - name: scale-coredns
      image: alpine/k8s:1.35.0
      script: |
        if [ -n "$(params.coredns-replicas)" ]; then
          kubectl scale deployment coredns -n kube-system --replicas="$(params.coredns-replicas)"
          kubectl rollout status deployment/coredns -n kube-system --timeout="$(params.timeout)s"
        else
          echo "Skipping CoreDNS scaling (coredns-replicas not provided)"
        fi
    # Scales the EBS CSI controller and waits for the rollout; skipped when no
    # count is given.
    - name: scale-ebs-csi-controller
      image: alpine/k8s:1.35.0
      script: |
        if [ -n "$(params.ebs-csi-replicas)" ]; then
          kubectl scale deployment ebs-csi-controller -n kube-system --replicas="$(params.ebs-csi-replicas)"
          kubectl rollout status deployment/ebs-csi-controller -n kube-system --timeout="$(params.timeout)s"
        else
          echo "Skipping EBS CSI Controller scaling (ebs-csi-replicas not provided)"
        fi
    # Scales Karpenter; unlike the two steps above, this one does not run
    # `kubectl rollout status` afterwards.
    - name: scale-karpenter
      image: alpine/k8s:1.35.0
      script: |
        if [ -n "$(params.karpenter-replicas)" ]; then
          kubectl scale deployment karpenter -n karpenter --replicas="$(params.karpenter-replicas)"
          echo "Karpenter scaled to $(params.karpenter-replicas) replicas"
        else
          echo "Skipping Karpenter scaling (karpenter-replicas not provided)"
        fi

0 commit comments

Comments
 (0)