
Commit 1b921ca

Improve Karpenter cleanup workflow with timeouts
Added timeouts and enhanced error handling for Karpenter cleanup operations, including scaling down workloads and deleting resources. Improved the namespace deletion process with force cleanup for stuck namespaces.
Parent: e755cd3
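
The change applies one pattern throughout the destroy job: every cleanup command is bounded by a --timeout and followed by || true, so a hung or already-deleted resource can neither stall the step nor fail it. A minimal sketch of that pattern outside the workflow, using placeholder names (demo-app, demo-release, demo-ns) rather than anything from this repo:

#!/usr/bin/env bash
# Sketch only: bound each cleanup call and tolerate failures so teardown never hangs.
# demo-app, demo-release and demo-ns are placeholders, not values from this repository.
set -u

kubectl delete deployment demo-app -n demo-ns --ignore-not-found --timeout=60s || true
helm uninstall demo-release -n demo-ns --timeout=300s || true
kubectl delete namespace demo-ns --ignore-not-found --timeout=120s || true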


.github/workflows/destroy.yml

Lines changed: 100 additions & 91 deletions
@@ -29,18 +29,6 @@ jobs:
           echo "KARPENTER_NODE_ROLE: ${{ vars.KARPENTER_NODE_ROLE }}"
           echo "KARPENTER_INSTANCE_PROFILE: ${{ vars.KARPENTER_INSTANCE_PROFILE }}"
           echo "KARPENTER_NAMESPACE: ${{ vars.KARPENTER_NAMESPACE }}"
-          if [[ -z "${{ vars.KARPENTER_NODEPOOL_NAME }}" ]]; then
-            echo "WARNING: KARPENTER_NODEPOOL_NAME variable not found. Karpenter resources may not be deployed."
-          fi
-          if [[ -z "${{ vars.KARPENTER_NODE_ROLE }}" ]]; then
-            echo "WARNING: KARPENTER_NODE_ROLE variable not found. Karpenter resources may not be deployed."
-          fi
-          if [[ -z "${{ vars.KARPENTER_INSTANCE_PROFILE }}" ]]; then
-            echo "WARNING: KARPENTER_INSTANCE_PROFILE variable not found. Karpenter resources may not be deployed."
-          fi
-          if [[ -z "${{ vars.KARPENTER_NAMESPACE }}" ]]; then
-            echo "WARNING: KARPENTER_NAMESPACE variable not found. Karpenter resources may not be deployed."
-          fi
           if [[ -z "${{ vars.CLUSTER_NAME }}" ]]; then
             echo "ERROR: CLUSTER_NAME variable not found. Infrastructure may not be deployed."
             exit 1
@@ -72,81 +60,69 @@ jobs:
         continue-on-error: true

       # ---------------------------
-      # Delete ArgoCD Applications
+      # Delete ArgoCD Applications with timeout
       # ---------------------------
       - name: Delete ArgoCD Applications
         run: |
-          kubectl delete application ${{ vars.APP_NAME }} -n ${{ vars.ARGOCD_NAMESPACE }} --ignore-not-found
-          kubectl delete application kube-prometheus-stack -n ${{ vars.ARGOCD_NAMESPACE }} --ignore-not-found
+          kubectl delete application ${{ vars.APP_NAME }} -n ${{ vars.ARGOCD_NAMESPACE }} --ignore-not-found --timeout=60s || true
+          kubectl delete application kube-prometheus-stack -n ${{ vars.ARGOCD_NAMESPACE }} --ignore-not-found --timeout=60s || true
         continue-on-error: true

       # ---------------------------
-      # Delete Karpenter Resources (Updated)
-      # ---------------------------
-
-      # Replace the Karpenter cleanup section in your workflow with this enhanced version
-
-      # ---------------------------
-      # Scale down workloads FIRST
+      # Scale down workloads FIRST with timeouts
       # ---------------------------
       - name: Scale down all workloads before cleanup
         run: |
           echo "Scaling down all deployments and deleting services to trigger LB cleanup..."

-          # Scale down ALL deployments across ALL namespaces (not just app namespace)
+          # Scale down ALL deployments across ALL namespaces with timeout
           kubectl get deployments --all-namespaces -o json | jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace deployment; do
             echo "Scaling down deployment $deployment in namespace $namespace"
-            kubectl scale deployment $deployment --replicas=0 -n $namespace || true
+            kubectl scale deployment $deployment --replicas=0 -n $namespace --timeout=30s || true
           done

           # Scale down daemonsets that might be running
           kubectl get daemonsets --all-namespaces -o json | jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace daemonset; do
             echo "Deleting daemonset $daemonset in namespace $namespace"
-            kubectl delete daemonset $daemonset -n $namespace --ignore-not-found || true
+            kubectl delete daemonset $daemonset -n $namespace --ignore-not-found --timeout=60s || true
           done

           # Delete ALL services of type LoadBalancer IMMEDIATELY
           echo "Deleting LoadBalancer services..."
           kubectl get services --all-namespaces -o json | jq -r '.items[] | select(.spec.type=="LoadBalancer") | "\(.metadata.namespace) \(.metadata.name)"' | while read namespace service; do
             echo "Deleting service $service in namespace $namespace"
-            kubectl delete service $service -n $namespace --ignore-not-found || true
+            kubectl delete service $service -n $namespace --ignore-not-found --timeout=60s || true
           done
+
+          # Wait for LoadBalancers to be cleaned up
+          echo "Waiting for LoadBalancers to be cleaned up..."
+          sleep 60

+      # ---------------------------
+      # Clean up Karpenter Resources with enhanced error handling
+      # ---------------------------
       - name: Delete Karpenter Provisioners and AWSNodeTemplates
         run: |
           echo "Deleting Karpenter Provisioners..."
-          kubectl delete provisioner ${{ vars.KARPENTER_NODEPOOL_NAME }} --ignore-not-found -n ${{ vars.KARPENTER_NAMESPACE }} || true
-          kubectl delete provisioner --all -n ${{ vars.KARPENTER_NAMESPACE }} --ignore-not-found || true
-
+          kubectl delete provisioner ${{ vars.KARPENTER_NODEPOOL_NAME }} --ignore-not-found -n ${{ vars.KARPENTER_NAMESPACE }} --timeout=60s || true
+          kubectl delete provisioner --all -n ${{ vars.KARPENTER_NAMESPACE }} --ignore-not-found --timeout=60s || true
+
           echo "Deleting Karpenter AWSNodeTemplates..."
-          kubectl delete awsnodetemplate ${{ vars.KARPENTER_NODECLASS_NAME }} --ignore-not-found -n ${{ vars.KARPENTER_NAMESPACE }} || true
-          kubectl delete awsnodetemplate --all -n ${{ vars.KARPENTER_NAMESPACE }} --ignore-not-found || true
-
-          echo "Waiting for resources to be cleaned up..."
-          sleep 10
-        continue-on-error: true
-
-      - name: Clean up Karpenter CRDs and Webhooks
-        run: |
-          echo "Deleting Karpenter CRDs..."
-          kubectl delete crd provisioners.karpenter.sh --ignore-not-found || true
-          kubectl delete crd awsnodetemplates.karpenter.k8s.aws --ignore-not-found || true
+          kubectl delete awsnodetemplate ${{ vars.KARPENTER_NODECLASS_NAME }} --ignore-not-found -n ${{ vars.KARPENTER_NAMESPACE }} --timeout=60s || true
+          kubectl delete awsnodetemplate --all -n ${{ vars.KARPENTER_NAMESPACE }} --ignore-not-found --timeout=60s || true

-          echo "Deleting Karpenter webhooks..."
-          kubectl delete validatingwebhookconfiguration defaulting.webhook.karpenter.sh --ignore-not-found || true
-          kubectl delete validatingwebhookconfiguration validation.webhook.karpenter.sh --ignore-not-found || true
-          kubectl delete mutatingwebhookconfiguration defaulting.webhook.karpenter.sh --ignore-not-found || true
+          echo "Deleting new Karpenter resources (NodePools, EC2NodeClasses)..."
+          kubectl delete nodepool --all --ignore-not-found --timeout=60s || true
+          kubectl delete ec2nodeclass --all --ignore-not-found --timeout=60s || true

-          echo "Deleting Karpenter finalizers if stuck..."
-          kubectl patch crd provisioners.karpenter.sh -p '{"metadata":{"finalizers":[]}}' --type=merge || true
-          kubectl patch crd awsnodetemplates.karpenter.k8s.aws -p '{"metadata":{"finalizers":[]}}' --type=merge || true
+          echo "Waiting for resources to be cleaned up..."
+          sleep 30
         continue-on-error: true

-
       - name: Uninstall Karpenter Helm Release
         run: |
           echo "Uninstalling Karpenter Helm release..."
-          helm uninstall karpenter -n ${{ vars.KARPENTER_NAMESPACE }} || true
+          helm uninstall karpenter -n ${{ vars.KARPENTER_NAMESPACE }} --timeout=300s || true

           echo "Waiting for pods to terminate..."
           kubectl wait --for=delete pod -l app.kubernetes.io/name=karpenter -n ${{ vars.KARPENTER_NAMESPACE }} --timeout=120s || true
@@ -158,82 +134,115 @@ jobs:
       - name: Clean up Karpenter CRDs and Webhooks
         run: |
           echo "Deleting Karpenter CRDs..."
-          kubectl delete crd nodepools.karpenter.sh --ignore-not-found || true
-          kubectl delete crd provisioners.karpenter.sh --ignore-not-found || true
-          kubectl delete crd awsnodetemplates.karpenter.k8s.aws --ignore-not-found || true
-          kubectl delete crd ec2nodeclasses.karpenter.k8s.aws --ignore-not-found || true
+          kubectl delete crd provisioners.karpenter.sh --ignore-not-found --timeout=60s || true
+          kubectl delete crd awsnodetemplates.karpenter.k8s.aws --ignore-not-found --timeout=60s || true
+          kubectl delete crd nodepools.karpenter.sh --ignore-not-found --timeout=60s || true
+          kubectl delete crd ec2nodeclasses.karpenter.k8s.aws --ignore-not-found --timeout=60s || true

           echo "Deleting Karpenter webhooks..."
           kubectl delete validatingwebhookconfiguration defaulting.webhook.karpenter.sh --ignore-not-found || true
           kubectl delete validatingwebhookconfiguration validation.webhook.karpenter.sh --ignore-not-found || true
           kubectl delete mutatingwebhookconfiguration defaulting.webhook.karpenter.sh --ignore-not-found || true

-          echo "Deleting Karpenter finalizers if stuck..."
+          echo "Removing finalizers from stuck CRDs..."
+          kubectl patch crd provisioners.karpenter.sh -p '{"metadata":{"finalizers":[]}}' --type=merge || true
+          kubectl patch crd awsnodetemplates.karpenter.k8s.aws -p '{"metadata":{"finalizers":[]}}' --type=merge || true
           kubectl patch crd nodepools.karpenter.sh -p '{"metadata":{"finalizers":[]}}' --type=merge || true
           kubectl patch crd ec2nodeclasses.karpenter.k8s.aws -p '{"metadata":{"finalizers":[]}}' --type=merge || true
         continue-on-error: true

       # ---------------------------
-      # Uninstall Other Helm Releases
+      # Uninstall Other Helm Releases with timeouts
       # ---------------------------
       - name: Uninstall Helm Releases
         run: |
-          helm uninstall ${{ vars.APP_NAME }} -n ${{ vars.APP_NAMESPACE }} || true
-          helm uninstall kube-prometheus-stack -n ${{ vars.MONITORING_NAMESPACE }} || true
-          helm uninstall ingress-nginx -n ingress-nginx || true
-          helm uninstall argocd -n ${{ vars.ARGOCD_NAMESPACE }} || true
+          helm uninstall ${{ vars.APP_NAME }} -n ${{ vars.APP_NAMESPACE }} --timeout=300s || true
+          helm uninstall kube-prometheus-stack -n ${{ vars.MONITORING_NAMESPACE }} --timeout=300s || true
+          helm uninstall ingress-nginx -n ingress-nginx --timeout=300s || true
+          helm uninstall argocd -n ${{ vars.ARGOCD_NAMESPACE }} --timeout=300s || true
         continue-on-error: true

       # ---------------------------
-      # Delete Namespaces
-      # ---------------------------
-      - name: Delete Namespaces
-        run: |
-          kubectl delete namespace ${{ vars.APP_NAMESPACE }} --ignore-not-found
-          kubectl delete namespace ${{ vars.MONITORING_NAMESPACE }} --ignore-not-found
-          kubectl delete namespace ${{ vars.ARGOCD_NAMESPACE }} --ignore-not-found
-          kubectl delete namespace ingress-nginx --ignore-not-found
-
-          # Delete Karpenter namespace last and force if needed
-          kubectl delete namespace ${{ vars.KARPENTER_NAMESPACE }} --ignore-not-found --timeout=60s || true
-          kubectl delete namespace ${{ vars.KARPENTER_NAMESPACE }} --force --grace-period=0 --ignore-not-found || true
-        continue-on-error: true
-
-      # ---------------------------
-      # Delete CRDs (Prometheus & Grafana)
+      # Delete CRDs (Prometheus & Grafana) before namespace deletion
       # ---------------------------
       - name: Delete Monitoring CRDs
         run: |
-          kubectl get crd -o name | grep -E 'prometheus|grafana|alertmanager|servicemonitor|prometheusrule' | xargs -r kubectl delete || true
+          echo "Deleting monitoring CRDs..."
+          kubectl get crd -o name | grep -E 'prometheus|grafana|alertmanager|servicemonitor|prometheusrule' | xargs -r kubectl delete --timeout=60s || true
+
+          echo "Deleting ArgoCD CRDs..."
+          kubectl get crd -o name | grep 'argoproj.io' | xargs -r kubectl delete --timeout=60s || true
         continue-on-error: true

       # ---------------------------
-      # Cleanup PVCs & PVs
+      # Cleanup PVCs & PVs before namespace deletion
       # ---------------------------
       - name: Cleanup Persistent Storage
         run: |
-          kubectl delete pvc --all -A || true
-          kubectl delete pv --all || true
+          echo "Deleting PVCs..."
+          kubectl delete pvc --all -A --timeout=120s || true
+          echo "Deleting PVs..."
+          kubectl delete pv --all --timeout=120s || true
         continue-on-error: true

       # ---------------------------
-      # Final cleanup verification
+      # Delete Namespaces with FORCE cleanup for stuck ones
       # ---------------------------
-      - name: Verify Karpenter cleanup
+      - name: Delete Namespaces with Force Cleanup
         run: |
-          echo "Verifying Karpenter cleanup..."
-          kubectl get pods -n ${{ vars.KARPENTER_NAMESPACE }} || echo "Karpenter namespace not found (expected)"
-          kubectl get crd | grep karpenter || echo "No Karpenter CRDs found (expected)"
-          kubectl get validatingwebhookconfiguration | grep karpenter || echo "No Karpenter webhooks found (expected)"
-          kubectl get mutatingwebhookconfiguration | grep karpenter || echo "No Karpenter webhooks found (expected)"
-          helm list -n ${{ vars.KARPENTER_NAMESPACE }} || echo "No Helm releases in karpenter namespace (expected)"
+          echo "Deleting namespaces with proper cleanup..."
+
+          # Function to force delete a namespace if it gets stuck
+          force_delete_namespace() {
+            local ns=$1
+            echo "Processing namespace: $ns"
+
+            if kubectl get namespace $ns --ignore-not-found 2>/dev/null; then
+              # Try normal deletion first with timeout
+              kubectl delete namespace $ns --ignore-not-found --timeout=120s || {
+                echo "Normal deletion failed for $ns, trying force deletion..."
+
+                # Remove finalizers and force delete
+                kubectl get namespace $ns -o json | \
+                  jq '.spec.finalizers = []' | \
+                  kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f - || true
+
+                # Wait a moment
+                sleep 10
+
+                # Verify deletion
+                if kubectl get namespace $ns --ignore-not-found 2>/dev/null; then
+                  echo "WARNING: Namespace $ns still exists after force deletion"
+                else
+                  echo "Successfully force deleted namespace $ns"
+                fi
+              }
+            else
+              echo "Namespace $ns does not exist"
+            fi
+          }
+
+          # Delete namespaces one by one with force cleanup
+          for ns in ${{ vars.APP_NAMESPACE }} ${{ vars.MONITORING_NAMESPACE }} ${{ vars.ARGOCD_NAMESPACE }} ingress-nginx ${{ vars.KARPENTER_NAMESPACE }}; do
+            force_delete_namespace $ns
+          done
+
+          echo "Final namespace check:"
+          kubectl get namespaces || true
         continue-on-error: true

       # ---------------------------
       # Wait for cleanup to complete
       # ---------------------------
-      - name: Wait for cleanup
-        run: sleep 30
+      - name: Wait for cleanup and verify
+        run: |
+          echo "Waiting for cleanup to complete..."
+          sleep 60
+
+          echo "Verifying cleanup..."
+          kubectl get pods -n ${{ vars.KARPENTER_NAMESPACE }} 2>/dev/null || echo "Karpenter namespace not found (expected)"
+          kubectl get crd | grep karpenter || echo "No Karpenter CRDs found (expected)"
+          kubectl get namespaces | grep -E "${{ vars.APP_NAMESPACE }}|${{ vars.MONITORING_NAMESPACE }}|${{ vars.ARGOCD_NAMESPACE }}" || echo "Target namespaces deleted (expected)"

       # ---------------------------
       # Terraform Destroy
@@ -267,4 +276,4 @@ jobs:
           gh variable delete KARPENTER_NAMESPACE --repo $GITHUB_REPOSITORY || true
         env:
           GITHUB_TOKEN: ${{ secrets.PAT_GITHUB }}
-        continue-on-error: true
+        continue-on-error: true
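
The force-delete logic added in the namespace step can be exercised locally against a disposable cluster before relying on it in CI. Below is a minimal standalone sketch of the same pattern (a normal delete bounded by a timeout, then clearing spec.finalizers through the namespace finalize subresource); the script name and argument handling are illustrative, not part of the workflow.

#!/usr/bin/env bash
# Sketch: force-delete a stuck namespace the same way the workflow step does.
# Usage: ./force-delete-ns.sh <namespace>   (script name is illustrative)
set -u
ns="${1:?usage: $0 <namespace>}"

if kubectl get namespace "$ns" >/dev/null 2>&1; then
  # Normal deletion first, bounded so a stuck namespace cannot hang the script
  kubectl delete namespace "$ns" --ignore-not-found --timeout=120s || {
    echo "Normal deletion failed for $ns, clearing finalizers..."
    # Empty spec.finalizers via the finalize subresource, then re-check
    kubectl get namespace "$ns" -o json \
      | jq '.spec.finalizers = []' \
      | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f - || true
    sleep 10
    kubectl get namespace "$ns" >/dev/null 2>&1 \
      && echo "WARNING: namespace $ns still exists" \
      || echo "Namespace $ns force deleted"
  }
else
  echo "Namespace $ns does not exist"
fi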
