Commit f96aa68
Refactor Karpenter cleanup steps in destroy workflow for clarity and legacy support; enhance monitoring workflow with ingress wait and endpoint printing; update Terraform variables for node instance types and dependencies in VPC module.
Parent: 13002fc

File tree: 8 files changed (+185, −105 lines)

.github/workflows/destroy.yml

Lines changed: 45 additions & 38 deletions
@@ -124,66 +124,73 @@ jobs:
       # ==================================================
       # PHASE 3: CLEAN UP KARPENTER RESOURCES
       # ==================================================
-      - name: Delete Karpenter Resources
+      - name: Delete Karpenter Resources (Legacy API)
         run: |
-          echo "🚗 Cleaning up Karpenter resources..."
+          echo "Cleaning up Karpenter resources (legacy API versions)..."

-          # Delete Karpenter custom resources first
-          echo "Deleting Karpenter NodePools and EC2NodeClasses..."
-          kubectl delete nodepool --all --ignore-not-found --timeout=60s || true
-          kubectl delete ec2nodeclass --all --ignore-not-found --timeout=60s || true
+          # Delete LEGACY Karpenter Provisioners (v1alpha5) - THIS IS WHAT YOU'RE USING
+          echo "Deleting legacy Karpenter Provisioners (v1alpha5)..."
+          kubectl delete provisioner ${{ vars.KARPENTER_NODEPOOL_NAME }} --ignore-not-found --timeout=60s || true
+          kubectl delete provisioner --all --ignore-not-found --timeout=60s || true

-          # Delete legacy Karpenter resources
-          echo "Deleting legacy Karpenter Provisioners and AWSNodeTemplates..."
-          kubectl delete provisioner ${{ vars.KARPENTER_NODEPOOL_NAME }} --ignore-not-found -n ${{ vars.KARPENTER_NAMESPACE }} --timeout=60s || true
-          kubectl delete provisioner --all -n ${{ vars.KARPENTER_NAMESPACE }} --ignore-not-found --timeout=60s || true
-          kubectl delete awsnodetemplate ${{ vars.KARPENTER_NODECLASS_NAME }} --ignore-not-found -n ${{ vars.KARPENTER_NAMESPACE }} --timeout=60s || true
-          kubectl delete awsnodetemplate --all -n ${{ vars.KARPENTER_NAMESPACE }} --ignore-not-found --timeout=60s || true
+          # Delete LEGACY AWSNodeTemplates (v1alpha1) - THIS IS WHAT YOU'RE USING
+          echo "Deleting legacy AWSNodeTemplates (v1alpha1)..."
+          kubectl delete awsnodetemplate ${{ vars.KARPENTER_NODECLASS_NAME }} --ignore-not-found --timeout=60s || true
+          kubectl delete awsnodetemplate --all --ignore-not-found --timeout=60s || true

-          echo "⏳ Waiting for Karpenter resources to be cleaned up..."
-          sleep 30
-          echo "✅ Karpenter resources cleanup completed"
+          # Remove finalizers from legacy resources if they're stuck
+          echo "Removing finalizers from stuck Provisioners..."
+          kubectl get provisioner -o name | xargs -r -I {} kubectl patch {} -p '{"metadata":{"finalizers":[]}}' --type=merge || true
+
+          echo "Removing finalizers from stuck AWSNodeTemplates..."
+          kubectl get awsnodetemplate -o name | xargs -r -I {} kubectl patch {} -p '{"metadata":{"finalizers":[]}}' --type=merge || true
+
+          echo "Waiting for Karpenter resources to be cleaned up..."
+          sleep 60
+          echo "Karpenter resources cleanup completed"
         continue-on-error: true

-      - name: Uninstall Karpenter Helm Release
+      - name: Force Delete Karpenter Nodes from Kubernetes
         run: |
-          echo "📦 Uninstalling Karpenter Helm release..."
-          helm uninstall karpenter -n ${{ vars.KARPENTER_NAMESPACE }} --timeout=300s || true
+          echo "Removing Karpenter-managed nodes from Kubernetes cluster..."

-          echo "⏳ Waiting for Karpenter pods to terminate..."
-          kubectl wait --for=delete pod -l app.kubernetes.io/name=karpenter -n ${{ vars.KARPENTER_NAMESPACE }} --timeout=120s || true
+          # Delete nodes that have Karpenter labels
+          kubectl get nodes -l karpenter.sh/provisioner-name --no-headers -o custom-columns=":metadata.name" | xargs -r kubectl delete node --timeout=60s || true
+          kubectl get nodes -l karpenter.sh/cluster=${{ vars.CLUSTER_NAME }} --no-headers -o custom-columns=":metadata.name" | xargs -r kubectl delete node --timeout=60s || true

-          echo "💥 Force deleting any remaining Karpenter pods..."
-          kubectl delete pods --all -n ${{ vars.KARPENTER_NAMESPACE }} --force --grace-period=0 || true
-          echo "✅ Karpenter Helm release uninstalled"
+          # Also try generic Karpenter node labels
+          kubectl get nodes -l node.kubernetes.io/instance-type --no-headers -o custom-columns=":metadata.name" | while read node; do
+            if kubectl describe node $node | grep -q "karpenter"; then
+              echo "Deleting Karpenter node: $node"
+              kubectl delete node $node --timeout=60s || true
+            fi
+          done || true
+
+          echo "Node cleanup completed"
         continue-on-error: true

-      - name: Clean up Karpenter CRDs and Webhooks
+      - name: Clean up Karpenter CRDs and Webhooks (Kubectl Only)
         run: |
-          echo "🧹 Cleaning up Karpenter CRDs and webhooks..."
-
-          # Delete Karpenter CRDs
-          echo "Deleting Karpenter CRDs..."
-          kubectl delete crd provisioners.karpenter.sh --ignore-not-found --timeout=60s || true
-          kubectl delete crd awsnodetemplates.karpenter.k8s.aws --ignore-not-found --timeout=60s || true
-          kubectl delete crd nodepools.karpenter.sh --ignore-not-found --timeout=60s || true
-          kubectl delete crd ec2nodeclasses.karpenter.k8s.aws --ignore-not-found --timeout=60s || true
+          echo "Cleaning up Karpenter CRDs and webhooks..."

-          # Delete Karpenter webhooks
+          # Delete Karpenter webhooks first (this is critical)
           echo "Deleting Karpenter webhooks..."
-          kubectl delete validatingwebhookconfiguration defaulting.webhook.karpenter.sh --ignore-not-found || true
           kubectl delete validatingwebhookconfiguration validation.webhook.karpenter.sh --ignore-not-found || true
+          kubectl delete validatingwebhookconfiguration defaulting.webhook.karpenter.sh --ignore-not-found || true
           kubectl delete mutatingwebhookconfiguration defaulting.webhook.karpenter.sh --ignore-not-found || true

+          # Delete LEGACY Karpenter CRDs (what you're actually using)
+          echo "Deleting LEGACY Karpenter CRDs..."
+          kubectl delete crd provisioners.karpenter.sh --ignore-not-found --timeout=60s || true
+          kubectl delete crd awsnodetemplates.karpenter.k8s.aws --ignore-not-found --timeout=60s || true
+
           # Remove finalizers from stuck CRDs
           echo "Removing finalizers from stuck Karpenter CRDs..."
           kubectl patch crd provisioners.karpenter.sh -p '{"metadata":{"finalizers":[]}}' --type=merge || true
           kubectl patch crd awsnodetemplates.karpenter.k8s.aws -p '{"metadata":{"finalizers":[]}}' --type=merge || true
-          kubectl patch crd nodepools.karpenter.sh -p '{"metadata":{"finalizers":[]}}' --type=merge || true
-          kubectl patch crd ec2nodeclasses.karpenter.k8s.aws -p '{"metadata":{"finalizers":[]}}' --type=merge || true
-          echo "✅ Karpenter CRDs and webhooks cleanup completed"
+
+          echo "Karpenter CRDs and webhooks cleanup completed"
         continue-on-error: true
-
       # ==================================================
       # PHASE 4: UNINSTALL HELM RELEASES
       # ==================================================
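
Note: every cleanup step above runs with continue-on-error, so a failed delete can slip through silently. A minimal post-cleanup check, assuming kubectl is still pointed at the cluster (a sketch, not part of the workflow):

    # Verify the legacy Karpenter objects, CRDs, and nodes are really gone
    echo "Remaining Karpenter CRDs (expect none):"
    kubectl get crd | grep -i karpenter || echo "  none"

    echo "Remaining legacy Provisioners / AWSNodeTemplates (errors here mean the CRDs are gone):"
    kubectl get provisioner 2>/dev/null || true
    kubectl get awsnodetemplate 2>/dev/null || true

    echo "Remaining Karpenter-labelled nodes (expect none):"
    kubectl get nodes -l karpenter.sh/provisioner-name --no-headers 2>/dev/null || true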

.github/workflows/monitoring.yml

Lines changed: 30 additions & 1 deletion
@@ -93,4 +93,33 @@ jobs:
           export ARGOCD_NAMESPACE=${{ inputs.argocd_namespace }}
           envsubst < ./argocd/monitoring.yml | \
             sed "s/monitoring\.yourdomain\.com/${{ steps.nginx-lb.outputs.nginx_hostname }}/g" | \
-            kubectl apply -f -
+            kubectl apply -f -
+
+      - name: Wait for Monitoring Ingress and Print Endpoints
+        run: |
+          # Wait for monitoring ingress
+          MONITORING_INGRESS=""
+          for i in {1..30}; do
+            MONITORING_INGRESS=$(kubectl get ingress -n ${{ inputs.monitoring_namespace }} -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}' 2>/dev/null)
+            if [[ ! -z "$MONITORING_INGRESS" ]]; then
+              echo "Monitoring ingress is ready: $MONITORING_INGRESS"
+              break
+            else
+              echo "Waiting for monitoring ingress to be ready ($i/30)..."
+              sleep 10
+            fi
+          done
+
+          # Print all service endpoints
+          ARGOCD_HOST=$(kubectl get svc argocd-server -n ${{ inputs.argocd_namespace }} -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo 'Not found')
+          APP_HOST=$(kubectl get svc ${{ inputs.app_name }}-svc -n ${{ inputs.app_namespace }} -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo 'Not found')
+          echo "🚀 ArgoCD: http://$ARGOCD_HOST"
+          echo "🌟 App: http://$APP_HOST"
+          if [[ -z "$MONITORING_INGRESS" ]]; then
+            echo "⚠️ Monitoring Ingress not ready"
+          else
+            echo "📊 Monitoring Services:"
+            echo "Prometheus: http://$MONITORING_INGRESS/prometheus"
+            echo "Grafana: http://$MONITORING_INGRESS/grafana"
+            echo "Alertmanager: http://$MONITORING_INGRESS/alertmanager"
+          fi
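
Note: once the endpoints are printed, a quick smoke test can confirm the paths actually route through the ingress. A sketch, with a placeholder hostname standing in for the value echoed by the step above:

    MONITORING_INGRESS="example-nlb.elb.amazonaws.com"   # placeholder; use the hostname printed by the workflow

    for path in prometheus grafana alertmanager; do
      # print only the HTTP status code returned for each sub-path
      code=$(curl -s -o /dev/null -w "%{http_code}" "http://$MONITORING_INGRESS/$path")
      echo "/$path -> HTTP $code"
    done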

Terraform/4-variables.tf

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ variable "node_groups" {

   default = {
     default = {
-      instance_types = ["t3.micro"]
+      instance_types = ["t3.medium"]
       capacity_type  = "ON_DEMAND"
       scaling_config = {
         desired_size = 1
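
Note: bumping the default from t3.micro to t3.medium changes the managed node group for anyone relying on the default map. A quick way to see the impact, assuming the root module instantiates ./modules/eks as "eks" (the resource address below is an assumption):

    # Preview only the node-group change driven by the new default instance type
    terraform plan -target='module.eks.aws_eks_node_group.main["default"]'

    # After apply, confirm the instance type recorded in state
    terraform state show 'module.eks.aws_eks_node_group.main["default"]' | grep -i instance_types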

Terraform/modules/eks/main.tf

Lines changed: 59 additions & 40 deletions
@@ -1,38 +1,4 @@
-resource "aws_eks_cluster" "main" {
-  name     = var.cluster_name
-  version  = var.cluster_version
-  role_arn = aws_iam_role.cluster.arn
-
-  access_config {
-    authentication_mode                         = "API_AND_CONFIG_MAP"
-    bootstrap_cluster_creator_admin_permissions = true
-  }
-
-  vpc_config {
-    subnet_ids = var.cluster_subnet_ids # Changed from var.subnet_ids
-  }
-
-  depends_on = [
-    aws_iam_role_policy_attachment.cluster_policy
-  ]
-
-  tags = {
-    "karpenter.sh/discovery" = var.cluster_name
-    "Name"                   = var.cluster_name
-  }
-}
-
-resource "aws_ec2_tag" "cluster_sg_karpenter" {
-  resource_id = aws_eks_cluster.main.vpc_config[0].cluster_security_group_id
-  key         = "karpenter.sh/discovery"
-  value       = var.cluster_name
-}
-# EKS OIDC Identity Provider
-resource "aws_iam_openid_connect_provider" "eks" {
-  client_id_list  = ["sts.amazonaws.com"]
-  thumbprint_list = ["9e99a48a9960b14926bb7f3b02e22da2b0ab7280"]
-  url             = aws_eks_cluster.main.identity[0].oidc[0].issuer
-}
+data "aws_caller_identity" "current" {}

 resource "aws_iam_role" "cluster" {
   name = "${var.cluster_name}-cluster-role"
@@ -56,10 +22,10 @@ resource "aws_iam_role" "cluster" {
 resource "aws_iam_role_policy_attachment" "cluster_policy" {
   policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
   role       = aws_iam_role.cluster.name
+
+  depends_on = [aws_iam_role.cluster]
 }

-
-
 resource "aws_iam_role" "node" {
   name = "${var.cluster_name}-node-role"
   assume_role_policy = jsonencode({
@@ -86,16 +52,59 @@ resource "aws_iam_role_policy_attachment" "node_policy" {

   policy_arn = each.value
   role       = aws_iam_role.node.name
+
+  depends_on = [aws_iam_role.node]
+}
+
+resource "aws_eks_cluster" "main" {
+  name     = var.cluster_name
+  version  = var.cluster_version
+  role_arn = aws_iam_role.cluster.arn
+
+  access_config {
+    authentication_mode                         = "API_AND_CONFIG_MAP"
+    bootstrap_cluster_creator_admin_permissions = true
+  }
+
+  vpc_config {
+    subnet_ids = var.cluster_subnet_ids # Changed from var.subnet_ids
+  }
+
+  depends_on = [
+    aws_iam_role.cluster,
+    aws_iam_role_policy_attachment.cluster_policy
+  ]
+
+  tags = {
+    "karpenter.sh/discovery" = var.cluster_name
+    "Name"                   = var.cluster_name
+  }
 }

+# EKS OIDC Identity Provider
+resource "aws_iam_openid_connect_provider" "eks" {
+  client_id_list  = ["sts.amazonaws.com"]
+  thumbprint_list = ["9e99a48a9960b14926bb7f3b02e22da2b0ab7280"]
+  url             = aws_eks_cluster.main.identity[0].oidc[0].issuer
+
+  depends_on = [aws_eks_cluster.main]
+}

+resource "aws_ec2_tag" "cluster_sg_karpenter" {
+  resource_id = aws_eks_cluster.main.vpc_config[0].cluster_security_group_id
+  key         = "karpenter.sh/discovery"
+  value       = var.cluster_name
+
+  depends_on = [aws_eks_cluster.main]
+}

 resource "aws_eks_node_group" "main" {
   for_each        = var.node_groups
   cluster_name    = aws_eks_cluster.main.name
   node_group_name = each.key
   node_role_arn   = aws_iam_role.node.arn
   subnet_ids      = var.node_subnet_ids # Changed from var.subnet_ids
+
   scaling_config {
     desired_size = each.value.scaling_config.desired_size
     max_size     = each.value.scaling_config.max_size
@@ -107,14 +116,18 @@ resource "aws_eks_node_group" "main" {
   }

   depends_on = [
+    aws_eks_cluster.main,
+    aws_iam_role.node,
     aws_iam_role_policy_attachment.node_policy
   ]
 }

 # Karpenter Node Instance Profile (reuse existing node role)
 resource "aws_iam_instance_profile" "karpenter_node" {
-  name = "KarpenterNodeInstanceProfile"
+  name = "KarpenterNodeInstanceProfile-${var.cluster_name}"
   role = aws_iam_role.node.name
+
+  depends_on = [aws_iam_role.node]
 }

 # Karpenter Controller IAM Role
@@ -139,7 +152,13 @@ resource "aws_iam_role" "karpenter_controller" {
       }
     ]
   })
+
+  depends_on = [
+    aws_eks_cluster.main,
+    aws_iam_openid_connect_provider.eks
+  ]
 }
+
 resource "aws_iam_role_policy" "karpenter_controller" {
   name = "KarpenterControllerPolicy"
   role = aws_iam_role.karpenter_controller.id
@@ -193,12 +212,12 @@ resource "aws_iam_role_policy" "karpenter_controller" {
     }
     ]
   })
+
+  depends_on = [aws_iam_role.karpenter_controller]
 }

 # SQS Queue for Spot Interruption Notifications
 resource "aws_sqs_queue" "karpenter_interruption" {
   name                      = "karpenter-interruption-queue-${var.cluster_name}"
   message_retention_seconds = 300
 }
-
-data "aws_caller_identity" "current" {}
