Skip to content

Commit 1a1f6b4

Browse files
committed
Add enable-out-of-service-taint flag
1 parent e191577 commit 1a1f6b4

File tree

9 files changed

+107
-9
lines changed

9 files changed

+107
-9
lines changed

config/helm/aws-node-termination-handler/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ The configuration in this table applies to all AWS Node Termination Handler mode
9595
| `webhookTemplateConfigMapName` | Pass the webhook template file as a configmap. | `""` |
9696
| `webhookTemplateConfigMapKey` | Name of the Configmap key storing the template file. | `""` |
9797
| `enableSqsTerminationDraining` | If `true`, this turns on queue-processor mode which drains nodes when an SQS termination event is received. | `false` |
98+
| `enableOutOfServiceTaint` | If `true`, this adds the out-of-service taint to the node after the cordon/drain process, which forcefully evicts pods without matching tolerations and detaches persistent volumes. | `false` |
9899

99100
### Queue-Processor Mode Configuration
100101

config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ spec:
9999
value: {{ .Values.cordonOnly | quote }}
100100
- name: TAINT_NODE
101101
value: {{ .Values.taintNode | quote }}
102+
- name: ENABLE_OUT_OF_SERVICE_TAINT
103+
value: {{ .Values.enableOutOfServiceTaint | quote }}
102104
- name: EXCLUDE_FROM_LOAD_BALANCERS
103105
value: {{ .Values.excludeFromLoadBalancers | quote }}
104106
- name: DELETE_LOCAL_DATA

config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ spec:
9999
value: {{ .Values.cordonOnly | quote }}
100100
- name: TAINT_NODE
101101
value: {{ .Values.taintNode | quote }}
102+
- name: ENABLE_OUT_OF_SERVICE_TAINT
103+
value: {{ .Values.enableOutOfServiceTaint | quote }}
102104
- name: EXCLUDE_FROM_LOAD_BALANCERS
103105
value: {{ .Values.excludeFromLoadBalancers | quote }}
104106
- name: DELETE_LOCAL_DATA

config/helm/aws-node-termination-handler/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ spec:
102102
value: {{ .Values.cordonOnly | quote }}
103103
- name: TAINT_NODE
104104
value: {{ .Values.taintNode | quote }}
105+
- name: ENABLE_OUT_OF_SERVICE_TAINT
106+
value: {{ .Values.enableOutOfServiceTaint | quote }}
105107
- name: EXCLUDE_FROM_LOAD_BALANCERS
106108
value: {{ .Values.excludeFromLoadBalancers | quote }}
107109
- name: DELETE_LOCAL_DATA

pkg/config/config.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ const (
7777
taintNode = "TAINT_NODE"
7878
taintEffectDefault = "NoSchedule"
7979
taintEffect = "TAINT_EFFECT"
80+
enableOutOfServiceTaintConfigKey = "ENABLE_OUT_OF_SERVICE_TAINT"
81+
enableOutOfServiceTaintDefault = false
8082
excludeFromLoadBalancers = "EXCLUDE_FROM_LOAD_BALANCERS"
8183
jsonLoggingConfigKey = "JSON_LOGGING"
8284
jsonLoggingDefault = false
@@ -149,6 +151,7 @@ type Config struct {
149151
CordonOnly bool
150152
TaintNode bool
151153
TaintEffect string
154+
EnableOutOfServiceTaint bool
152155
ExcludeFromLoadBalancers bool
153156
JsonLogging bool
154157
LogLevel string
@@ -215,6 +218,7 @@ func ParseCliArgs() (config Config, err error) {
215218
flag.BoolVar(&config.CordonOnly, "cordon-only", getBoolEnv(cordonOnly, false), "If true, nodes will be cordoned but not drained when an interruption event occurs.")
216219
flag.BoolVar(&config.TaintNode, "taint-node", getBoolEnv(taintNode, false), "If true, nodes will be tainted when an interruption event occurs.")
217220
flag.StringVar(&config.TaintEffect, "taint-effect", getEnv(taintEffect, taintEffectDefault), "Sets the effect when a node is tainted.")
221+
flag.BoolVar(&config.EnableOutOfServiceTaint, "enable-out-of-service-taint", getBoolEnv(enableOutOfServiceTaintConfigKey, enableOutOfServiceTaintDefault), "If true, nodes will be tainted as out-of-service after we cordon/drain the nodes when an interruption event occurs.")
218222
flag.BoolVar(&config.ExcludeFromLoadBalancers, "exclude-from-load-balancers", getBoolEnv(excludeFromLoadBalancers, false), "If true, nodes will be marked for exclusion from load balancers when an interruption event occurs.")
219223
flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.")
220224
flag.StringVar(&config.LogLevel, "log-level", getEnv(logLevelConfigKey, logLevelDefault), "Sets the log level (INFO, DEBUG, or ERROR)")
@@ -344,6 +348,7 @@ func (c Config) PrintJsonConfigArgs() {
344348
Bool("cordon_only", c.CordonOnly).
345349
Bool("taint_node", c.TaintNode).
346350
Str("taint_effect", c.TaintEffect).
351+
Bool("enable_out_of_service_taint", c.EnableOutOfServiceTaint).
347352
Bool("exclude_from_load_balancers", c.ExcludeFromLoadBalancers).
348353
Bool("json_logging", c.JsonLogging).
349354
Str("log_level", c.LogLevel).
@@ -395,6 +400,7 @@ func (c Config) PrintHumanConfigArgs() {
395400
"\tcordon-only: %t,\n"+
396401
"\ttaint-node: %t,\n"+
397402
"\ttaint-effect: %s,\n"+
403+
"\tenable-out-of-service-taint: %t,\n"+
398404
"\texclude-from-load-balancers: %t,\n"+
399405
"\tjson-logging: %t,\n"+
400406
"\tlog-level: %s,\n"+
@@ -437,6 +443,7 @@ func (c Config) PrintHumanConfigArgs() {
437443
c.CordonOnly,
438444
c.TaintNode,
439445
c.TaintEffect,
446+
c.EnableOutOfServiceTaint,
440447
c.ExcludeFromLoadBalancers,
441448
c.JsonLogging,
442449
c.LogLevel,

pkg/interruptionevent/draincordon/handler.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,15 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error {
119119
if (err == nil || (!nodeFound && h.commonHandler.NthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil {
120120
h.commonHandler.RunPostDrainTask(nodeName, drainEvent)
121121
}
122+
123+
// Only add out-of-service taint if ENABLE_OUT_OF_SERVICE_TAINT flag is true, and CORDON_ONLY flag is false
124+
if err == nil && h.commonHandler.NthConfig.EnableOutOfServiceTaint && !h.commonHandler.NthConfig.CordonOnly {
125+
err = h.commonHandler.Node.TaintOutOfService(nodeName)
126+
if err != nil {
127+
return fmt.Errorf("cannot add out-of-service taint on node %s: %w", nodeName, err)
128+
}
129+
}
130+
122131
return nil
123132
}
124133

pkg/node/node.go

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ const (
5959
ASGLifecycleTerminationTaint = "aws-node-termination-handler/asg-lifecycle-termination"
6060
// RebalanceRecommendationTaint is a taint used to make spot instance unschedulable
6161
RebalanceRecommendationTaint = "aws-node-termination-handler/rebalance-recommendation"
62+
// OutOfServiceTaintKey, OutOfServiceTaintValue and OutOfServiceTaintEffectType describe the out-of-service taint, which is used to forcefully evict pods without matching tolerations and detach persistent volumes
63+
OutOfServiceTaintKey = "node.kubernetes.io/out-of-service"
64+
OutOfServiceTaintValue = "nodeshutdown"
65+
OutOfServiceTaintEffectType = "NoExecute"
6266

6367
maxTaintValueLength = 63
6468
daemonSet = "DaemonSet"
@@ -447,7 +451,7 @@ func (n Node) TaintSpotItn(nodeName string, eventID string) error {
447451
eventID = eventID[:maxTaintValueLength]
448452
}
449453

450-
return addTaint(k8sNode, n, SpotInterruptionTaint, eventID)
454+
return addTaint(k8sNode, n, SpotInterruptionTaint, eventID, n.nthConfig.TaintEffect)
451455
}
452456

453457
// TaintASGLifecycleTermination adds the ASG lifecycle termination taint onto a node
@@ -465,7 +469,7 @@ func (n Node) TaintASGLifecycleTermination(nodeName string, eventID string) erro
465469
eventID = eventID[:maxTaintValueLength]
466470
}
467471

468-
return addTaint(k8sNode, n, ASGLifecycleTerminationTaint, eventID)
472+
return addTaint(k8sNode, n, ASGLifecycleTerminationTaint, eventID, n.nthConfig.TaintEffect)
469473
}
470474

471475
// TaintRebalanceRecommendation adds the rebalance recommendation notice taint onto a node
@@ -483,7 +487,7 @@ func (n Node) TaintRebalanceRecommendation(nodeName string, eventID string) erro
483487
eventID = eventID[:maxTaintValueLength]
484488
}
485489

486-
return addTaint(k8sNode, n, RebalanceRecommendationTaint, eventID)
490+
return addTaint(k8sNode, n, RebalanceRecommendationTaint, eventID, n.nthConfig.TaintEffect)
487491
}
488492

489493
// LogPods logs all the pod names on a node
@@ -525,7 +529,21 @@ func (n Node) TaintScheduledMaintenance(nodeName string, eventID string) error {
525529
eventID = eventID[:maxTaintValueLength]
526530
}
527531

528-
return addTaint(k8sNode, n, ScheduledMaintenanceTaint, eventID)
532+
return addTaint(k8sNode, n, ScheduledMaintenanceTaint, eventID, n.nthConfig.TaintEffect)
533+
}
534+
535+
// TaintOutOfService adds the out-of-service taint (NoExecute) onto a node
536+
func (n Node) TaintOutOfService(nodeName string) error {
537+
if !n.nthConfig.EnableOutOfServiceTaint || n.nthConfig.CordonOnly {
538+
return nil
539+
}
540+
541+
k8sNode, err := n.fetchKubernetesNode(nodeName)
542+
if err != nil {
543+
return fmt.Errorf("Unable to fetch kubernetes node from API: %w", err)
544+
}
545+
546+
return addTaint(k8sNode, n, OutOfServiceTaintKey, OutOfServiceTaintValue, OutOfServiceTaintEffectType)
529547
}
530548

531549
// RemoveNTHTaints removes NTH-specific taints from a node
@@ -711,8 +729,8 @@ func getTaintEffect(effect string) corev1.TaintEffect {
711729
}
712730
}
713731

714-
func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string) error {
715-
effect := getTaintEffect(nth.nthConfig.TaintEffect)
732+
func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, effectType string) error {
733+
effect := getTaintEffect(effectType)
716734
if nth.nthConfig.DryRun {
717735
log.Info().Msgf("Would have added taint (%s=%s:%s) to node %s, but dry-run flag was set", taintKey, taintValue, effect, nth.nthConfig.NodeName)
718736
return nil

pkg/node/node_test.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ import (
3737
// Size of the fakeRecorder buffer
3838
const recorderBufferSize = 10
3939

40+
const outOfServiceTaintKey = "node.kubernetes.io/out-of-service"
41+
const outOfServiceTaintValue = "nodeshutdown"
42+
4043
var nodeName = "NAME"
4144

4245
func getDrainHelper(client *fake.Clientset) *drain.Helper {
@@ -418,3 +421,40 @@ func TestFilterOutDaemonSetPods(t *testing.T) {
418421
filteredMockPodList := tNode.FilterOutDaemonSetPods(mockPodList)
419422
h.Equals(t, 2, len(filteredMockPodList.Items))
420423
}
424+
425+
func TestTaintOutOfService(t *testing.T) {
426+
client := fake.NewSimpleClientset()
427+
_, err := client.CoreV1().Nodes().Create(
428+
context.Background(),
429+
&v1.Node{
430+
ObjectMeta: metav1.ObjectMeta{Name: nodeName},
431+
},
432+
metav1.CreateOptions{})
433+
h.Ok(t, err)
434+
435+
tNode, err := newNode(config.Config{EnableOutOfServiceTaint: true}, client)
436+
h.Ok(t, err)
437+
h.Equals(t, true, tNode.GetNthConfig().EnableOutOfServiceTaint)
438+
h.Equals(t, false, tNode.GetNthConfig().CordonOnly)
439+
440+
err = tNode.TaintOutOfService(nodeName)
441+
h.Ok(t, err)
442+
443+
updatedNode, err := client.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
444+
h.Ok(t, err)
445+
taintFound := false
446+
expectedTaint := v1.Taint{
447+
Key: outOfServiceTaintKey,
448+
Value: outOfServiceTaintValue,
449+
Effect: corev1.TaintEffectNoExecute,
450+
}
451+
for _, taint := range updatedNode.Spec.Taints {
452+
if taint.Key == expectedTaint.Key &&
453+
taint.Value == expectedTaint.Value &&
454+
taint.Effect == expectedTaint.Effect {
455+
taintFound = true
456+
break
457+
}
458+
}
459+
h.Equals(t, true, taintFound)
460+
}

test/e2e/spot-interruption-test

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ function fail_and_exit {
1717
exit "${1:-1}"
1818
}
1919

20+
function remove_out_of_service_taint {
21+
local node=$1
22+
echo "Removing out-of-service taint from node ${node}"
23+
kubectl taint nodes "${node}" node.kubernetes.io/out-of-service:NoExecute- || true
24+
}
25+
2026
echo "Starting Spot Interruption Test for Node Termination Handler"
2127

2228
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
@@ -37,6 +43,7 @@ anth_helm_args=(
3743
--set enableScheduledEventDraining="false"
3844
--set enableSpotInterruptionDraining="true"
3945
--set taintNode="true"
46+
--set enableOutOfServiceTaint="true"
4047
--set daemonsetTolerations=""
4148
--wait
4249
--force
@@ -110,6 +117,7 @@ fi
110117

111118
cordoned=0
112119
tainted=0
120+
outOfServiceTainted=0
113121
test_node=${TEST_NODE:-$CLUSTER_NAME-worker}
114122
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
115123
if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then
@@ -118,13 +126,19 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do
118126
fi
119127

120128
if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes "${test_node}" -o json | grep -q "aws-node-termination-handler/spot-itn" >/dev/null; then
121-
echo "✅ Verified the worked node was tainted!"
122-
tainted=1
129+
echo "✅ Verified the worked node was tainted!"
130+
tainted=1
131+
fi
132+
133+
if [[ $cordoned -eq 1 && $tainted -eq 1 ]] && kubectl get nodes "${test_node}" -o json | grep -q "node.kubernetes.io/out-of-service" >/dev/null; then
134+
echo "✅ Verified the worked node was tainted as out-of-service!"
135+
outOfServiceTainted=1
123136
fi
124137

125-
if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
138+
if [[ $tainted -eq 1 && $outOfServiceTainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
126139
echo "✅ Verified the regular-pod-test pod was evicted!"
127140
echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅"
141+
remove_out_of_service_taint "${test_node}"
128142
exit 0
129143
fi
130144
echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
@@ -135,8 +149,11 @@ if [[ $cordoned -eq 0 ]]; then
135149
echo "❌ Worker node was not cordoned"
136150
elif [[ $tainted -eq 0 ]]; then
137151
echo "❌ Worker node was not tainted"
152+
elif [[ $outOfServiceTainted -eq 0 ]]; then
153+
echo "❌ Worker node was not tainted as out-of-service"
138154
else
139155
echo "❌ regular-pod-test pod was not evicted"
140156
fi
141157

158+
remove_out_of_service_taint "${test_node}"
142159
fail_and_exit 1

0 commit comments

Comments
 (0)