Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/helm/aws-node-termination-handler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ The configuration in this table applies to all AWS Node Termination Handler mode
| `webhookTemplateConfigMapName` | Pass the webhook template file as a configmap. | "``" |
| `webhookTemplateConfigMapKey` | Name of the Configmap key storing the template file. | `""` |
| `enableSqsTerminationDraining` | If `true`, this turns on queue-processor mode which drains nodes when an SQS termination event is received. | `false` |
| `enableOutOfServiceTaint` | If `true`, this will add an out-of-service taint to the node after the cordon/drain process, which forcefully evicts pods without matching tolerations and detaches persistent volumes. | `false` |

### Queue-Processor Mode Configuration

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ spec:
value: {{ .Values.cordonOnly | quote }}
- name: TAINT_NODE
value: {{ .Values.taintNode | quote }}
- name: ENABLE_OUT_OF_SERVICE_TAINT
value: {{ .Values.enableOutOfServiceTaint | quote }}
- name: EXCLUDE_FROM_LOAD_BALANCERS
value: {{ .Values.excludeFromLoadBalancers | quote }}
- name: DELETE_LOCAL_DATA
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ spec:
value: {{ .Values.cordonOnly | quote }}
- name: TAINT_NODE
value: {{ .Values.taintNode | quote }}
- name: ENABLE_OUT_OF_SERVICE_TAINT
value: {{ .Values.enableOutOfServiceTaint | quote }}
- name: EXCLUDE_FROM_LOAD_BALANCERS
value: {{ .Values.excludeFromLoadBalancers | quote }}
- name: DELETE_LOCAL_DATA
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ spec:
value: {{ .Values.cordonOnly | quote }}
- name: TAINT_NODE
value: {{ .Values.taintNode | quote }}
- name: ENABLE_OUT_OF_SERVICE_TAINT
value: {{ .Values.enableOutOfServiceTaint | quote }}
- name: EXCLUDE_FROM_LOAD_BALANCERS
value: {{ .Values.excludeFromLoadBalancers | quote }}
- name: DELETE_LOCAL_DATA
Expand Down
7 changes: 7 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ const (
taintNode = "TAINT_NODE"
taintEffectDefault = "NoSchedule"
taintEffect = "TAINT_EFFECT"
enableOutOfServiceTaintConfigKey = "ENABLE_OUT_OF_SERVICE_TAINT"
enableOutOfServiceTaintDefault = false
excludeFromLoadBalancers = "EXCLUDE_FROM_LOAD_BALANCERS"
jsonLoggingConfigKey = "JSON_LOGGING"
jsonLoggingDefault = false
Expand Down Expand Up @@ -149,6 +151,7 @@ type Config struct {
CordonOnly bool
TaintNode bool
TaintEffect string
EnableOutOfServiceTaint bool
ExcludeFromLoadBalancers bool
JsonLogging bool
LogLevel string
Expand Down Expand Up @@ -215,6 +218,7 @@ func ParseCliArgs() (config Config, err error) {
flag.BoolVar(&config.CordonOnly, "cordon-only", getBoolEnv(cordonOnly, false), "If true, nodes will be cordoned but not drained when an interruption event occurs.")
flag.BoolVar(&config.TaintNode, "taint-node", getBoolEnv(taintNode, false), "If true, nodes will be tainted when an interruption event occurs.")
flag.StringVar(&config.TaintEffect, "taint-effect", getEnv(taintEffect, taintEffectDefault), "Sets the effect when a node is tainted.")
flag.BoolVar(&config.EnableOutOfServiceTaint, "enable-out-of-service-taint", getBoolEnv(enableOutOfServiceTaintConfigKey, enableOutOfServiceTaintDefault), "If true, nodes will be tainted as out-of-service after we cordon/drain the nodes when an interruption event occurs.")
flag.BoolVar(&config.ExcludeFromLoadBalancers, "exclude-from-load-balancers", getBoolEnv(excludeFromLoadBalancers, false), "If true, nodes will be marked for exclusion from load balancers when an interruption event occurs.")
flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.")
flag.StringVar(&config.LogLevel, "log-level", getEnv(logLevelConfigKey, logLevelDefault), "Sets the log level (INFO, DEBUG, or ERROR)")
Expand Down Expand Up @@ -344,6 +348,7 @@ func (c Config) PrintJsonConfigArgs() {
Bool("cordon_only", c.CordonOnly).
Bool("taint_node", c.TaintNode).
Str("taint_effect", c.TaintEffect).
Bool("enable_out_of_service_taint", c.EnableOutOfServiceTaint).
Bool("exclude_from_load_balancers", c.ExcludeFromLoadBalancers).
Bool("json_logging", c.JsonLogging).
Str("log_level", c.LogLevel).
Expand Down Expand Up @@ -395,6 +400,7 @@ func (c Config) PrintHumanConfigArgs() {
"\tcordon-only: %t,\n"+
"\ttaint-node: %t,\n"+
"\ttaint-effect: %s,\n"+
"\tenable-out-of-service-taint: %t,\n"+
"\texclude-from-load-balancers: %t,\n"+
"\tjson-logging: %t,\n"+
"\tlog-level: %s,\n"+
Expand Down Expand Up @@ -437,6 +443,7 @@ func (c Config) PrintHumanConfigArgs() {
c.CordonOnly,
c.TaintNode,
c.TaintEffect,
c.EnableOutOfServiceTaint,
c.ExcludeFromLoadBalancers,
c.JsonLogging,
c.LogLevel,
Expand Down
9 changes: 9 additions & 0 deletions pkg/interruptionevent/draincordon/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,15 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error {
if (err == nil || (!nodeFound && h.commonHandler.NthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil {
h.commonHandler.RunPostDrainTask(nodeName, drainEvent)
}

// Only add out-of-service taint if ENABLE_OUT_OF_SERVICE_TAINT flag is true, and CORDON_ONLY flag is false
if err == nil && h.commonHandler.NthConfig.EnableOutOfServiceTaint && !h.commonHandler.NthConfig.CordonOnly {
err = h.commonHandler.Node.TaintOutOfService(nodeName)
if err != nil {
return fmt.Errorf("cannot add out-of-service taint on node %s: %w", nodeName, err)
}
}

return nil
}

Expand Down
30 changes: 24 additions & 6 deletions pkg/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ const (
ASGLifecycleTerminationTaint = "aws-node-termination-handler/asg-lifecycle-termination"
// RebalanceRecommendationTaint is a taint used to make spot instance unschedulable
RebalanceRecommendationTaint = "aws-node-termination-handler/rebalance-recommendation"
// OutOfServiceTaintKey is the key of the out-of-service taint used to forcefully evict pods without matching tolerations and detach persistent volumes
OutOfServiceTaintKey = "node.kubernetes.io/out-of-service"
OutOfServiceTaintValue = "nodeshutdown"
OutOfServiceTaintEffectType = "NoExecute"

maxTaintValueLength = 63
daemonSet = "DaemonSet"
Expand Down Expand Up @@ -447,7 +451,7 @@ func (n Node) TaintSpotItn(nodeName string, eventID string) error {
eventID = eventID[:maxTaintValueLength]
}

return addTaint(k8sNode, n, SpotInterruptionTaint, eventID)
return addTaint(k8sNode, n, SpotInterruptionTaint, eventID, n.nthConfig.TaintEffect)
}

// TaintASGLifecycleTermination adds the spot termination notice taint onto a node
Expand All @@ -465,7 +469,7 @@ func (n Node) TaintASGLifecycleTermination(nodeName string, eventID string) erro
eventID = eventID[:maxTaintValueLength]
}

return addTaint(k8sNode, n, ASGLifecycleTerminationTaint, eventID)
return addTaint(k8sNode, n, ASGLifecycleTerminationTaint, eventID, n.nthConfig.TaintEffect)
}

// TaintRebalanceRecommendation adds the rebalance recommendation notice taint onto a node
Expand All @@ -483,7 +487,7 @@ func (n Node) TaintRebalanceRecommendation(nodeName string, eventID string) erro
eventID = eventID[:maxTaintValueLength]
}

return addTaint(k8sNode, n, RebalanceRecommendationTaint, eventID)
return addTaint(k8sNode, n, RebalanceRecommendationTaint, eventID, n.nthConfig.TaintEffect)
}

// LogPods logs all the pod names on a node
Expand Down Expand Up @@ -525,7 +529,21 @@ func (n Node) TaintScheduledMaintenance(nodeName string, eventID string) error {
eventID = eventID[:maxTaintValueLength]
}

return addTaint(k8sNode, n, ScheduledMaintenanceTaint, eventID)
return addTaint(k8sNode, n, ScheduledMaintenanceTaint, eventID, n.nthConfig.TaintEffect)
}

// TaintOutOfService adds the out-of-service taint (NoExecute) onto a node.
// The taint key/value/effect are the Kubernetes-defined
// node.kubernetes.io/out-of-service=nodeshutdown:NoExecute used by the
// non-graceful node shutdown feature to force-evict pods without matching
// tolerations and detach their persistent volumes.
func (n Node) TaintOutOfService(nodeName string) error {
	// Defensive no-op: callers are expected to gate on these flags already,
	// but re-check here so the taint is never applied when the feature is
	// disabled or only a cordon (no drain) was requested.
	if n.nthConfig.CordonOnly || !n.nthConfig.EnableOutOfServiceTaint {
		return nil
	}

	k8sNode, err := n.fetchKubernetesNode(nodeName)
	if err != nil {
		return fmt.Errorf("Unable to fetch kubernetes node from API: %w", err)
	}

	return addTaint(k8sNode, n, OutOfServiceTaintKey, OutOfServiceTaintValue, OutOfServiceTaintEffectType)
}

// RemoveNTHTaints removes NTH-specific taints from a node
Expand Down Expand Up @@ -711,8 +729,8 @@ func getTaintEffect(effect string) corev1.TaintEffect {
}
}

func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string) error {
effect := getTaintEffect(nth.nthConfig.TaintEffect)
func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, effectType string) error {
effect := getTaintEffect(effectType)
if nth.nthConfig.DryRun {
log.Info().Msgf("Would have added taint (%s=%s:%s) to node %s, but dry-run flag was set", taintKey, taintValue, effect, nth.nthConfig.NodeName)
return nil
Expand Down
40 changes: 40 additions & 0 deletions pkg/node/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ import (
// Size of the fakeRecorder buffer
const recorderBufferSize = 10

const outOfServiceTaintKey = "node.kubernetes.io/out-of-service"
const outOfServiceTaintValue = "nodeshutdown"

var nodeName = "NAME"

func getDrainHelper(client *fake.Clientset) *drain.Helper {
Expand Down Expand Up @@ -418,3 +421,40 @@ func TestFilterOutDaemonSetPods(t *testing.T) {
filteredMockPodList := tNode.FilterOutDaemonSetPods(mockPodList)
h.Equals(t, 2, len(filteredMockPodList.Items))
}

// TestTaintOutOfService verifies that TaintOutOfService applies the
// node.kubernetes.io/out-of-service=nodeshutdown:NoExecute taint to an
// existing node when EnableOutOfServiceTaint is true and CordonOnly is false.
func TestTaintOutOfService(t *testing.T) {
	// Seed the fake clientset with a bare node for the taint to land on.
	client := fake.NewSimpleClientset()
	_, err := client.CoreV1().Nodes().Create(
		context.Background(),
		&v1.Node{
			ObjectMeta: metav1.ObjectMeta{Name: nodeName},
		},
		metav1.CreateOptions{})
	h.Ok(t, err)

	// CordonOnly is left at its zero value (false); TaintOutOfService is a
	// no-op when CordonOnly is set, so assert both preconditions explicitly.
	tNode, err := newNode(config.Config{EnableOutOfServiceTaint: true}, client)
	h.Ok(t, err)
	h.Equals(t, true, tNode.GetNthConfig().EnableOutOfServiceTaint)
	h.Equals(t, false, tNode.GetNthConfig().CordonOnly)

	err = tNode.TaintOutOfService(nodeName)
	h.Ok(t, err)

	// Re-fetch the node and confirm the exact key/value/effect were applied.
	updatedNode, err := client.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
	h.Ok(t, err)
	taintFound := false
	expectedTaint := v1.Taint{
		Key:    outOfServiceTaintKey,
		Value:  outOfServiceTaintValue,
		Effect: corev1.TaintEffectNoExecute,
	}
	for _, taint := range updatedNode.Spec.Taints {
		if taint.Key == expectedTaint.Key &&
			taint.Value == expectedTaint.Value &&
			taint.Effect == expectedTaint.Effect {
			taintFound = true
			break
		}
	}
	h.Equals(t, true, taintFound)
}
23 changes: 20 additions & 3 deletions test/e2e/spot-interruption-test
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ function fail_and_exit {
exit "${1:-1}"
}

# Removes the out-of-service NoExecute taint from the given node so the test
# cluster is left clean regardless of test outcome. The `|| true` makes this
# best-effort: the taint may already be absent (e.g. the test failed before
# tainting), and cleanup must not change the script's exit status.
function remove_out_of_service_taint {
  local node=$1
  echo "Removing out-of-service taint from node ${node}"
  kubectl taint nodes "${node}" node.kubernetes.io/out-of-service:NoExecute- || true
}

echo "Starting Spot Interruption Test for Node Termination Handler"

SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
Expand All @@ -37,6 +43,7 @@ anth_helm_args=(
--set enableScheduledEventDraining="false"
--set enableSpotInterruptionDraining="true"
--set taintNode="true"
--set enableOutOfServiceTaint="true"
--set daemonsetTolerations=""
--wait
--force
Expand Down Expand Up @@ -110,6 +117,7 @@ fi

cordoned=0
tainted=0
outOfServiceTainted=0
test_node=${TEST_NODE:-$CLUSTER_NAME-worker}
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then
Expand All @@ -118,13 +126,19 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do
fi

if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes "${test_node}" -o json | grep -q "aws-node-termination-handler/spot-itn" >/dev/null; then
echo "✅ Verified the worked node was tainted!"
tainted=1
echo "✅ Verified the worker node was tainted!"
tainted=1
fi

if [[ $cordoned -eq 1 && $tainted -eq 1 ]] && kubectl get nodes "${test_node}" -o json | grep -q "node.kubernetes.io/out-of-service" >/dev/null; then
echo "✅ Verified the worker node was tainted as out-of-service!"
outOfServiceTainted=1
fi

if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
if [[ $tainted -eq 1 && $outOfServiceTainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
echo "✅ Verified the regular-pod-test pod was evicted!"
echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅"
remove_out_of_service_taint "${test_node}"
exit 0
fi
echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
Expand All @@ -135,8 +149,11 @@ if [[ $cordoned -eq 0 ]]; then
echo "❌ Worker node was not cordoned"
elif [[ $tainted -eq 0 ]]; then
echo "❌ Worker node was not tainted"
elif [[ $outOfServiceTainted -eq 0 ]]; then
echo "❌ Worker node was not tainted as out-of-service"
else
echo "❌ regular-pod-test pod was not evicted"
fi

remove_out_of_service_taint "${test_node}"
fail_and_exit 1
Loading