Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/helm/aws-node-termination-handler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ The configuration in this table applies to all AWS Node Termination Handler mode
| `webhookTemplateConfigMapName` | Pass the webhook template file as a configmap. | "``" |
| `webhookTemplateConfigMapKey` | Name of the Configmap key storing the template file. | `""` |
| `enableSqsTerminationDraining` | If `true`, this turns on queue-processor mode which drains nodes when an SQS termination event is received. | `false` |
| `enableOutOfServiceTaint` | If `true`, this will add the out-of-service taint to the node after the cordon/drain process, which forcefully evicts pods without matching tolerations and detaches persistent volumes. | `false` |

### Queue-Processor Mode Configuration

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ spec:
value: {{ .Values.cordonOnly | quote }}
- name: TAINT_NODE
value: {{ .Values.taintNode | quote }}
- name: ENABLE_OUT_OF_SERVICE_TAINT
value: {{ .Values.enableOutOfServiceTaint | quote }}
- name: EXCLUDE_FROM_LOAD_BALANCERS
value: {{ .Values.excludeFromLoadBalancers | quote }}
- name: DELETE_LOCAL_DATA
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ spec:
value: {{ .Values.cordonOnly | quote }}
- name: TAINT_NODE
value: {{ .Values.taintNode | quote }}
- name: ENABLE_OUT_OF_SERVICE_TAINT
value: {{ .Values.enableOutOfServiceTaint | quote }}
- name: EXCLUDE_FROM_LOAD_BALANCERS
value: {{ .Values.excludeFromLoadBalancers | quote }}
- name: DELETE_LOCAL_DATA
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ spec:
value: {{ .Values.cordonOnly | quote }}
- name: TAINT_NODE
value: {{ .Values.taintNode | quote }}
- name: ENABLE_OUT_OF_SERVICE_TAINT
value: {{ .Values.enableOutOfServiceTaint | quote }}
- name: EXCLUDE_FROM_LOAD_BALANCERS
value: {{ .Values.excludeFromLoadBalancers | quote }}
- name: DELETE_LOCAL_DATA
Expand Down
3 changes: 3 additions & 0 deletions config/helm/aws-node-termination-handler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ cordonOnly: false
# Taint node upon spot interruption termination notice.
taintNode: false

# Add the out-of-service taint to the node after the cordon/drain process, which forcefully evicts pods without matching tolerations and detaches persistent volumes.
enableOutOfServiceTaint: false

# Exclude node from load balancer before cordoning via the ServiceNodeExclusion feature gate.
excludeFromLoadBalancers: false

Expand Down
7 changes: 7 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ const (
taintNode = "TAINT_NODE"
taintEffectDefault = "NoSchedule"
taintEffect = "TAINT_EFFECT"
enableOutOfServiceTaintConfigKey = "ENABLE_OUT_OF_SERVICE_TAINT"
enableOutOfServiceTaintDefault = false
excludeFromLoadBalancers = "EXCLUDE_FROM_LOAD_BALANCERS"
jsonLoggingConfigKey = "JSON_LOGGING"
jsonLoggingDefault = false
Expand Down Expand Up @@ -149,6 +151,7 @@ type Config struct {
CordonOnly bool
TaintNode bool
TaintEffect string
EnableOutOfServiceTaint bool
ExcludeFromLoadBalancers bool
JsonLogging bool
LogLevel string
Expand Down Expand Up @@ -215,6 +218,7 @@ func ParseCliArgs() (config Config, err error) {
flag.BoolVar(&config.CordonOnly, "cordon-only", getBoolEnv(cordonOnly, false), "If true, nodes will be cordoned but not drained when an interruption event occurs.")
flag.BoolVar(&config.TaintNode, "taint-node", getBoolEnv(taintNode, false), "If true, nodes will be tainted when an interruption event occurs.")
flag.StringVar(&config.TaintEffect, "taint-effect", getEnv(taintEffect, taintEffectDefault), "Sets the effect when a node is tainted.")
flag.BoolVar(&config.EnableOutOfServiceTaint, "enable-out-of-service-taint", getBoolEnv(enableOutOfServiceTaintConfigKey, enableOutOfServiceTaintDefault), "If true, nodes will be tainted as out-of-service after we cordon/drain the nodes when an interruption event occurs.")
flag.BoolVar(&config.ExcludeFromLoadBalancers, "exclude-from-load-balancers", getBoolEnv(excludeFromLoadBalancers, false), "If true, nodes will be marked for exclusion from load balancers when an interruption event occurs.")
flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.")
flag.StringVar(&config.LogLevel, "log-level", getEnv(logLevelConfigKey, logLevelDefault), "Sets the log level (INFO, DEBUG, or ERROR)")
Expand Down Expand Up @@ -344,6 +348,7 @@ func (c Config) PrintJsonConfigArgs() {
Bool("cordon_only", c.CordonOnly).
Bool("taint_node", c.TaintNode).
Str("taint_effect", c.TaintEffect).
Bool("enable_out_of_service_taint", c.EnableOutOfServiceTaint).
Bool("exclude_from_load_balancers", c.ExcludeFromLoadBalancers).
Bool("json_logging", c.JsonLogging).
Str("log_level", c.LogLevel).
Expand Down Expand Up @@ -395,6 +400,7 @@ func (c Config) PrintHumanConfigArgs() {
"\tcordon-only: %t,\n"+
"\ttaint-node: %t,\n"+
"\ttaint-effect: %s,\n"+
"\tenable-out-of-service-taint: %t,\n"+
"\texclude-from-load-balancers: %t,\n"+
"\tjson-logging: %t,\n"+
"\tlog-level: %s,\n"+
Expand Down Expand Up @@ -437,6 +443,7 @@ func (c Config) PrintHumanConfigArgs() {
c.CordonOnly,
c.TaintNode,
c.TaintEffect,
c.EnableOutOfServiceTaint,
c.ExcludeFromLoadBalancers,
c.JsonLogging,
c.LogLevel,
Expand Down
9 changes: 9 additions & 0 deletions pkg/interruptionevent/draincordon/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,15 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error {
if (err == nil || (!nodeFound && h.commonHandler.NthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil {
h.commonHandler.RunPostDrainTask(nodeName, drainEvent)
}

// Only add out-of-service taint if ENABLE_OUT_OF_SERVICE_TAINT flag is true, and CORDON_ONLY flag is false
if err == nil && h.commonHandler.NthConfig.EnableOutOfServiceTaint && !h.commonHandler.NthConfig.CordonOnly {
err = h.commonHandler.Node.TaintOutOfService(nodeName)
if err != nil {
return fmt.Errorf("cannot add out-of-service taint on node %s: %w", nodeName, err)
}
}

return nil
}

Expand Down
30 changes: 24 additions & 6 deletions pkg/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ const (
ASGLifecycleTerminationTaint = "aws-node-termination-handler/asg-lifecycle-termination"
// RebalanceRecommendationTaint is a taint used to make spot instance unschedulable
RebalanceRecommendationTaint = "aws-node-termination-handler/rebalance-recommendation"
// OutOfServiceTaintKey is the taint key used to forcefully evict pods without matching tolerations and detach persistent volumes
OutOfServiceTaintKey = "node.kubernetes.io/out-of-service"
OutOfServiceTaintValue = "nodeshutdown"
OutOfServiceTaintEffectType = "NoExecute"

maxTaintValueLength = 63
daemonSet = "DaemonSet"
Expand Down Expand Up @@ -447,7 +451,7 @@ func (n Node) TaintSpotItn(nodeName string, eventID string) error {
eventID = eventID[:maxTaintValueLength]
}

return addTaint(k8sNode, n, SpotInterruptionTaint, eventID)
return addTaint(k8sNode, n, SpotInterruptionTaint, eventID, n.nthConfig.TaintEffect)
}

// TaintASGLifecycleTermination adds the spot termination notice taint onto a node
Expand All @@ -465,7 +469,7 @@ func (n Node) TaintASGLifecycleTermination(nodeName string, eventID string) erro
eventID = eventID[:maxTaintValueLength]
}

return addTaint(k8sNode, n, ASGLifecycleTerminationTaint, eventID)
return addTaint(k8sNode, n, ASGLifecycleTerminationTaint, eventID, n.nthConfig.TaintEffect)
}

// TaintRebalanceRecommendation adds the rebalance recommendation notice taint onto a node
Expand All @@ -483,7 +487,7 @@ func (n Node) TaintRebalanceRecommendation(nodeName string, eventID string) erro
eventID = eventID[:maxTaintValueLength]
}

return addTaint(k8sNode, n, RebalanceRecommendationTaint, eventID)
return addTaint(k8sNode, n, RebalanceRecommendationTaint, eventID, n.nthConfig.TaintEffect)
}

// LogPods logs all the pod names on a node
Expand Down Expand Up @@ -525,7 +529,21 @@ func (n Node) TaintScheduledMaintenance(nodeName string, eventID string) error {
eventID = eventID[:maxTaintValueLength]
}

return addTaint(k8sNode, n, ScheduledMaintenanceTaint, eventID)
return addTaint(k8sNode, n, ScheduledMaintenanceTaint, eventID, n.nthConfig.TaintEffect)
}

// TaintOutOfService applies the node.kubernetes.io/out-of-service taint (NoExecute)
// to the named node so pods without a matching toleration are force-evicted and
// their persistent volumes detached. It is a no-op when the feature flag is
// disabled or when the handler is running in cordon-only mode.
func (n Node) TaintOutOfService(nodeName string) error {
	// Honor configuration: skip silently unless out-of-service tainting is
	// enabled AND draining is allowed (i.e. not cordon-only).
	if n.nthConfig.CordonOnly || !n.nthConfig.EnableOutOfServiceTaint {
		return nil
	}

	k8sNode, err := n.fetchKubernetesNode(nodeName)
	if err != nil {
		return fmt.Errorf("Unable to fetch kubernetes node from API: %w", err)
	}

	return addTaint(k8sNode, n, OutOfServiceTaintKey, OutOfServiceTaintValue, OutOfServiceTaintEffectType)
}

// RemoveNTHTaints removes NTH-specific taints from a node
Expand Down Expand Up @@ -711,8 +729,8 @@ func getTaintEffect(effect string) corev1.TaintEffect {
}
}

func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string) error {
effect := getTaintEffect(nth.nthConfig.TaintEffect)
func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, effectType string) error {
effect := getTaintEffect(effectType)
if nth.nthConfig.DryRun {
log.Info().Msgf("Would have added taint (%s=%s:%s) to node %s, but dry-run flag was set", taintKey, taintValue, effect, nth.nthConfig.NodeName)
return nil
Expand Down
40 changes: 40 additions & 0 deletions pkg/node/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ import (
// Size of the fakeRecorder buffer
const recorderBufferSize = 10

const outOfServiceTaintKey = "node.kubernetes.io/out-of-service"
const outOfServiceTaintValue = "nodeshutdown"

var nodeName = "NAME"

func getDrainHelper(client *fake.Clientset) *drain.Helper {
Expand Down Expand Up @@ -418,3 +421,40 @@ func TestFilterOutDaemonSetPods(t *testing.T) {
filteredMockPodList := tNode.FilterOutDaemonSetPods(mockPodList)
h.Equals(t, 2, len(filteredMockPodList.Items))
}

// TestTaintOutOfService verifies that TaintOutOfService adds the
// node.kubernetes.io/out-of-service:NoExecute taint to an existing node when
// the feature flag is enabled and cordon-only mode is off.
func TestTaintOutOfService(t *testing.T) {
	client := fake.NewSimpleClientset()

	// Seed the fake API server with a bare node to be tainted.
	newK8sNode := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}
	_, err := client.CoreV1().Nodes().Create(context.Background(), newK8sNode, metav1.CreateOptions{})
	h.Ok(t, err)

	tNode, err := newNode(config.Config{EnableOutOfServiceTaint: true}, client)
	h.Ok(t, err)
	h.Equals(t, true, tNode.GetNthConfig().EnableOutOfServiceTaint)
	h.Equals(t, false, tNode.GetNthConfig().CordonOnly)

	h.Ok(t, tNode.TaintOutOfService(nodeName))

	updatedNode, err := client.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
	h.Ok(t, err)

	// Scan the node's taints for the expected out-of-service entry.
	want := v1.Taint{
		Key:    outOfServiceTaintKey,
		Value:  outOfServiceTaintValue,
		Effect: corev1.TaintEffectNoExecute,
	}
	found := false
	for _, taint := range updatedNode.Spec.Taints {
		if taint.Key == want.Key && taint.Value == want.Value && taint.Effect == want.Effect {
			found = true
			break
		}
	}
	h.Equals(t, true, found)
}
23 changes: 20 additions & 3 deletions test/e2e/spot-interruption-test
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ function fail_and_exit {
exit "${1:-1}"
}

function remove_out_of_service_taint {
  local node="$1"
  echo "Removing out-of-service taint from node ${node}"
  # Best-effort cleanup: tolerate failure when the taint is not present on the node.
  kubectl taint nodes "${node}" node.kubernetes.io/out-of-service:NoExecute- || true
}

echo "Starting Spot Interruption Test for Node Termination Handler"

SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
Expand All @@ -37,6 +43,7 @@ anth_helm_args=(
--set enableScheduledEventDraining="false"
--set enableSpotInterruptionDraining="true"
--set taintNode="true"
--set enableOutOfServiceTaint="true"
--set daemonsetTolerations=""
--wait
--force
Expand Down Expand Up @@ -110,6 +117,7 @@ fi

cordoned=0
tainted=0
outOfServiceTainted=0
test_node=${TEST_NODE:-$CLUSTER_NAME-worker}
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then
Expand All @@ -118,13 +126,19 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do
fi

if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes "${test_node}" -o json | grep -q "aws-node-termination-handler/spot-itn" >/dev/null; then
echo "✅ Verified the worked node was tainted!"
tainted=1
echo "✅ Verified the worked node was tainted!"
tainted=1
fi

if [[ $cordoned -eq 1 && $tainted -eq 1 ]] && kubectl get nodes "${test_node}" -o json | grep -q "node.kubernetes.io/out-of-service" >/dev/null; then
echo "✅ Verified the worked node was tainted as out-of-service!"
outOfServiceTainted=1
fi

if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
if [[ $tainted -eq 1 && $outOfServiceTainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
echo "✅ Verified the regular-pod-test pod was evicted!"
echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅"
remove_out_of_service_taint "${test_node}"
exit 0
fi
echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
Expand All @@ -135,8 +149,11 @@ if [[ $cordoned -eq 0 ]]; then
echo "❌ Worker node was not cordoned"
elif [[ $tainted -eq 0 ]]; then
echo "❌ Worker node was not tainted"
elif [[ $outOfServiceTainted -eq 0 ]]; then
echo "❌ Worker node was not tainted as out-of-service"
else
echo "❌ regular-pod-test pod was not evicted"
fi

remove_out_of_service_taint "${test_node}"
fail_and_exit 1
Loading