Skip to content

Commit 3c9c79c

Browse files
authored
Merge pull request #3226 from andyzhangx/NeverStopTaintRemoval
chore: keep disk taint removal in a loop by default
2 parents 92b412d + c7e3515 commit 3c9c79c

File tree

2 files changed

+23
-11
lines changed

2 files changed

+23
-11
lines changed

pkg/azuredisk/azuredisk.go

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ type Driver struct {
128128
endpoint string
129129
disableAVSetNodes bool
130130
removeNotReadyTaint bool
131+
neverStopTaintRemoval bool
131132
kubeClient clientset.Interface
132133
// a timed cache storing volume stats <volumeID, volumeStats>
133134
volStatsCache azcache.Resource
@@ -182,6 +183,7 @@ func NewDriver(options *DriverOptions) *Driver {
182183
driver.endpoint = options.Endpoint
183184
driver.disableAVSetNodes = options.DisableAVSetNodes
184185
driver.removeNotReadyTaint = options.RemoveNotReadyTaint
186+
driver.neverStopTaintRemoval = options.NeverStopTaintRemoval
185187
driver.maxConcurrentFormat = options.MaxConcurrentFormat
186188
driver.concurrentFormatTimeout = options.ConcurrentFormatTimeout
187189
driver.enableMinimumRetryAfter = options.EnableMinimumRetryAfter
@@ -314,7 +316,7 @@ func NewDriver(options *DriverOptions) *Driver {
314316
// Remove taint from node to indicate driver startup success
315317
// This is done at the last possible moment to prevent race conditions or false positive removals
316318
time.AfterFunc(time.Duration(options.TaintRemovalInitialDelayInSeconds)*time.Second, func() {
317-
removeTaintInBackground(kubeClient, driver.NodeID, driver.Name, taintRemovalBackoff, removeNotReadyTaint)
319+
removeTaintInBackground(kubeClient, driver.NodeID, driver.Name, taintRemovalBackoff, driver.neverStopTaintRemoval, removeNotReadyTaint)
318320
})
319321
}
320322
return &driver
@@ -670,18 +672,26 @@ type JSONPatch struct {
670672
}
671673

672674
// removeTaintInBackground is a goroutine that retries removeNotReadyTaint with exponential backoff
673-
func removeTaintInBackground(k8sClient clientset.Interface, nodeName, driverName string, backoff wait.Backoff, removalFunc func(clientset.Interface, string, string) error) {
675+
func removeTaintInBackground(k8sClient clientset.Interface, nodeName, driverName string, backoff wait.Backoff, neverStop bool, removalFunc func(clientset.Interface, string, string) error) {
674676
backoffErr := wait.ExponentialBackoff(backoff, func() (bool, error) {
675-
err := removalFunc(k8sClient, nodeName, driverName)
676-
if err != nil {
677-
klog.ErrorS(err, "Unexpected failure when attempting to remove node taint(s)")
677+
if err := removalFunc(k8sClient, nodeName, driverName); err != nil {
678+
klog.Errorf("taint removal returned with error: %v", err)
678679
return false, nil
679680
}
680681
return true, nil
681682
})
682683

683-
if backoffErr != nil {
684-
klog.ErrorS(backoffErr, "Retries exhausted, giving up attempting to remove node taint(s)")
684+
klog.Errorf("taint removal returned with error: %v", backoffErr)
685+
if neverStop {
686+
klog.V(2).Infof("Starting taint removal loop, will retry indefinitely")
687+
for {
688+
klog.V(6).Infof("Waiting for around 5 minutes before retrying taint removal")
689+
time.Sleep(4*time.Minute + wait.Jitter(time.Minute, 1.0))
690+
if err := removalFunc(k8sClient, nodeName, driverName); err != nil {
691+
klog.Errorf("taint removal returned with error: %v", err)
692+
return
693+
}
694+
}
685695
}
686696
}
687697

@@ -700,10 +710,10 @@ func removeNotReadyTaint(clientset clientset.Interface, nodeName, driverName str
700710
}
701711

702712
taintKeyToRemove := driverName + consts.AgentNotReadyNodeTaintKeySuffix
703-
klog.V(2).Infof("removing taint with key %s from local node %s", taintKeyToRemove, nodeName)
713+
klog.V(6).Infof("removing taint with key %s from local node %s", taintKeyToRemove, nodeName)
704714
var taintsToKeep []corev1.Taint
705715
for _, taint := range node.Spec.Taints {
706-
klog.V(5).Infof("checking taint key %s, value %s, effect %s", taint.Key, taint.Value, taint.Effect)
716+
klog.V(6).Infof("checking taint key %s, value %s, effect %s", taint.Key, taint.Value, taint.Effect)
707717
if taint.Key != taintKeyToRemove {
708718
taintsToKeep = append(taintsToKeep, taint)
709719
} else {
@@ -712,7 +722,7 @@ func removeNotReadyTaint(clientset clientset.Interface, nodeName, driverName str
712722
}
713723

714724
if len(taintsToKeep) == len(node.Spec.Taints) {
715-
klog.V(2).Infof("No taints to remove on node, skipping taint removal")
725+
klog.V(6).Infof("No taints to remove on node, skipping taint removal")
716726
return nil
717727
}
718728

@@ -751,7 +761,7 @@ func checkAllocatable(ctx context.Context, clientset clientset.Interface, nodeNa
751761
for _, driver := range csiNode.Spec.Drivers {
752762
if driver.Name == driverName {
753763
if driver.Allocatable != nil && driver.Allocatable.Count != nil {
754-
klog.V(2).Infof("CSINode Allocatable value is set for driver on node %s, count %d", nodeName, *driver.Allocatable.Count)
764+
klog.V(6).Infof("CSINode Allocatable value is set for driver on node %s, count %d", nodeName, *driver.Allocatable.Count)
755765
return nil
756766
}
757767
return fmt.Errorf("isAllocatableSet: allocatable value not set for driver on node %s", nodeName)

pkg/azuredisk/azuredisk_option.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ type DriverOptions struct {
6565
Endpoint string
6666
DisableAVSetNodes bool
6767
RemoveNotReadyTaint bool
68+
NeverStopTaintRemoval bool
6869
TaintRemovalInitialDelayInSeconds int64
6970
MaxConcurrentFormat int64
7071
ConcurrentFormatTimeout int64
@@ -114,6 +115,7 @@ func (o *DriverOptions) AddFlags() *flag.FlagSet {
114115
fs.StringVar(&o.Kubeconfig, "kubeconfig", "", "Absolute path to the kubeconfig file. Required only when running out of cluster.")
115116
fs.BoolVar(&o.DisableAVSetNodes, "disable-avset-nodes", false, "disable DisableAvailabilitySetNodes in cloud config for controller")
116117
fs.BoolVar(&o.RemoveNotReadyTaint, "remove-not-ready-taint", true, "remove NotReady taint from node when node is ready")
118+
fs.BoolVar(&o.NeverStopTaintRemoval, "never-stop-taint-removal", true, "if true, taint removal will never stop, otherwise it will stop after the first successful removal")
117119
fs.Int64Var(&o.TaintRemovalInitialDelayInSeconds, "taint-removal-initial-delay-seconds", 30, "initial delay in seconds for taint removal")
118120
fs.StringVar(&o.Endpoint, "endpoint", "unix://tmp/csi.sock", "CSI endpoint")
119121
fs.Int64Var(&o.MaxConcurrentFormat, "max-concurrent-format", 2, "maximum number of concurrent format exec calls")

0 commit comments

Comments
 (0)