
Commit c5bc6fa

[release-1.27] Handle istio-cni cleanup on node restart (#57880)
* Handle istio-cni on node cleanup

  Currently on cleanup, if safe upgrades are enabled, we check whether the istio-cni DaemonSet has a deletion timestamp. If it does not, we are in the middle of an upgrade or a node reboot; otherwise we should clean up. This did not handle failures on the get request for the DaemonSet (other than "not found"), which can indicate the node is in an unhealthy state or restarting. Previously an error meant we would clean up; now we retry the get and assume we should not clean up by default.

* fix lint
* Add release note
* Cleanup root
* Refactor StopCleanup to only default to true when using istio owned cni config. Also, check for cni pod in plugin prior to getting k8s client.
* Handle unauthorized get error on cleanup
* Fix releasenotes and string format
* Fix nits

Signed-off-by: Jackie Elliott <[email protected]>
Co-authored-by: Jackie Elliott <[email protected]>
1 parent c2bcef1 commit c5bc6fa

4 files changed: +81 additions, -32 deletions

cni/pkg/cmd/root.go

Lines changed: 8 additions & 6 deletions

@@ -132,25 +132,27 @@ var rootCmd = &cobra.Command{
         // if it is, we do NOT remove the plugin, and do
         // NOT do ambient watch server cleanup
         defer func() {
-            var isUpgrade bool
+            var shouldStopCleanup bool
             if cfg.InstallConfig.AmbientDisableSafeUpgrade {
                 log.Info("Ambient node agent safe upgrade explicitly disabled via env")
-                isUpgrade = false
+                shouldStopCleanup = false
             } else {
-                isUpgrade = ambientAgent.ShouldStopForUpgrade("istio-cni", nodeagent.PodNamespace)
+                shouldStopCleanup = ambientAgent.ShouldStopCleanup("istio-cni", nodeagent.PodNamespace, cfg.InstallConfig.IstioOwnedCNIConfig)
             }
-            log.Infof("Ambient node agent shutting down - is upgrade shutdown? %t", isUpgrade)
+            log.Infof("Ambient node agent shutting down - should stop cleanup? %t", shouldStopCleanup)
+
+            // TODO(jaellio) - do we want to add support for a partial cleanup
             // if we are doing an "upgrade shutdown", then
             // we do NOT want to remove/cleanup the CNI plugin.
             //
             // This is important - we want it to remain in place to "stall"
             // new ambient-enabled pods while our replacement spins up.
-            if !isUpgrade {
+            if !shouldStopCleanup {
                 if cleanErr := installer.Cleanup(); cleanErr != nil {
                     log.Error(cleanErr.Error())
                 }
             }
-            ambientAgent.Stop(isUpgrade)
+            ambientAgent.Stop(shouldStopCleanup)
         }()

         ambientAgent.Start()
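For readers skimming the root.go change, the decision it encodes is: cleanup runs only when we are NOT stopping for an upgrade or reboot, and disabling safe upgrade forces cleanup. A minimal standalone sketch of that decision; the function and flag names below are illustrative stand-ins, not code from this commit:

package main

import "fmt"

// decideCleanup mirrors the deferred shutdown logic in rootCmd: the CNI config
// and binary are removed only when we are NOT stopping for an upgrade/reboot.
// parentDSPresent stands in for the result of ShouldStopCleanup, which in the
// real agent inspects the istio-cni DaemonSet (with retries).
func decideCleanup(safeUpgradeDisabled, parentDSPresent bool) (runCleanup bool) {
    shouldStopCleanup := false
    if !safeUpgradeDisabled {
        shouldStopCleanup = parentDSPresent
    }
    return !shouldStopCleanup
}

func main() {
    fmt.Println(decideCleanup(false, true))  // false: upgrade or reboot, leave the plugin in place
    fmt.Println(decideCleanup(false, false)) // true: DaemonSet gone, clean up normally
    fmt.Println(decideCleanup(true, true))   // true: safe upgrade disabled, always clean up
}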

cni/pkg/nodeagent/server.go

Lines changed: 45 additions & 11 deletions

@@ -21,7 +21,9 @@ import (
     "sync/atomic"
     "time"

+    "github.com/cenkalti/backoff/v4"
     corev1 "k8s.io/api/core/v1"
+    "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/client-go/rest"

@@ -31,7 +33,10 @@ import (

 const defaultZTunnelKeepAliveCheckInterval = 5 * time.Second

-var log = scopes.CNIAgent
+var (
+    log              = scopes.CNIAgent
+    tokenWaitBackoff = time.Second
+)

 type MeshDataplane interface {
     // MUST be called first, (even before Start()).
@@ -119,18 +124,47 @@ func (s *Server) Stop(skipCleanup bool) {
     s.dataplane.Stop(skipCleanup)
 }

-func (s *Server) ShouldStopForUpgrade(selfName, selfNamespace string) bool {
+// ShouldStopCleanup of istio-cni config and binary when upgrading or on node reboot
+func (s *Server) ShouldStopCleanup(selfName, selfNamespace string, istioOwnedCNIConfig bool) bool {
     dsName := fmt.Sprintf("%s-node", selfName)
-    cniDS, err := s.kubeClient.Kube().AppsV1().DaemonSets(selfNamespace).Get(context.Background(), dsName, metav1.GetOptions{})
-    log.Debugf("Daemonset %s has deletion timestamp?: %+v", dsName, cniDS.DeletionTimestamp)
-    if err == nil && cniDS != nil && cniDS.DeletionTimestamp == nil {
-        log.Infof("terminating, but parent DS %s is still present, this is an upgrade, leaving plugin in place", dsName)
-        return true
+    shouldStopCleanup := false
+    var numRetries uint64
+    // use different defaults when using an istio owned CNI config file
+    if istioOwnedCNIConfig {
+        shouldStopCleanup = true
+        numRetries = 2
     }
-
-    // If the DS is gone, it's definitely not an upgrade, so carry on like normal.
-    log.Infof("parent DS %s is gone or marked for deletion, this is not an upgrade, shutting down normally %s", dsName, err)
-    return false
+    err := backoff.Retry(
+        func() error {
+            cniDS, err := s.kubeClient.Kube().AppsV1().DaemonSets(selfNamespace).Get(context.Background(), dsName, metav1.GetOptions{})
+
+            if err == nil && cniDS != nil && cniDS.DeletionTimestamp == nil {
+                log.Infof("terminating, but parent DaemonSet %s is still present, this is an upgrade or a node reboot, leaving plugin in place", dsName)
+                shouldStopCleanup = true
+                return nil
+            }
+            if errors.IsNotFound(err) || (cniDS != nil && cniDS.DeletionTimestamp != nil) {
+                // If the DS is gone, or marked for deletion, this is not an upgrade.
+                // We can safely shut down the plugin.
+                log.Infof("parent DaemonSet %s is not found or marked for deletion, this is not an upgrade, shutting down normally", dsName)
+                shouldStopCleanup = false
+                return nil
+            }
+            if errors.IsUnauthorized(err) {
+                log.Infof("permission to get parent DaemonSet %s has been revoked manually or due to uninstall, this is not an upgrade, "+
+                    "shutting down normally", dsName)
+                shouldStopCleanup = false
+                return nil
+            }
+            log.Infof("failed to get parent DS %s, retrying: %v", dsName, err)
+            return err
+        },
+        // Limiting retries to 3 so other shutdown tasks can complete before the graceful shutdown period ends
+        backoff.WithMaxRetries(backoff.NewConstantBackOff(tokenWaitBackoff), numRetries))
+    if err != nil {
+        log.Infof("failed to get parent DaemonSet %s, returning %t: %v", dsName, shouldStopCleanup, err)
+    }
+    return shouldStopCleanup
 }

 // buildKubeClient creates the kube client
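A note on the backoff/v4 wiring above: backoff.WithMaxRetries(b, n) allows the initial attempt plus n further retries, so with numRetries = 2 the DaemonSet Get runs at most three times, and with the default numRetries = 0 (non istio-owned config) it runs exactly once. A small self-contained sketch of that behavior; the failing operation is illustrative and not code from this commit:

package main

import (
    "errors"
    "fmt"
    "time"

    "github.com/cenkalti/backoff/v4"
)

func main() {
    attempts := 0
    // Same shape as ShouldStopCleanup: a constant 1s backoff capped by WithMaxRetries.
    op := func() error {
        attempts++
        return errors.New("transient apiserver error") // stand-in for a failing Get
    }
    err := backoff.Retry(op, backoff.WithMaxRetries(backoff.NewConstantBackOff(time.Second), 2))
    fmt.Printf("attempts=%d err=%v\n", attempts, err) // attempts=3, the last error is returned
}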

cni/pkg/plugin/plugin.go

Lines changed: 18 additions & 15 deletions

@@ -164,23 +164,26 @@ func CmdAdd(args *skel.CmdArgs) (err error) {
         return err
     }

+    // Preemptively check if the pod is a CNI pod.
+    // It is possible that the kubeconfig is not available if it hasn't been written yet
+    // by the CNI pod or it is invalid which cause the CNI pod to be unable to start if
+    // on the creation of a new K8s client. We preemptively check if the pod is a CNI pod
+    // to avoid a deadlock on the kubeconfig when the k8s client is unnecessary to process
+    // the CNI add event for the CNI pod itself.
+    if conf.AmbientEnabled {
+        k8sArgs := K8sArgs{}
+        if err := types.LoadArgs(args.Args, &k8sArgs); err != nil {
+            return fmt.Errorf("failed to load args: %v", err)
+        }
+        if isCNIPod(conf, &k8sArgs) {
+            // If we are in a degraded state and this is our own agent pod, skip
+            return pluginResponse(conf)
+        }
+    }
+
     // Create a kube client
     client, err := newK8sClient(*conf)
-    // If creation of a kube client fails, check if the pod is a CNI pod.
-    // It is possible that the kubeconfig is not available if it hasn't been written yet
-    // by the CNI pod which could be unable to start if we failed here. We skip this
-    // failure for the CNI pod to avoid a deadlock.
     if err != nil {
-        if conf.AmbientEnabled {
-            k8sArgs := K8sArgs{}
-            if err := types.LoadArgs(args.Args, &k8sArgs); err != nil {
-                return fmt.Errorf("failed to load args after failed attempt to get client: %v", err)
-            }
-            if isCNIPod(conf, &k8sArgs) {
-                // If we are in a degraded state and this is our own agent pod, skip
-                return pluginResponse(conf)
-            }
-        }
         return fmt.Errorf("failed to createNewK8sClient: %v", err)
     }

@@ -404,6 +407,6 @@ func isCNIPod(conf *Config, k8sArgs *K8sArgs) bool {
         log.Infof("in a degraded state and %v looks like our own agent pod, skipping", k8sArgs.K8S_POD_NAME)
         return true
     }
-    log.Warnf("not a CNI pod, podName: %s, podNamespace: %s", k8sArgs.K8S_POD_NAME, k8sArgs.K8S_POD_NAMESPACE)
+    log.Warnf("not a CNI pod, podName: %s, podNamespace: %s, conf pod namespace: %s", k8sArgs.K8S_POD_NAME, k8sArgs.K8S_POD_NAMESPACE, conf.PodNamespace)
     return false
 }
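The plugin.go change moves the "is this our own agent pod?" check ahead of kube client creation, so the agent pod's own CNI ADD event never depends on a kubeconfig the agent has not yet written (or has written invalidly). A condensed sketch of that ordering under assumed, illustrative names; nothing below is taken from the plugin source:

package main

import (
    "errors"
    "fmt"
)

// looksLikeOwnAgentPod stands in for isCNIPod; newKubeClient stands in for newK8sClient.
func looksLikeOwnAgentPod(podName string) bool {
    return podName == "istio-cni-node-abc"
}

func newKubeClient(kubeconfigReady bool) (string, error) {
    if !kubeconfigReady {
        return "", errors.New("kubeconfig not written yet")
    }
    return "client", nil
}

// cmdAdd mirrors the reordered flow: take the agent-pod fast path first, then
// build a client only for ordinary pods.
func cmdAdd(podName string, kubeconfigReady bool) error {
    if looksLikeOwnAgentPod(podName) {
        fmt.Println("own agent pod: replying without a kube client")
        return nil
    }
    if _, err := newKubeClient(kubeconfigReady); err != nil {
        return fmt.Errorf("failed to create kube client: %v", err)
    }
    fmt.Println("handled pod with kube client")
    return nil
}

func main() {
    fmt.Println(cmdAdd("istio-cni-node-abc", false)) // <nil>: fast path avoids the kubeconfig deadlock
    fmt.Println(cmdAdd("app-pod-1", true))           // <nil>: normal path with a client
}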

releasenotes/notes/57316.yaml

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+apiVersion: release-notes/v2
+kind: feature
+area: security
+issue:
+  - 57316
+releaseNotes:
+- |
+  **Fixed** the behavior of istio-cni cleanup when the get daemonset command fails due to an error
+  other than "not found". Defaults to not cleaning up the CNI config and binary when it cannot be
+  concluded if an upgrade, delete, or node reboot is in progress.
