Skip to content

Commit df9dd3d

Browse files
committed
Fix CSI node DaemonSet progressing
The Progression condition should be set only when the DaemonSet has a new content and is actively rolling out its update. After all replicas are updated, the Progressing condition should be false, even when some pods are missing. E.g. because a node is drained, something evicted them or so on. To find out if a DaemonSet has finished roll out (= stable), store the generation of last stable DaemonSet in a DaemonSet annotation. If the DaemonSet misses a pod, check its annotation. Is the current DaemonSet generation equal to the last stable generation? Then it's not a reconfiguration of the CSI driver and do not set Progressing condition. Is the current DaemonSet generation different? The operator has changed the DaemonSet spec, i.e. reconfiguration is in progress, i.e. set the Progressing condition. There is still a case when user changes the DaemonSet spec and thus bumps the generation. Progressing condition will be set. We consider it acceptable.
1 parent f838eb5 commit df9dd3d

File tree

2 files changed

+137
-10
lines changed

2 files changed

+137
-10
lines changed

pkg/operator/csi/csidrivernodeservicecontroller/csi_driver_node_service_controller.go

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ const (
3434
nodeDriverRegistrarImageEnvName = "NODE_DRIVER_REGISTRAR_IMAGE"
3535
livenessProbeImageEnvName = "LIVENESS_PROBE_IMAGE"
3636
kubeRBACProxyImageEnvName = "KUBE_RBAC_PROXY_IMAGE"
37+
stableGenerationAnnotationName = "storage.openshift.io/stable-generation"
3738
)
3839

3940
// DaemonSetHookFunc is a hook function to modify the DaemonSet.
@@ -181,6 +182,11 @@ func (c *CSIDriverNodeServiceController) syncManaged(ctx context.Context, opSpec
181182
return err
182183
}
183184

185+
daemonSet, err = c.storeLastStableGeneration(ctx, syncContext, daemonSet)
186+
if err != nil {
187+
return err
188+
}
189+
184190
// Create an OperatorStatusApplyConfiguration with generations
185191
status := applyoperatorv1.OperatorStatus().
186192
WithGenerations(&applyoperatorv1.GenerationStatusApplyConfiguration{
@@ -217,7 +223,7 @@ func (c *CSIDriverNodeServiceController) syncManaged(ctx context.Context, opSpec
217223
WithMessage("DaemonSet is not progressing").
218224
WithReason("AsExpected")
219225

220-
if ok, msg := isProgressing(opStatus, daemonSet); ok {
226+
if ok, msg := c.isProgressing(daemonSet); ok {
221227
progressingCondition = progressingCondition.
222228
WithStatus(opv1.ConditionTrue).
223229
WithMessage(msg).
@@ -253,7 +259,22 @@ func (c *CSIDriverNodeServiceController) getDaemonSet(opSpec *opv1.OperatorSpec)
253259
return required, nil
254260
}
255261

256-
func isProgressing(status *opv1.OperatorStatus, daemonSet *appsv1.DaemonSet) (bool, string) {
262+
func (c *CSIDriverNodeServiceController) isProgressing(daemonSet *appsv1.DaemonSet) (bool, string) {
263+
// Progressing means "[the component] is actively rolling out new code, propagating config
264+
// changes (e.g, a version change), or otherwise moving from one steady state to another."
265+
// This controller expects that all "config changes" result in increased DaemonSet generation
266+
// (i.e. DaemonSet .spec changes)
267+
// The controller stores the last "stable" DS generation in an annotation.
268+
// Stable means that all DS replicas are available and at the latest version.
269+
// Any subsequent missing replicas must be caused by a node added / removed / rebooted /
270+
// pod manually killed, which then does not result in Progressing=true.
271+
lastStableGeneration := daemonSet.Annotations[stableGenerationAnnotationName]
272+
currentGeneration := strconv.FormatInt(int64(daemonSet.Generation), 10)
273+
if lastStableGeneration == currentGeneration {
274+
// The previous reconfiguration has completed in the past.
275+
return false, ""
276+
}
277+
257278
switch {
258279
case daemonSet.Generation != daemonSet.Status.ObservedGeneration:
259280
return true, "Waiting for DaemonSet to act on changes"
@@ -315,3 +336,20 @@ func (c *CSIDriverNodeServiceController) syncDeleting(ctx context.Context, opSpe
315336
// All removed, remove the finalizer as the last step
316337
return v1helpers.RemoveFinalizer(ctx, c.operatorClient, c.instanceName)
317338
}
339+
340+
func (c *CSIDriverNodeServiceController) storeLastStableGeneration(ctx context.Context, syncContext factory.SyncContext, daemonSet *appsv1.DaemonSet) (*appsv1.DaemonSet, error) {
341+
lastStableGeneration := daemonSet.Annotations[stableGenerationAnnotationName]
342+
currentGeneration := strconv.FormatInt(int64(daemonSet.Generation), 10)
343+
if lastStableGeneration == currentGeneration {
344+
return daemonSet, nil
345+
}
346+
347+
if isProgressing, _ := c.isProgressing(daemonSet); isProgressing {
348+
return daemonSet, nil
349+
}
350+
351+
klog.V(2).Infof("DaemonSet %s/%s generation %d is stable", daemonSet.Namespace, daemonSet.Name, daemonSet.Generation)
352+
daemonSet.Annotations[stableGenerationAnnotationName] = currentGeneration
353+
daemonSet, _, err := resourceapply.ApplyDaemonSet(ctx, c.kubeClient.AppsV1(), syncContext.Recorder(), daemonSet, daemonSet.Generation)
354+
return daemonSet, err
355+
}

pkg/operator/csi/csidrivernodeservicecontroller/csi_driver_node_service_controller_test.go

Lines changed: 97 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package csidrivernodeservicecontroller
33
import (
44
"context"
55
"fmt"
6-
clocktesting "k8s.io/utils/clock/testing"
76
"os"
87
"sort"
98
"strings"
@@ -19,6 +18,8 @@ import (
1918
coreinformers "k8s.io/client-go/informers"
2019
fakecore "k8s.io/client-go/kubernetes/fake"
2120
core "k8s.io/client-go/testing"
21+
"k8s.io/client-go/tools/cache"
22+
clocktesting "k8s.io/utils/clock/testing"
2223

2324
"github.com/google/go-cmp/cmp"
2425
opv1 "github.com/openshift/api/operator/v1"
@@ -100,7 +101,7 @@ func newTestContext(test testCase, t *testing.T) *testContext {
100101
}
101102

102103
// Add global reactors
103-
addGenerationReactor(coreClient)
104+
addGenerationReactor(coreClient, coreInformerFactory.Apps().V1().DaemonSets().Informer().GetStore())
104105

105106
// fakeDriverInstance also fulfils the OperatorClient interface
106107
fakeOperatorClient := v1helpers.NewFakeOperatorClientWithObjectMeta(
@@ -331,8 +332,18 @@ func withDaemonSetTLSConfig() daemonSetModifier {
331332
}
332333
}
333334

335+
func withDaemonSetAnnotation(annotation string, value string) daemonSetModifier {
336+
return func(instance *appsv1.DaemonSet) *appsv1.DaemonSet {
337+
if instance.Annotations == nil {
338+
instance.Annotations = map[string]string{}
339+
}
340+
instance.Annotations[annotation] = value
341+
return instance
342+
}
343+
}
344+
334345
// This reactor is always enabled and bumps DaemonSet generation when they get updated.
335-
func addGenerationReactor(client *fakecore.Clientset) {
346+
func addGenerationReactor(client *fakecore.Clientset, store cache.Store) {
336347
client.PrependReactor("*", "daemonsets", func(action core.Action) (handled bool, ret runtime.Object, err error) {
337348
switch a := action.(type) {
338349
case core.CreateActionImpl:
@@ -343,7 +354,20 @@ func addGenerationReactor(client *fakecore.Clientset) {
343354
case core.UpdateActionImpl:
344355
object := a.GetObject()
345356
ds := object.(*appsv1.DaemonSet)
346-
ds.Generation++
357+
358+
// Bump generation only when the spec changes.
359+
// Explicitly, do not bump it on annotation change - the API server does not do it either.
360+
oldDSObject, exists, err := store.GetByKey(ds.Namespace + "/" + ds.Name)
361+
if err != nil {
362+
return false, nil, err
363+
}
364+
if !exists {
365+
return false, nil, fmt.Errorf("DaemonSet %s not found", cache.MetaObjectToName(ds))
366+
}
367+
oldDS := oldDSObject.(*appsv1.DaemonSet)
368+
if !equality.Semantic.DeepEqual(oldDS.Spec, ds.Spec) {
369+
ds.Generation++
370+
}
347371
return false, ds, nil
348372
}
349373
return false, nil, nil
@@ -426,6 +450,7 @@ func TestSync(t *testing.T) {
426450
},
427451
},
428452
{
453+
name: "no change",
429454
// DaemonSet is fully deployed and its status is synced to CR
430455
manifestFunc: makeFakeManifest,
431456
images: defaultImages(),
@@ -434,15 +459,45 @@ func TestSync(t *testing.T) {
434459
argsLevel2,
435460
defaultImages(),
436461
withDaemonSetGeneration(1, 1),
437-
withDaemonSetStatus(replica1, replica1, replica1, replica0)),
462+
withDaemonSetStatus(replica1, replica1, replica1, replica0),
463+
withDaemonSetAnnotation(stableGenerationAnnotationName, "1")),
438464
driver: makeFakeDriverInstance(withGenerations(1)),
439465
},
440466
expectedObjects: testObjects{
441467
daemonSet: getDaemonSet(
442468
argsLevel2,
443469
defaultImages(),
444470
withDaemonSetGeneration(1, 1),
445-
withDaemonSetStatus(replica1, replica1, replica1, replica0)),
471+
withDaemonSetStatus(replica1, replica1, replica1, replica0),
472+
withDaemonSetAnnotation(stableGenerationAnnotationName, "1")),
473+
driver: makeFakeDriverInstance(
474+
// withStatus(replica1),
475+
withGenerations(1),
476+
withTrueConditions(conditionAvailable),
477+
withFalseConditions(conditionProgressing)),
478+
},
479+
},
480+
{
481+
name: "finished progressing",
482+
// DaemonSet is fully deployed and its status is synced to CR
483+
manifestFunc: makeFakeManifest,
484+
images: defaultImages(),
485+
initialObjects: testObjects{
486+
daemonSet: getDaemonSet(
487+
argsLevel2,
488+
defaultImages(),
489+
withDaemonSetGeneration(1, 1),
490+
withDaemonSetStatus(replica1, replica1, replica1, replica0),
491+
withDaemonSetAnnotation(stableGenerationAnnotationName, "0")),
492+
driver: makeFakeDriverInstance(withGenerations(1)),
493+
},
494+
expectedObjects: testObjects{
495+
daemonSet: getDaemonSet(
496+
argsLevel2,
497+
defaultImages(),
498+
withDaemonSetGeneration(1, 1),
499+
withDaemonSetStatus(replica1, replica1, replica1, replica0),
500+
withDaemonSetAnnotation(stableGenerationAnnotationName, "1")), // The current generation has reached stability
446501
driver: makeFakeDriverInstance(
447502
// withStatus(replica1),
448503
withGenerations(1),
@@ -490,7 +545,8 @@ func TestSync(t *testing.T) {
490545
argsLevel2,
491546
defaultImages(),
492547
withDaemonSetGeneration(1, 1),
493-
withDaemonSetStatus(replica0, replica0, replica1, replica1)), // the DaemonSet is updating 1 pod
548+
withDaemonSetStatus(replica0, replica0, replica1, replica1), // the DaemonSet is updating 1 pod
549+
withDaemonSetAnnotation(stableGenerationAnnotationName, "0")), // and the current generation was not stable yet
494550
driver: makeFakeDriverInstance(
495551
// withStatus(replica1),
496552
withGenerations(1),
@@ -502,7 +558,8 @@ func TestSync(t *testing.T) {
502558
argsLevel2,
503559
defaultImages(),
504560
withDaemonSetGeneration(1, 1),
505-
withDaemonSetStatus(replica0, replica0, replica1, replica1)), // no change to the DaemonSet
561+
withDaemonSetStatus(replica0, replica0, replica1, replica1),
562+
withDaemonSetAnnotation(stableGenerationAnnotationName, "0")), // and the current generation was not stable yet
506563
driver: makeFakeDriverInstance(
507564
// withStatus(replica0),
508565
withGenerations(1),
@@ -658,6 +715,38 @@ func TestSync(t *testing.T) {
658715
withFalseConditions(conditionAvailable)), // Degraded is set later on
659716
},
660717
},
718+
{
719+
// DaemonSet is updating pods
720+
name: "missing pods - not progressing",
721+
manifestFunc: makeFakeManifest,
722+
images: defaultImages(),
723+
initialObjects: testObjects{
724+
daemonSet: getDaemonSet(
725+
argsLevel2,
726+
defaultImages(),
727+
withDaemonSetGeneration(1, 1),
728+
withDaemonSetStatus(replica0, replica0, replica1, replica1), // the DaemonSet is updating 1 pod
729+
withDaemonSetAnnotation(stableGenerationAnnotationName, "1")), // But the current generation was stable in the past
730+
driver: makeFakeDriverInstance(
731+
// withStatus(replica1),
732+
withGenerations(1),
733+
withTrueConditions(conditionAvailable),
734+
withFalseConditions(conditionProgressing)),
735+
},
736+
expectedObjects: testObjects{
737+
daemonSet: getDaemonSet(
738+
argsLevel2,
739+
defaultImages(),
740+
withDaemonSetGeneration(1, 1),
741+
withDaemonSetStatus(replica0, replica0, replica1, replica1),
742+
withDaemonSetAnnotation(stableGenerationAnnotationName, "1")), // no change to the DaemonSet
743+
driver: makeFakeDriverInstance(
744+
// withStatus(replica0),
745+
withGenerations(1), // But the current generation was stable in the past
746+
withTrueConditions(conditionAvailable),
747+
withFalseConditions(conditionProgressing)), // The operator is not Progressing
748+
},
749+
},
661750
}
662751

663752
for _, test := range testCases {

0 commit comments

Comments
 (0)