Skip to content

Commit a0fdb4a

Browse files
committed
OCPBUGS-62632: Do not report Progressing=True during cluster scaleup or node reboot
This change improves user's upgrade experience as outlined by OCPSTRAT-2484 by fixing false alarms in ClusterOperator status. The following new rule is adhered to: * Operators should not report Progressing only because DaemonSets owned by them are adjusting to a new node from cluster scaleup or a node rebooting from cluster upgrade. This is achieved by adding helper annotation "tuned.openshift.io/stable-generation" to the tuned daemonset. Other minor fixes: * Minor logging fix (updateTunedProfileStatus()) * Removed non-functional code from the operator initializing ClusterOperator status Resolves: OCPBUGS-62632
1 parent 5d94d67 commit a0fdb4a

File tree

5 files changed

+693
-30
lines changed

5 files changed

+693
-30
lines changed

pkg/apis/tuned/v1/tuned_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ const (
2929
// TunedDeferredUpdate request the tuned daemons to defer the update of the rendered profile
3030
// until the next restart.
3131
TunedDeferredUpdate string = "tuned.openshift.io/deferred"
32+
33+
// StableGenerationAnnotationName is tuned daemonset annotation used to determine ClusterOperator
34+
// progressing status.
35+
StableGenerationAnnotationName = "tuned.openshift.io/last-stable-daemonset-generation"
3236
)
3337

3438
/////////////////////////////////////////////////////////////////////////////////

pkg/operator/controller.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ import (
3838
tunedinformers "github.com/openshift/cluster-node-tuning-operator/pkg/generated/informers/externalversions"
3939
ntomf "github.com/openshift/cluster-node-tuning-operator/pkg/manifests"
4040
"github.com/openshift/cluster-node-tuning-operator/pkg/metrics"
41-
tunedpkg "github.com/openshift/cluster-node-tuning-operator/pkg/tuned"
4241
"github.com/openshift/cluster-node-tuning-operator/pkg/util"
4342
"github.com/openshift/cluster-node-tuning-operator/version"
4443

@@ -571,6 +570,10 @@ func (c *Controller) syncDaemonSet(tuned *tunedv1.Tuned) error {
571570
update = true
572571
}
573572

573+
// OCPBUGS-62632: do not indicate ClusterOperator status Progressing on node scaling or reboot
574+
ds, change := c.storeLastStableGeneration(ds)
575+
update = update || change
576+
574577
if update {
575578
// Update the DaemonSet
576579
ds = ds.DeepCopy() // never update the objects from cache
@@ -658,7 +661,6 @@ func (c *Controller) syncProfile(tuned *tunedv1.Tuned, nodeName string) error {
658661
profileMf.Spec.Config.Verbosity = computed.Operand.Verbosity
659662
profileMf.Spec.Config.TuneDConfig = computed.Operand.TuneDConfig
660663
profileMf.Spec.Profile = computed.AllProfiles
661-
profileMf.Status.Conditions = tunedpkg.InitializeStatusConditions()
662664
_, err = c.clients.Tuned.TunedV1().Profiles(ntoconfig.WatchNamespace()).Create(context.TODO(), profileMf, metav1.CreateOptions{})
663665
if err != nil {
664666
return fmt.Errorf("failed to create Profile %s: %v", profileMf.Name, err)
@@ -737,7 +739,6 @@ func (c *Controller) syncProfile(tuned *tunedv1.Tuned, nodeName string) error {
737739
profile.Spec.Config.TuneDConfig = computed.Operand.TuneDConfig
738740
profile.Spec.Config.ProviderName = providerName
739741
profile.Spec.Profile = computed.AllProfiles
740-
profile.Status.Conditions = tunedpkg.InitializeStatusConditions()
741742
delete(c.pc.state.bootcmdline, nodeName) // bootcmdline retrieved from node annotation is potentially stale, let it resync on node update
742743

743744
klog.V(2).Infof("syncProfile(): updating Profile %s [%s]", profile.Name, computed.TunedProfileName)

pkg/operator/status.go

Lines changed: 57 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@ package operator
22

33
import (
44
"context"
5-
"errors"
65
"fmt"
76
"os"
7+
"strconv"
88

99
configv1 "github.com/openshift/api/config/v1"
1010
operatorv1 "github.com/openshift/api/operator/v1"
1111
operatorv1helpers "github.com/openshift/library-go/pkg/operator/v1helpers"
12+
appsv1 "k8s.io/api/apps/v1"
1213
corev1 "k8s.io/api/core/v1"
1314
apierrors "k8s.io/apimachinery/pkg/api/errors"
1415
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -24,10 +25,6 @@ import (
2425
"github.com/openshift/cluster-node-tuning-operator/version"
2526
)
2627

27-
const (
28-
errGenerationMismatch = "generation mismatch"
29-
)
30-
3128
// syncOperatorStatus computes the operator's current status and therefrom
3229
// creates or updates the ClusterOperator resource for the operator.
3330
func (c *Controller) syncOperatorStatus(tuned *tunedv1.Tuned) error {
@@ -44,10 +41,6 @@ func (c *Controller) syncOperatorStatus(tuned *tunedv1.Tuned) error {
4441
oldConditions := co.Status.Conditions
4542
co.Status.Conditions, operandReleaseVersion, err = c.computeStatus(tuned, oldConditions)
4643
if err != nil {
47-
if err.Error() == errGenerationMismatch {
48-
// Tuned DaemonSet has not updated its status yet
49-
return nil
50-
}
5144
return err
5245
}
5346
if len(operandReleaseVersion) > 0 {
@@ -98,6 +91,53 @@ func (c *Controller) getOrCreateOperatorStatus() (*configv1.ClusterOperator, err
9891
return co, nil
9992
}
10093

94+
// storeLastStableGeneration stores the last stable daemonSet's generation into its annotation.
95+
// Returns the updated daemonSet and true if the annotation was updated.
96+
func (c *Controller) storeLastStableGeneration(daemonSet *appsv1.DaemonSet) (*appsv1.DaemonSet, bool) {
97+
lastStableGeneration := daemonSet.Annotations[tunedv1.StableGenerationAnnotationName]
98+
currentGeneration := strconv.FormatInt(daemonSet.Generation, 10)
99+
if lastStableGeneration == currentGeneration {
100+
return daemonSet, false
101+
}
102+
103+
if isProgressing, _ := isProgressing(daemonSet); isProgressing {
104+
return daemonSet, false
105+
}
106+
107+
klog.V(2).Infof("DaemonSet %s/%s generation %d is stable", daemonSet.Namespace, daemonSet.Name, daemonSet.Generation)
108+
daemonSet.Annotations[tunedv1.StableGenerationAnnotationName] = currentGeneration
109+
return daemonSet, true
110+
}
111+
112+
// Progressing means "[the component] is actively rolling out new code, propagating config
113+
// changes (e.g, a version change), or otherwise moving from one steady state to another.".
114+
// This controller expects that all "config changes" result in increased DaemonSet generation
115+
// (i.e. DaemonSet .spec changes).
116+
// The controller stores the last "stable" DS generation in an annotation.
117+
// Stable means that all DS replicas are available and at the latest version.
118+
// Any subsequent missing replicas must be caused by a node added / removed / rebooted /
119+
// pod manually killed, which then does not result in Progressing=true.
120+
//
121+
// This code is borrowed from library-go.
122+
func isProgressing(daemonSet *appsv1.DaemonSet) (bool, string) {
123+
lastStableGeneration := daemonSet.Annotations[tunedv1.StableGenerationAnnotationName]
124+
currentGeneration := strconv.FormatInt(daemonSet.Generation, 10)
125+
if lastStableGeneration == currentGeneration {
126+
// The previous reconfiguration has completed in the past.
127+
return false, ""
128+
}
129+
130+
switch {
131+
case daemonSet.Generation != daemonSet.Status.ObservedGeneration:
132+
return true, "Waiting for DaemonSet to act on changes"
133+
case daemonSet.Status.UpdatedNumberScheduled < daemonSet.Status.DesiredNumberScheduled:
134+
return true, fmt.Sprintf("Waiting for DaemonSet to update %d node pods", daemonSet.Status.DesiredNumberScheduled)
135+
case daemonSet.Status.NumberUnavailable > 0:
136+
return true, "Waiting for DaemonSet to deploy node pods"
137+
}
138+
return false, ""
139+
}
140+
101141
// profileApplied returns true if Tuned Profile 'profile' has been applied.
102142
func profileApplied(profile *tunedv1.Profile) bool {
103143
if profile == nil || profile.Spec.Config.TunedProfile != profile.Status.TunedProfile {
@@ -178,7 +218,10 @@ func (c *Controller) numMCLabelsAcrossMCP(profileList []*tunedv1.Profile) int {
178218
return n
179219
}
180220

181-
// computeStatus computes the operator's current status.
221+
// computeStatus returns the operator's current status conditions, operand's version and error if any.
222+
// Operand's version is only returned if the operator is NOT progressing towards a new version,
223+
// i.e. is stable. In all other cases (including when we fail to retrieve the daemonset), an empty string
224+
// is returned.
182225
func (c *Controller) computeStatus(tuned *tunedv1.Tuned, conditions []configv1.ClusterOperatorStatusCondition) ([]configv1.ClusterOperatorStatusCondition, string, error) {
183226
const (
184227
// maximum number of seconds for the operator to be Unavailable with a unique
@@ -253,11 +296,6 @@ func (c *Controller) computeStatus(tuned *tunedv1.Tuned, conditions []configv1.C
253296
copyAvailableCondition()
254297
}
255298
} else {
256-
if ds.Status.ObservedGeneration != ds.Generation {
257-
// Do not base the conditions on stale information
258-
return conditions, "", errors.New(errGenerationMismatch)
259-
}
260-
261299
dsReleaseVersion := c.getDaemonSetReleaseVersion(ds)
262300

263301
if ds.Status.NumberAvailable > 0 {
@@ -278,17 +316,11 @@ func (c *Controller) computeStatus(tuned *tunedv1.Tuned, conditions []configv1.C
278316
klog.V(3).Infof("syncOperatorStatus(): %s", availableCondition.Message)
279317
}
280318

281-
// The operator is actively making changes to the operand (is Progressing) when:
282-
// the total number of Nodes that should be running the daemon Pod
283-
// (including Nodes correctly running the daemon Pod) != the total number of
284-
// Nodes that are running updated daemon Pod.
285-
if ds.Status.DesiredNumberScheduled != ds.Status.UpdatedNumberScheduled ||
286-
ds.Status.CurrentNumberScheduled != ds.Status.NumberReady ||
287-
ds.Status.DesiredNumberScheduled == 0 {
319+
if ok, msg := isProgressing(ds); ok {
288320
klog.V(3).Infof("setting Progressing condition to true")
289321
progressingCondition.Status = configv1.ConditionTrue
290322
progressingCondition.Reason = "Reconciling"
291-
progressingCondition.Message = fmt.Sprintf("Working towards %q", os.Getenv("RELEASE_VERSION"))
323+
progressingCondition.Message = msg
292324
} else {
293325
progressingCondition.Status = configv1.ConditionFalse
294326
progressingCondition.Reason = "AsExpected"
@@ -314,9 +346,8 @@ func (c *Controller) computeStatus(tuned *tunedv1.Tuned, conditions []configv1.C
314346
numProgressingProfiles, numDegradedProfiles := numProfilesProgressingDegraded(profileList)
315347

316348
if numProgressingProfiles > 0 {
317-
progressingCondition.Status = configv1.ConditionTrue
318-
progressingCondition.Reason = "ProfileProgressing"
319-
progressingCondition.Message = fmt.Sprintf("Waiting for %v/%v Profiles to be applied", numProgressingProfiles, len(profileList))
349+
availableCondition.Reason = "ProfileProgressing"
350+
availableCondition.Message = fmt.Sprintf("Waiting for %v/%v Profiles to be applied", numProgressingProfiles, len(profileList))
320351
}
321352

322353
if numDegradedProfiles > 0 {

0 commit comments

Comments
 (0)