Skip to content

Commit cada4f4

Browse files
Merge pull request #1196 from petr-muller/ota-1531-06-establish-gates-early
OTA-1531: [6/6] cvo: use early gate checker to avoid delayed initialization
2 parents 546f3c2 + c55bdea commit cada4f4

File tree

6 files changed

+77
-134
lines changed

6 files changed

+77
-134
lines changed

pkg/cvo/cvo.go

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,19 @@ type Operator struct {
174174
// via annotation
175175
exclude string
176176

177+
// requiredFeatureSet is the feature set that was detected in the cluster when CVO was started and is used
178+
// to select the manifests that will be applied in the cluster. The starting value cannot be changed in the executing
179+
// CVO but the featurechangestopper controller will detect a feature set change in the cluster and shutdown the CVO.
180+
// Enforcing featuresets is a standard GA CVO behavior that supports the feature gating functionality across the whole
181+
// cluster, as opposed to the enabledFeatureGates which controls what gated behaviors of CVO itself are enabled by
182+
// the cluster feature gates.
183+
// See: https://github.com/openshift/enhancements/blob/master/enhancements/update/cvo-techpreview-manifests.md
184+
requiredFeatureSet configv1.FeatureSet
185+
186+
// enabledFeatureGates is the checker for what gated CVO behaviors are enabled or disabled by specific cluster-level
187+
// feature gates. It allows multiplexing the cluster-level feature gates to more granular CVO-level gates. Similarly
188+
// to the requiredFeatureSet, the enabledFeatureGates cannot be changed in the executing CVO but the
189+
// featurechangestopper controller will detect when cluster feature gate config changes and shutdown the CVO.
177190
enabledFeatureGates featuregates.CvoGateChecker
178191

179192
clusterProfile string
@@ -210,6 +223,8 @@ func New(
210223
injectClusterIdIntoPromQL bool,
211224
updateService string,
212225
alwaysEnableCapabilities []configv1.ClusterVersionCapability,
226+
featureSet configv1.FeatureSet,
227+
cvoGates featuregates.CvoGateChecker,
213228
) (*Operator, error) {
214229
eventBroadcaster := record.NewBroadcaster()
215230
eventBroadcaster.StartLogging(klog.Infof)
@@ -243,10 +258,9 @@ func New(
243258
conditionRegistry: standard.NewConditionRegistry(promqlTarget),
244259
injectClusterIdIntoPromQL: injectClusterIdIntoPromQL,
245260

246-
// Because of OCPBUGS-30080, we can only detect the enabled feature gates after Operator loads the initial payload
247-
// from disk via LoadInitialPayload. We must not have any gate-checking code until that happens, so we initialize
248-
// this field with a checker that panics when used.
249-
enabledFeatureGates: featuregates.PanicOnUsageBeforeInitialization,
261+
requiredFeatureSet: featureSet,
262+
enabledFeatureGates: cvoGates,
263+
250264
alwaysEnableCapabilities: alwaysEnableCapabilities,
251265
}
252266

@@ -283,7 +297,7 @@ func New(
283297

284298
// LoadInitialPayload waits until a ClusterVersion object exists. It then retrieves the payload contents, verifies the
285299
// initial state and returns it. If the payload is invalid, an error is returned.
286-
func (optr *Operator) LoadInitialPayload(ctx context.Context, startingRequiredFeatureSet configv1.FeatureSet, restConfig *rest.Config) (*payload.Update, error) {
300+
func (optr *Operator) LoadInitialPayload(ctx context.Context, restConfig *rest.Config) (*payload.Update, error) {
287301

288302
// wait until cluster version object exists
289303
if err := wait.PollUntilContextCancel(ctx, 3*time.Second, true, func(ctx context.Context) (bool, error) {
@@ -303,7 +317,7 @@ func (optr *Operator) LoadInitialPayload(ctx context.Context, startingRequiredFe
303317
return nil, fmt.Errorf("Error when attempting to get cluster version object: %w", err)
304318
}
305319

306-
update, err := payload.LoadUpdate(optr.defaultPayloadDir(), optr.release.Image, optr.exclude, string(startingRequiredFeatureSet),
320+
update, err := payload.LoadUpdate(optr.defaultPayloadDir(), optr.release.Image, optr.exclude, string(optr.requiredFeatureSet),
307321
optr.clusterProfile, configv1.KnownClusterVersionCapabilities)
308322

309323
if err != nil {
@@ -339,8 +353,12 @@ func (optr *Operator) LoadInitialPayload(ctx context.Context, startingRequiredFe
339353

340354
// InitializeFromPayload configures the controller that loads and applies content to the cluster given an initial payload
341355
// and feature gate data.
342-
func (optr *Operator) InitializeFromPayload(update *payload.Update, requiredFeatureSet configv1.FeatureSet, cvoFlags featuregates.CvoGateChecker, restConfig *rest.Config, burstRestConfig *rest.Config) {
343-
optr.enabledFeatureGates = cvoFlags
356+
func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *rest.Config, burstRestConfig *rest.Config) error {
357+
update, err := optr.LoadInitialPayload(ctx, restConfig)
358+
if err != nil {
359+
return err
360+
}
361+
344362
optr.release = update.Release
345363
optr.releaseCreated = update.ImageRef.CreationTimestamp.Time
346364

@@ -358,11 +376,13 @@ func (optr *Operator) InitializeFromPayload(update *payload.Update, requiredFeat
358376
Cap: time.Second * 15,
359377
},
360378
optr.exclude,
361-
requiredFeatureSet,
379+
optr.requiredFeatureSet,
362380
optr.eventRecorder,
363381
optr.clusterProfile,
364382
optr.alwaysEnableCapabilities,
365383
)
384+
385+
return nil
366386
}
367387

368388
// ownerReferenceModifier sets the owner reference to the current CV resource if no other reference exists. It also resets

pkg/featuregates/featurechangestopper.go

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,16 @@ type ChangeStopper struct {
3030
}
3131

3232
// NewChangeStopper returns a new ChangeStopper.
33-
func NewChangeStopper(featureGateInformer configinformersv1.FeatureGateInformer) (*ChangeStopper, error) {
33+
func NewChangeStopper(featureGateInformer configinformersv1.FeatureGateInformer, requiredFeatureSet configv1.FeatureSet, cvoGates CvoGates) (*ChangeStopper, error) {
3434
c := &ChangeStopper{
3535
featureGateLister: featureGateInformer.Lister(),
3636
cacheSynced: []cache.InformerSynced{featureGateInformer.Informer().HasSynced},
3737
queue: workqueue.NewTypedRateLimitingQueueWithConfig[any](workqueue.DefaultTypedControllerRateLimiter[any](), workqueue.TypedRateLimitingQueueConfig[any]{Name: "feature-gate-stopper"}),
38+
39+
startingRequiredFeatureSet: &requiredFeatureSet,
40+
startingCvoGates: &cvoGates,
3841
}
42+
3943
c.queue.Add("cluster") // seed an initial sync, in case startingRequiredFeatureSet is wrong
4044
if _, err := featureGateInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
4145
AddFunc: func(_ interface{}) {
@@ -54,11 +58,6 @@ func NewChangeStopper(featureGateInformer configinformersv1.FeatureGateInformer)
5458
return c, nil
5559
}
5660

57-
func (c *ChangeStopper) SetStartingFeatures(requiredFeatureSet configv1.FeatureSet, cvoGates CvoGates) {
58-
c.startingRequiredFeatureSet = &requiredFeatureSet
59-
c.startingCvoGates = &cvoGates
60-
}
61-
6261
// syncHandler processes a single work entry, with the
6362
// processNextWorkItem caller handling the queue management. It returns
6463
// done when there will be no more work (because the feature gate changed).

pkg/featuregates/featurechangestopper_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,11 +101,10 @@ func TestTechPreviewChangeStopper(t *testing.T) {
101101
client := fakeconfigv1client.NewSimpleClientset(fg)
102102

103103
informerFactory := configv1informer.NewSharedInformerFactory(client, 0)
104-
c, err := NewChangeStopper(informerFactory.Config().V1().FeatureGates())
104+
c, err := NewChangeStopper(informerFactory.Config().V1().FeatureGates(), tt.startingRequiredFeatureSet, tt.startingCvoFeatureGates)
105105
if err != nil {
106106
t.Fatal(err)
107107
}
108-
c.SetStartingFeatures(tt.startingRequiredFeatureSet, tt.startingCvoFeatureGates)
109108
informerFactory.Start(ctx.Done())
110109

111110
if err := c.Run(ctx, shutdownFn); err != nil {

pkg/featuregates/featuregates.go

Lines changed: 5 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@ import (
88
// StubOpenShiftVersion is the default OpenShift version placeholder for the purpose of determining
99
// enabled and disabled CVO feature gates. It is assumed to never conflict with a real OpenShift
1010
// version. Both DefaultCvoGates and CvoGatesFromFeatureGate should return a safe conservative
11-
// default set of enabled and disabled gates, typically with unknownVersion set to true.
11+
// default set of enabled and disabled gates when StubOpenShiftVersion is passed as the version.
12+
// This value is an analogue to the "0.0.1-snapshot" string that is used as a placeholder in
13+
// second-level operator manifests. Here we use the same string for consistency, but the constant
14+
// should be only used by the callers of CvoGatesFromFeatureGate and DefaultCvoGates methods when
15+
// the caller does not know its actual OpenShift version.
1216
const StubOpenShiftVersion = "0.0.1-snapshot"
1317

1418
// CvoGateChecker allows CVO code to check which feature gates are enabled
@@ -41,36 +45,6 @@ type CvoGateChecker interface {
4145
CVOConfiguration() bool
4246
}
4347

44-
type panicOnUsageBeforeInitializationFunc func()
45-
46-
func panicOnUsageBeforeInitialization() {
47-
panic("CVO feature flags were used before they were initialized")
48-
}
49-
50-
// PanicOnUsageBeforeInitialization is a CvoGateChecker that panics if any of its methods are called. This checker should
51-
// be used before CVO feature gates are actually known and some code tries to check them.
52-
var PanicOnUsageBeforeInitialization = panicOnUsageBeforeInitializationFunc(panicOnUsageBeforeInitialization)
53-
54-
func (p panicOnUsageBeforeInitializationFunc) ReconciliationIssuesCondition() bool {
55-
p()
56-
return false
57-
}
58-
59-
func (p panicOnUsageBeforeInitializationFunc) StatusReleaseArchitecture() bool {
60-
p()
61-
return false
62-
}
63-
64-
func (p panicOnUsageBeforeInitializationFunc) UnknownVersion() bool {
65-
p()
66-
return false
67-
}
68-
69-
func (p panicOnUsageBeforeInitializationFunc) CVOConfiguration() bool {
70-
p()
71-
return false
72-
}
73-
7448
// CvoGates contains flags that control CVO functionality gated by product feature gates. The
7549
// names do not correspond to product feature gates, the booleans here are "smaller" (product-level
7650
// gate will enable multiple CVO behaviors).

pkg/start/start.go

Lines changed: 19 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ import (
3535
configv1 "github.com/openshift/api/config/v1"
3636
clientset "github.com/openshift/client-go/config/clientset/versioned"
3737
configinformers "github.com/openshift/client-go/config/informers/externalversions"
38-
configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
3938
operatorclientset "github.com/openshift/client-go/operator/clientset/versioned"
4039
operatorinformers "github.com/openshift/client-go/operator/informers/externalversions"
4140
"github.com/openshift/library-go/pkg/config/clusterstatus"
@@ -190,13 +189,13 @@ func (o *Options) Run(ctx context.Context) error {
190189
}
191190

192191
clusterVersionConfigInformerFactory, configInformerFactory := o.prepareConfigInformerFactories(cb)
193-
_, _, err = o.processInitialFeatureGate(ctx, configInformerFactory)
192+
startingFeatureSet, startingCvoGates, err := o.processInitialFeatureGate(ctx, configInformerFactory)
194193
if err != nil {
195194
return fmt.Errorf("error processing feature gates: %w", err)
196195
}
197196

198197
// initialize the controllers and attempt to load the payload information
199-
controllerCtx, err := o.NewControllerContext(cb, clusterVersionConfigInformerFactory, configInformerFactory)
198+
controllerCtx, err := o.NewControllerContext(cb, startingFeatureSet, startingCvoGates, clusterVersionConfigInformerFactory, configInformerFactory)
200199
if err != nil {
201200
return err
202201
}
@@ -244,7 +243,7 @@ func (o *Options) getOpenShiftVersion() string {
244243
return releaseMetadata.Version
245244
}
246245

247-
func (o *Options) processInitialFeatureGate(ctx context.Context, configInformerFactory configinformers.SharedInformerFactory) (configv1.FeatureSet, *featuregates.CvoGates, error) {
246+
func (o *Options) processInitialFeatureGate(ctx context.Context, configInformerFactory configinformers.SharedInformerFactory) (configv1.FeatureSet, featuregates.CvoGates, error) {
248247
featureGates := configInformerFactory.Config().V1().FeatureGates().Lister()
249248
configInformerFactory.Start(ctx.Done())
250249
configInformerFactory.WaitForCacheSync(ctx.Done())
@@ -256,8 +255,8 @@ func (o *Options) processInitialFeatureGate(ctx context.Context, configInformerF
256255
var clusterFeatureGate *configv1.FeatureGate
257256

258257
// client-go automatically retries some network blip errors on GETs for 30s by default, and we want to
259-
// retry the remaining ones ourselves. If we fail longer than that, the operator won't be able to do work
260-
// anyway. Return the error and crashloop.
258+
// retry the remaining ones ourselves. If we fail longer than that, CVO won't be able to do work anyway.
259+
// Return the error and crashloop.
261260
//
262261
// We implement the timeout with a context because the timeout in PollImmediateWithContext does not behave
263262
// well when ConditionFunc takes longer time to execute, like here where the GET can be retried by client-go
@@ -284,17 +283,17 @@ func (o *Options) processInitialFeatureGate(ctx context.Context, configInformerF
284283
}
285284
}); err != nil {
286285
if lastError != nil {
287-
return "", nil, lastError
286+
return "", cvoGates, lastError
288287
}
289-
return "", nil, err
288+
return "", cvoGates, err
290289
}
291290

292291
if cvoGates.UnknownVersion() {
293292
klog.Warningf("CVO features for version %s could not be detected from FeatureGate; will use defaults plus special UnknownVersion feature gate", cvoOpenShiftVersion)
294293
}
295294
klog.Infof("CVO features for version %s enabled at startup: %+v", cvoOpenShiftVersion, cvoGates)
296295

297-
return startingFeatureSet, &cvoGates, nil
296+
return startingFeatureSet, cvoGates, nil
298297
}
299298

300299
// run launches a number of goroutines to handle manifest application,
@@ -362,7 +361,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock resource
362361
resultChannel <- asyncResult{name: "metrics server", error: err}
363362
}()
364363
}
365-
if err := controllerCtx.InitializeFromPayload(runContext, restConfig, burstRestConfig); err != nil {
364+
if err := controllerCtx.CVO.InitializeFromPayload(runContext, restConfig, burstRestConfig); err != nil {
366365
if firstError == nil {
367366
firstError = err
368367
}
@@ -577,13 +576,17 @@ type Context struct {
577576
// OperatorInformerFactory should be used to get informers / listers for code that works with resources from the
578577
// operator.openshift.io group
579578
OperatorInformerFactory operatorinformers.SharedInformerFactory
580-
581-
fgLister configlistersv1.FeatureGateLister
582579
}
583580

584581
// NewControllerContext initializes the default Context for the current Options. It does
585582
// not start any background processes.
586-
func (o *Options) NewControllerContext(cb *ClientBuilder, clusterVersionConfigInformerFactory, configInformerFactory configinformers.SharedInformerFactory) (*Context, error) {
583+
func (o *Options) NewControllerContext(
584+
cb *ClientBuilder,
585+
startingFeatureSet configv1.FeatureSet,
586+
startingCvoGates featuregates.CvoGates,
587+
clusterVersionConfigInformerFactory,
588+
configInformerFactory configinformers.SharedInformerFactory,
589+
) (*Context, error) {
587590
kubeClient := cb.KubeClientOrDie(internal.ConfigNamespace, useProtobuf)
588591
openshiftConfigInformerFactory := coreinformers.NewSharedInformerFactoryWithOptions(kubeClient, resyncPeriod(o.ResyncInterval), coreinformers.WithNamespace(internal.ConfigNamespace))
589592
openshiftConfigManagedInformerFactory := coreinformers.NewSharedInformerFactoryWithOptions(kubeClient, resyncPeriod(o.ResyncInterval), coreinformers.WithNamespace(internal.ConfigManagedNamespace))
@@ -621,12 +624,14 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, clusterVersionConfigIn
621624
o.InjectClusterIdIntoPromQL,
622625
o.UpdateService,
623626
stringsToCapabilities(o.AlwaysEnableCapabilities),
627+
startingFeatureSet,
628+
startingCvoGates,
624629
)
625630
if err != nil {
626631
return nil, err
627632
}
628633

629-
featureChangeStopper, err := featuregates.NewChangeStopper(configInformerFactory.Config().V1().FeatureGates())
634+
featureChangeStopper, err := featuregates.NewChangeStopper(configInformerFactory.Config().V1().FeatureGates(), startingFeatureSet, startingCvoGates)
630635
if err != nil {
631636
return nil, err
632637
}
@@ -639,8 +644,6 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, clusterVersionConfigIn
639644
OperatorInformerFactory: operatorInformerFactory,
640645
CVO: cvo,
641646
StopOnFeatureGateChange: featureChangeStopper,
642-
643-
fgLister: configInformerFactory.Config().V1().FeatureGates().Lister(),
644647
}
645648

646649
if o.EnableAutoUpdate {
@@ -663,70 +666,6 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, clusterVersionConfigIn
663666
return ctx, nil
664667
}
665668

666-
// InitializeFromPayload initializes the CVO and FeatureGate ChangeStoppers controllers from the payload. It extracts the
667-
// current CVO version from the initial payload and uses it to determine the initial the required featureset and enabled
668-
// feature gates. Both the payload and determined feature information are used to initialize CVO and feature gate
669-
// ChangeStopper controllers.
670-
func (c *Context) InitializeFromPayload(ctx context.Context, restConfig *rest.Config, burstRestConfig *rest.Config) error {
671-
var startingFeatureSet configv1.FeatureSet
672-
var clusterFeatureGate *configv1.FeatureGate
673-
674-
// client-go automatically retries some network blip errors on GETs for 30s by default, and we want to
675-
// retry the remaining ones ourselves. If we fail longer than that, the operator won't be able to do work
676-
// anyway. Return the error and crashloop.
677-
//
678-
// We implement the timeout with a context because the timeout in PollImmediateWithContext does not behave
679-
// well when ConditionFunc takes longer time to execute, like here where the GET can be retried by client-go
680-
var lastError error
681-
if err := wait.PollUntilContextTimeout(context.Background(), 2*time.Second, 25*time.Second, true, func(ctx context.Context) (bool, error) {
682-
gate, fgErr := c.fgLister.Get("cluster")
683-
switch {
684-
case apierrors.IsNotFound(fgErr):
685-
// if we have no featuregates, then the cluster is using the default featureset, which is "".
686-
// This excludes everything that could possibly depend on a different feature set.
687-
startingFeatureSet = ""
688-
klog.Infof("FeatureGate not found in cluster, using default feature set %q at startup", startingFeatureSet)
689-
return true, nil
690-
case fgErr != nil:
691-
lastError = fgErr
692-
klog.Warningf("Failed to get FeatureGate from cluster: %v", fgErr)
693-
return false, nil
694-
default:
695-
clusterFeatureGate = gate
696-
startingFeatureSet = gate.Spec.FeatureSet
697-
klog.Infof("FeatureGate found in cluster, using its feature set %q at startup", startingFeatureSet)
698-
return true, nil
699-
}
700-
}); err != nil {
701-
if lastError != nil {
702-
return lastError
703-
}
704-
return err
705-
}
706-
707-
payload, err := c.CVO.LoadInitialPayload(ctx, startingFeatureSet, restConfig)
708-
if err != nil {
709-
return err
710-
}
711-
712-
var cvoGates featuregates.CvoGates
713-
if clusterFeatureGate != nil {
714-
cvoGates = featuregates.CvoGatesFromFeatureGate(clusterFeatureGate, payload.Release.Version)
715-
} else {
716-
cvoGates = featuregates.DefaultCvoGates(payload.Release.Version)
717-
}
718-
719-
if cvoGates.UnknownVersion() {
720-
klog.Infof("CVO features for version %s could not be detected from FeatureGate; will use defaults plus special UnknownVersion feature gate", payload.Release.Version)
721-
}
722-
klog.Infof("CVO features for version %s enabled at startup: %+v", payload.Release.Version, cvoGates)
723-
724-
c.StopOnFeatureGateChange.SetStartingFeatures(startingFeatureSet, cvoGates)
725-
c.CVO.InitializeFromPayload(payload, startingFeatureSet, cvoGates, restConfig, burstRestConfig)
726-
727-
return nil
728-
}
729-
730669
func stringsToCapabilities(names []string) []configv1.ClusterVersionCapability {
731670
caps := make([]configv1.ClusterVersionCapability, len(names))
732671
for i, c := range names {

0 commit comments

Comments
 (0)