Skip to content

Commit e6ff14c

Browse files
committed
OTA-1531: Rework error handling in FeatureGate processing
- The `FeatureGate` GET is now made over an informer client which is backed by a cache, so polling to smooth over non-isNotFound errors no longer makes sense. The GET runs after a cache sync succeeded earlier so no errors are expected (and if an error happens, it is unlikely to be resolved by a retry) - Waiting for cache sync was not bounded; impose a 30s timeout on startup (this is similar to the deadline imposed by the poll before we started to use the informer client) - Return pointers from `processInitialFeatureGate` to make return errors easier
1 parent 2d837d9 commit e6ff14c

File tree

2 files changed

+31
-40
lines changed

2 files changed

+31
-40
lines changed

pkg/start/start.go

Lines changed: 28 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ import (
1919
"k8s.io/apimachinery/pkg/fields"
2020
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2121
"k8s.io/apimachinery/pkg/util/sets"
22-
"k8s.io/apimachinery/pkg/util/wait"
2322
coreinformers "k8s.io/client-go/informers"
2423
"k8s.io/client-go/kubernetes"
2524
"k8s.io/client-go/kubernetes/scheme"
@@ -195,7 +194,7 @@ func (o *Options) Run(ctx context.Context) error {
195194
}
196195

197196
// initialize the controllers and attempt to load the payload information
198-
controllerCtx, err := o.NewControllerContext(cb, startingFeatureSet, startingCvoGates, clusterVersionConfigInformerFactory, configInformerFactory)
197+
controllerCtx, err := o.NewControllerContext(cb, *startingFeatureSet, *startingCvoGates, clusterVersionConfigInformerFactory, configInformerFactory)
199198
if err != nil {
200199
return err
201200
}
@@ -243,57 +242,49 @@ func (o *Options) getOpenShiftVersion() string {
243242
return releaseMetadata.Version
244243
}
245244

246-
func (o *Options) processInitialFeatureGate(ctx context.Context, configInformerFactory configinformers.SharedInformerFactory) (configv1.FeatureSet, featuregates.CvoGates, error) {
245+
func (o *Options) processInitialFeatureGate(ctx context.Context, configInformerFactory configinformers.SharedInformerFactory) (*configv1.FeatureSet, *featuregates.CvoGates, error) {
247246
featureGates := configInformerFactory.Config().V1().FeatureGates().Lister()
248247
configInformerFactory.Start(ctx.Done())
249-
configInformerFactory.WaitForCacheSync(ctx.Done())
248+
249+
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
250+
defer cancel()
251+
252+
for key, synced := range configInformerFactory.WaitForCacheSync(ctx.Done()) {
253+
if !synced {
254+
return nil, nil, fmt.Errorf("failed to sync %s informer cache: %v", key.String(), ctx.Err())
255+
}
256+
}
250257

251258
cvoOpenShiftVersion := o.getOpenShiftVersion()
252259
cvoGates := featuregates.DefaultCvoGates(cvoOpenShiftVersion)
253260

254261
var startingFeatureSet configv1.FeatureSet
255262
var clusterFeatureGate *configv1.FeatureGate
256263

257-
// client-go automatically retries some network blip errors on GETs for 30s by default, and we want to
258-
// retry the remaining ones ourselves. If we fail longer than that, CVO won't be able to do work anyway.
259-
// Return the error and crashloop.
260-
//
261-
// We implement the timeout with a context because the timeout in PollImmediateWithContext does not behave
262-
// well when ConditionFunc takes longer time to execute, like here where the GET can be retried by client-go
263-
var lastError error
264-
if err := wait.PollUntilContextTimeout(context.Background(), 2*time.Second, 25*time.Second, true, func(ctx context.Context) (bool, error) {
265-
gate, fgErr := featureGates.Get("cluster")
266-
switch {
267-
case apierrors.IsNotFound(fgErr):
268-
// if we have no featuregates, then the cluster is using the default featureset, which is "".
269-
// This excludes everything that could possibly depend on a different feature set.
270-
startingFeatureSet = ""
271-
klog.Infof("FeatureGate not found in cluster, will assume default feature set %q at startup", startingFeatureSet)
272-
return true, nil
273-
case fgErr != nil:
274-
lastError = fgErr
275-
klog.Warningf("Failed to get FeatureGate from cluster: %v", fgErr)
276-
return false, nil
277-
default:
278-
clusterFeatureGate = gate
279-
startingFeatureSet = gate.Spec.FeatureSet
280-
cvoGates = featuregates.CvoGatesFromFeatureGate(clusterFeatureGate, cvoOpenShiftVersion)
281-
klog.Infof("FeatureGate found in cluster, using its feature set %q at startup", startingFeatureSet)
282-
return true, nil
283-
}
284-
}); err != nil {
285-
if lastError != nil {
286-
return "", cvoGates, lastError
287-
}
288-
return "", cvoGates, err
264+
gate, err := featureGates.Get("cluster")
265+
switch {
266+
case err != nil && apierrors.IsNotFound(err):
267+
// if we have no featuregates, then the cluster is using the default featureset, which is "".
268+
// This excludes everything that could possibly depend on a different feature set.
269+
startingFeatureSet = ""
270+
klog.Infof("FeatureGate not found in cluster, will assume default feature set %q at startup", startingFeatureSet)
271+
case err != nil:
272+
// This should not happen because featureGates is backed by the informer cache which successfully synced earlier
273+
klog.Errorf("Failed to get FeatureGate from cluster: %v", err)
274+
return nil, nil, fmt.Errorf("failed to get FeatureGate from informer cache: %w", err)
275+
default:
276+
clusterFeatureGate = gate
277+
startingFeatureSet = gate.Spec.FeatureSet
278+
cvoGates = featuregates.CvoGatesFromFeatureGate(clusterFeatureGate, cvoOpenShiftVersion)
279+
klog.Infof("FeatureGate found in cluster, using its feature set %q at startup", startingFeatureSet)
289280
}
290281

291282
if cvoGates.UnknownVersion() {
292283
klog.Warningf("CVO features for version %s could not be detected from FeatureGate; will use defaults plus special UnknownVersion feature gate", cvoOpenShiftVersion)
293284
}
294285
klog.Infof("CVO features for version %s enabled at startup: %+v", cvoOpenShiftVersion, cvoGates)
295286

296-
return startingFeatureSet, cvoGates, nil
287+
return &startingFeatureSet, &cvoGates, nil
297288
}
298289

299290
// run launches a number of goroutines to handle manifest application,

pkg/start/start_integration_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ func TestIntegrationCVO_initializeAndUpgrade(t *testing.T) {
194194
if err != nil {
195195
t.Fatal(err)
196196
}
197-
controllers, err := options.NewControllerContext(cb, featureset, gates, clusterVersionConfigInformerFactory, configInformerFactory)
197+
controllers, err := options.NewControllerContext(cb, *featureset, *gates, clusterVersionConfigInformerFactory, configInformerFactory)
198198
if err != nil {
199199
t.Fatal(err)
200200
}
@@ -335,7 +335,7 @@ func TestIntegrationCVO_gracefulStepDown(t *testing.T) {
335335
if err != nil {
336336
t.Fatal(err)
337337
}
338-
controllers, err := options.NewControllerContext(cb, featureset, gates, clusterVersionConfigInformerFactory, configInformerFactory)
338+
controllers, err := options.NewControllerContext(cb, *featureset, *gates, clusterVersionConfigInformerFactory, configInformerFactory)
339339
if err != nil {
340340
t.Fatal(err)
341341
}
@@ -538,7 +538,7 @@ metadata:
538538
if err != nil {
539539
t.Fatal(err)
540540
}
541-
controllers, err := options.NewControllerContext(cb, featureset, gates, clusterVersionConfigInformerFactory, configInformerFactory)
541+
controllers, err := options.NewControllerContext(cb, *featureset, *gates, clusterVersionConfigInformerFactory, configInformerFactory)
542542
if err != nil {
543543
t.Fatal(err)
544544
}

0 commit comments

Comments
 (0)