Skip to content

Commit 7bd3096

Browse files
committed
featuregates: process gates after version is read from payload
Because of [OCPBUGS-30080](https://issues.redhat.com/browse/OCPBUGS-30080), we cannot easily determine running CVO version by a single `os.Getenv()`, like other operators can. CVO can determine its version from the initial payload it loads from disk though, but this happens a bit later in the code flow, after leadership lease is acquired and all informers are started. At that point we can provide the feature gate / featureset knowledge to the structures that need it: actual CVO controller and the feature changestopper, but these structures also need to be initialized earlier (they require informers which are already started). This leads to a slightly awkard delayed initialization scheme, where the controller structures are initialized early and populated with early content like informers etc. Later, when informers are started and CVO loads its initial payload, we can extract the version from it and use it to populate the feature gate in the controller structures. Because enabled feature gates are avaiable later in the flow, it also means part of the CVO code cannot be gated by a feature gate (like controller initialization, or initial payload loading). We do not need that now but it may cause issues later. The high-level sequence after this commit looks like this: 1. Initialize CVO and ChangeStopper controller structures with informers they need, and populate CVO's `enabledFeatureGate` checker with one panics when used (no code can check for gates before we know them) 2. Acquire lease and start the informers 3. Fetch a FeatureGate resource from the cluster (using an informer) and determine the FeatureSet from it (needed to load the payload) 4. Load the initial payload from disk and extract the version from it 5. Use the version to determine the enabled feature gates from the FeatureGate resource 6. Populate the CVO and ChangeStopper controller structures with the newly discovered feature gates
1 parent e7ea6db commit 7bd3096

12 files changed

+233
-165
lines changed

install/0000_00_cluster-version-operator_03_deployment.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ spec:
5656
name: kube-api-access
5757
readOnly: true
5858
env:
59+
# Unfortunately the placeholder is not replaced, reported as OCPBUGS-30080
5960
- name: OPERATOR_IMAGE_VERSION
6061
value: "0.0.1-snapshot"
6162
- name: KUBERNETES_SERVICE_PORT # allows CVO to communicate with apiserver directly on same host. Is substituted with port from infrastructures.status.apiServerInternalURL if available.

pkg/cvo/cvo.go

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ type Operator struct {
165165
// via annotation
166166
exclude string
167167

168-
clusterFeatures featuregates.ClusterFeatures
168+
enabledFeatureGates featuregates.CvoGateChecker
169169

170170
clusterProfile string
171171
uid types.UID
@@ -186,7 +186,6 @@ func New(
186186
client clientset.Interface,
187187
kubeClient kubernetes.Interface,
188188
exclude string,
189-
clusterFeatures featuregates.ClusterFeatures,
190189
clusterProfile string,
191190
promqlTarget clusterconditions.PromQLTarget,
192191
injectClusterIdIntoPromQL bool,
@@ -218,10 +217,14 @@ func New(
218217
upgradeableQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "upgradeable"),
219218

220219
exclude: exclude,
221-
clusterFeatures: clusterFeatures,
222220
clusterProfile: clusterProfile,
223221
conditionRegistry: standard.NewConditionRegistry(promqlTarget),
224222
injectClusterIdIntoPromQL: injectClusterIdIntoPromQL,
223+
224+
// Because of OCPBUGS-30080, we can only detect the enabled feature gates after Operator loads the initial payload
225+
// from disk via LoadInitialPayload. We must not have any gate-checking code until that happens, so we initialize
226+
// this field with a checker that panics when used.
227+
enabledFeatureGates: featuregates.PanicOnUsageBeforeInitialization,
225228
}
226229

227230
if _, err := cvInformer.Informer().AddEventHandler(optr.clusterVersionEventHandler()); err != nil {
@@ -253,10 +256,9 @@ func New(
253256
return optr, nil
254257
}
255258

256-
// InitializeFromPayload waits until a ClusterVersion object exists. It then retrieves the payload contents and verifies the
257-
// initial state, then configures the controller that loads and applies content to the cluster. It returns an error if the
258-
// payload appears to be in error rather than continuing.
259-
func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *rest.Config, burstRestConfig *rest.Config) error {
259+
// LoadInitialPayload waits until a ClusterVersion object exists. It then retrieves the payload contents, verifies the
260+
// initial state and returns it. If the payload is invalid, an error is returned.
261+
func (optr *Operator) LoadInitialPayload(ctx context.Context, startingRequiredFeatureSet configv1.FeatureSet, restConfig *rest.Config) (*payload.Update, error) {
260262

261263
// wait until cluster version object exists
262264
if err := wait.PollUntilContextCancel(ctx, 3*time.Second, true, func(ctx context.Context) (bool, error) {
@@ -273,24 +275,19 @@ func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *res
273275
}
274276
return true, nil
275277
}); err != nil {
276-
return fmt.Errorf("Error when attempting to get cluster version object: %w", err)
278+
return nil, fmt.Errorf("Error when attempting to get cluster version object: %w", err)
277279
}
278280

279-
update, err := payload.LoadUpdate(optr.defaultPayloadDir(), optr.release.Image, optr.exclude, optr.clusterFeatures.StartingRequiredFeatureSet,
281+
update, err := payload.LoadUpdate(optr.defaultPayloadDir(), optr.release.Image, optr.exclude, string(startingRequiredFeatureSet),
280282
optr.clusterProfile, capability.GetKnownCapabilities())
281283

282284
if err != nil {
283-
return fmt.Errorf("the local release contents are invalid - no current version can be determined from disk: %v", err)
285+
return nil, fmt.Errorf("the local release contents are invalid - no current version can be determined from disk: %v", err)
284286
}
285-
286-
optr.release = update.Release
287-
optr.releaseCreated = update.ImageRef.CreationTimestamp.Time
288-
optr.SetArchitecture(update.Architecture)
289-
290287
httpClientConstructor := sigstore.NewCachedHTTPClientConstructor(optr.HTTPClient, nil)
291288
configClient, err := coreclientsetv1.NewForConfig(restConfig)
292289
if err != nil {
293-
return fmt.Errorf("unable to create a configuration client: %v", err)
290+
return nil, fmt.Errorf("unable to create a configuration client: %v", err)
294291
}
295292

296293
customSignatureStore := &customsignaturestore.Store{
@@ -302,7 +299,7 @@ func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *res
302299
// attempt to load a verifier as defined in the payload
303300
verifier, signatureStore, err := loadConfigMapVerifierDataFromUpdate(update, httpClientConstructor.HTTPClient, configClient, customSignatureStore)
304301
if err != nil {
305-
return err
302+
return nil, err
306303
}
307304
if verifier != nil {
308305
klog.Infof("Verifying release authenticity: %v", verifier)
@@ -312,6 +309,16 @@ func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *res
312309
}
313310
optr.verifier = verifier
314311
optr.signatureStore = signatureStore
312+
return update, nil
313+
}
314+
315+
// InitializeFromPayload configures the controller that loads and applies content to the cluster given an initial payload
316+
// and feature gate data.
317+
func (optr *Operator) InitializeFromPayload(update *payload.Update, requiredFeatureSet configv1.FeatureSet, cvoFlags featuregates.CvoGateChecker, restConfig *rest.Config, burstRestConfig *rest.Config) {
318+
optr.enabledFeatureGates = cvoFlags
319+
optr.release = update.Release
320+
optr.releaseCreated = update.ImageRef.CreationTimestamp.Time
321+
optr.SetArchitecture(update.Architecture)
315322

316323
// after the verifier has been loaded, initialize the sync worker with a payload retriever
317324
// which will consume the verifier
@@ -327,12 +334,10 @@ func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *res
327334
Cap: time.Second * 15,
328335
},
329336
optr.exclude,
330-
optr.clusterFeatures.StartingRequiredFeatureSet,
337+
requiredFeatureSet,
331338
optr.eventRecorder,
332339
optr.clusterProfile,
333340
)
334-
335-
return nil
336341
}
337342

338343
// ownerReferenceModifier sets the owner reference to the current CV resource if no other reference exists. It also resets

pkg/cvo/cvo_scenarios_test.go

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,11 @@ import (
2626

2727
configv1 "github.com/openshift/api/config/v1"
2828
"github.com/openshift/client-go/config/clientset/versioned/fake"
29+
"github.com/openshift/library-go/pkg/manifest"
2930

31+
"github.com/openshift/cluster-version-operator/pkg/featuregates"
3032
"github.com/openshift/cluster-version-operator/pkg/payload"
3133
"github.com/openshift/cluster-version-operator/pkg/payload/precondition"
32-
"github.com/openshift/library-go/pkg/manifest"
3334
)
3435

3536
var architecture string
@@ -108,14 +109,15 @@ func setupCVOTest(payloadDir string) (*Operator, map[string]apiruntime.Object, *
108109
}
109110

110111
o := &Operator{
111-
namespace: "test",
112-
name: "version",
113-
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cvo-loop-test"),
114-
client: client,
115-
cvLister: &clientCVLister{client: client},
116-
exclude: "exclude-test",
117-
eventRecorder: record.NewFakeRecorder(100),
118-
clusterProfile: payload.DefaultClusterProfile,
112+
namespace: "test",
113+
name: "version",
114+
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cvo-loop-test"),
115+
client: client,
116+
enabledFeatureGates: featuregates.DefaultCvoGates("version"),
117+
cvLister: &clientCVLister{client: client},
118+
exclude: "exclude-test",
119+
eventRecorder: record.NewFakeRecorder(100),
120+
clusterProfile: payload.DefaultClusterProfile,
119121
}
120122

121123
dynamicScheme := apiruntime.NewScheme()

pkg/cvo/cvo_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,12 @@ import (
3939
configv1 "github.com/openshift/api/config/v1"
4040
clientset "github.com/openshift/client-go/config/clientset/versioned"
4141
"github.com/openshift/client-go/config/clientset/versioned/fake"
42-
43-
"github.com/openshift/cluster-version-operator/pkg/payload"
4442
"github.com/openshift/library-go/pkg/manifest"
4543
"github.com/openshift/library-go/pkg/verify/store/serial"
4644
"github.com/openshift/library-go/pkg/verify/store/sigstore"
45+
46+
"github.com/openshift/cluster-version-operator/pkg/featuregates"
47+
"github.com/openshift/cluster-version-operator/pkg/payload"
4748
)
4849

4950
var (
@@ -2273,6 +2274,7 @@ func TestOperator_sync(t *testing.T) {
22732274
optr.configSync = &fakeSyncRecorder{Returns: expectStatus}
22742275
}
22752276
optr.eventRecorder = record.NewFakeRecorder(100)
2277+
optr.enabledFeatureGates = featuregates.DefaultCvoGates("version")
22762278

22772279
ctx := context.Background()
22782280
err := optr.sync(ctx, optr.queueKey())

pkg/cvo/status.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ func (optr *Operator) syncStatus(ctx context.Context, original, config *configv1
199199
original = config.DeepCopy()
200200
}
201201

202-
updateClusterVersionStatus(&config.Status, status, optr.release, optr.getAvailableUpdates, optr.clusterFeatures.StartingCvoFeatureGates, validationErrs)
202+
updateClusterVersionStatus(&config.Status, status, optr.release, optr.getAvailableUpdates, optr.enabledFeatureGates, validationErrs)
203203

204204
if klog.V(6).Enabled() {
205205
klog.Infof("Apply config: %s", diff.ObjectReflectDiff(original, config))
@@ -211,7 +211,7 @@ func (optr *Operator) syncStatus(ctx context.Context, original, config *configv1
211211

212212
// updateClusterVersionStatus updates the passed cvStatus with the latest status information
213213
func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status *SyncWorkerStatus,
214-
release configv1.Release, getAvailableUpdates func() *availableUpdates, enabledGates featuregates.CvoGates,
214+
release configv1.Release, getAvailableUpdates func() *availableUpdates, enabledGates featuregates.CvoGateChecker,
215215
validationErrs field.ErrorList) {
216216

217217
cvStatus.ObservedGeneration = status.Generation
@@ -382,7 +382,7 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status
382382
}
383383

384384
oldRriCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, resourceReconciliationIssuesConditionType)
385-
if enabledGates.ResourceReconciliationIssuesCondition || (oldRriCondition != nil && enabledGates.UnknownVersion) {
385+
if enabledGates.ResourceReconciliationIssuesCondition() || (oldRriCondition != nil && enabledGates.UnknownVersion()) {
386386
rriCondition := configv1.ClusterOperatorStatusCondition{
387387
Type: resourceReconciliationIssuesConditionType,
388388
Status: configv1.ConditionFalse,

pkg/cvo/status_test.go

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717
"github.com/openshift/client-go/config/clientset/versioned/fake"
1818

1919
"github.com/openshift/cluster-version-operator/lib/resourcemerge"
20-
"github.com/openshift/cluster-version-operator/pkg/featuregates"
2120
)
2221

2322
func Test_mergeEqualVersions(t *testing.T) {
@@ -197,6 +196,19 @@ func TestOperator_syncFailingStatus(t *testing.T) {
197196
}
198197
}
199198

199+
type fakeRriFlags struct {
200+
unknownVersion bool
201+
resourceReconciliationIssuesCondition bool
202+
}
203+
204+
func (f fakeRriFlags) UnknownVersion() bool {
205+
return f.unknownVersion
206+
}
207+
208+
func (f fakeRriFlags) ResourceReconciliationIssuesCondition() bool {
209+
return f.resourceReconciliationIssuesCondition
210+
}
211+
200212
func TestUpdateClusterVersionStatus_UnknownVersionAndRRI(t *testing.T) {
201213
ignoreLastTransitionTime := cmpopts.IgnoreFields(configv1.ClusterOperatorStatusCondition{}, "LastTransitionTime")
202214

@@ -247,9 +259,9 @@ func TestUpdateClusterVersionStatus_UnknownVersionAndRRI(t *testing.T) {
247259
for _, tc := range testCases {
248260
tc := tc
249261
t.Run(tc.name, func(t *testing.T) {
250-
gates := featuregates.CvoGates{
251-
UnknownVersion: tc.unknownVersion,
252-
ResourceReconciliationIssuesCondition: false,
262+
gates := fakeRriFlags{
263+
unknownVersion: tc.unknownVersion,
264+
resourceReconciliationIssuesCondition: false,
253265
}
254266
release := configv1.Release{}
255267
getAvailableUpdates := func() *availableUpdates { return nil }
@@ -317,7 +329,10 @@ func TestUpdateClusterVersionStatus_ResourceReconciliationIssues(t *testing.T) {
317329
for _, tc := range testCases {
318330
tc := tc
319331
t.Run(tc.name, func(t *testing.T) {
320-
gates := featuregates.CvoGates{ResourceReconciliationIssuesCondition: tc.enabled}
332+
gates := fakeRriFlags{
333+
unknownVersion: false,
334+
resourceReconciliationIssuesCondition: tc.enabled,
335+
}
321336
release := configv1.Release{}
322337
getAvailableUpdates := func() *availableUpdates { return nil }
323338
var noErrors field.ErrorList

pkg/cvo/sync_worker.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,14 +178,14 @@ type SyncWorker struct {
178178

179179
// requiredFeatureSet is set to the value of Feature.config.openshift.io|spec.featureSet, which contributes to
180180
// whether or not some manifests are included for reconciliation.
181-
requiredFeatureSet string
181+
requiredFeatureSet configv1.FeatureSet
182182

183183
clusterProfile string
184184
}
185185

186186
// NewSyncWorker initializes a ConfigSyncWorker that will retrieve payloads to disk, apply them via builder
187187
// to a server, and obey limits about how often to reconcile or retry on errors.
188-
func NewSyncWorker(retriever PayloadRetriever, builder payload.ResourceBuilder, reconcileInterval time.Duration, backoff wait.Backoff, exclude string, requiredFeatureSet string, eventRecorder record.EventRecorder, clusterProfile string) *SyncWorker {
188+
func NewSyncWorker(retriever PayloadRetriever, builder payload.ResourceBuilder, reconcileInterval time.Duration, backoff wait.Backoff, exclude string, requiredFeatureSet configv1.FeatureSet, eventRecorder record.EventRecorder, clusterProfile string) *SyncWorker {
189189
return &SyncWorker{
190190
retriever: retriever,
191191
builder: builder,
@@ -210,7 +210,7 @@ func NewSyncWorker(retriever PayloadRetriever, builder payload.ResourceBuilder,
210210
// NewSyncWorkerWithPreconditions initializes a ConfigSyncWorker that will retrieve payloads to disk, apply them via builder
211211
// to a server, and obey limits about how often to reconcile or retry on errors.
212212
// It allows providing preconditions for loading payload.
213-
func NewSyncWorkerWithPreconditions(retriever PayloadRetriever, builder payload.ResourceBuilder, preconditions precondition.List, reconcileInterval time.Duration, backoff wait.Backoff, exclude string, requiredFeatureSet string, eventRecorder record.EventRecorder, clusterProfile string) *SyncWorker {
213+
func NewSyncWorkerWithPreconditions(retriever PayloadRetriever, builder payload.ResourceBuilder, preconditions precondition.List, reconcileInterval time.Duration, backoff wait.Backoff, exclude string, requiredFeatureSet configv1.FeatureSet, eventRecorder record.EventRecorder, clusterProfile string) *SyncWorker {
214214
worker := NewSyncWorker(retriever, builder, reconcileInterval, backoff, exclude, requiredFeatureSet, eventRecorder, clusterProfile)
215215
worker.preconditions = preconditions
216216
return worker
@@ -315,7 +315,7 @@ func (w *SyncWorker) syncPayload(ctx context.Context, work *SyncWork) ([]configv
315315

316316
// Capability filtering is not done here since unknown capabilities are allowed
317317
// during updated payload load and enablement checking only occurs during apply.
318-
payloadUpdate, err := payload.LoadUpdate(info.Directory, desired.Image, w.exclude, w.requiredFeatureSet, w.clusterProfile, nil)
318+
payloadUpdate, err := payload.LoadUpdate(info.Directory, desired.Image, w.exclude, string(w.requiredFeatureSet), w.clusterProfile, nil)
319319

320320
if err != nil {
321321
msg := fmt.Sprintf("Loading payload failed version=%q image=%q failure=%v", desired.Version, desired.Image, err)

0 commit comments

Comments
 (0)