Skip to content

Commit 8a8bca5

Browse files
Merge pull request #1135 from hongkailiu/OTA-1411
OTA-1411: USC: Maintain status insights for ClusterOperator resources
2 parents e90705b + 6b5af28 commit 8a8bca5

File tree

4 files changed

+905
-25
lines changed

4 files changed

+905
-25
lines changed

install/0000_00_update-status-controller_02_rbac-DevPreviewNoUpgrade.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,18 @@ rules:
9999
- apiGroups:
100100
- config.openshift.io
101101
resources:
102+
- clusteroperators
102103
- clusterversions
103104
verbs:
104105
- get
105106
- list
106107
- watch
108+
- apiGroups:
109+
- apps
110+
resources:
111+
- deployments
112+
verbs:
113+
- get
107114
---
108115
apiVersion: rbac.authorization.k8s.io/v1
109116
kind: RoleBinding

pkg/updatestatus/controlplaneinformer.go

Lines changed: 233 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@ package updatestatus
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
7+
"strings"
68
"time"
79

810
"gopkg.in/yaml.v3"
9-
"k8s.io/apimachinery/pkg/api/errors"
11+
kerrors "k8s.io/apimachinery/pkg/api/errors"
1012
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1113
"k8s.io/apimachinery/pkg/runtime"
14+
appsv1client "k8s.io/client-go/kubernetes/typed/apps/v1"
1215
"k8s.io/klog/v2"
1316

1417
configv1 "github.com/openshift/api/config/v1"
@@ -20,75 +23,272 @@ import (
2023
"github.com/openshift/cluster-version-operator/lib/resourcemerge"
2124
)
2225

23-
// controlPlaneInformerController is the controller that monitors health of the control plane-related resources (initially,
24-
// just ClusterVersion but will need to handle ClusterOperators too) and produces insights for control plane update.
26+
// controlPlaneInformerController is the controller that monitors health of the control plane-related resources
27+
// and produces insights for control plane update.
2528
type controlPlaneInformerController struct {
26-
clusterVersions configv1listers.ClusterVersionLister
27-
recorder events.Recorder
29+
clusterVersions configv1listers.ClusterVersionLister
30+
clusterOperators configv1listers.ClusterOperatorLister
31+
recorder events.Recorder
2832

2933
// sendInsight should be called to send produced insights to the update status controller
3034
sendInsight sendInsightFn
3135

36+
appsClient appsv1client.AppsV1Interface
37+
3238
// now is a function that returns the current time, used for testing
3339
now func() metav1.Time
3440
}
3541

3642
func newControlPlaneInformerController(
43+
appsClient appsv1client.AppsV1Interface,
3744
configInformers configinformers.SharedInformerFactory,
3845
recorder events.Recorder,
3946
sendInsight sendInsightFn,
4047
) factory.Controller {
4148
cpiRecorder := recorder.WithComponentSuffix("control-plane-informer")
4249

4350
c := &controlPlaneInformerController{
44-
clusterVersions: configInformers.Config().V1().ClusterVersions().Lister(),
45-
recorder: cpiRecorder,
46-
sendInsight: sendInsight,
51+
clusterVersions: configInformers.Config().V1().ClusterVersions().Lister(),
52+
clusterOperators: configInformers.Config().V1().ClusterOperators().Lister(),
53+
recorder: cpiRecorder,
54+
sendInsight: sendInsight,
55+
appsClient: appsClient,
4756

4857
now: metav1.Now,
4958
}
5059

5160
cvInformer := configInformers.Config().V1().ClusterVersions().Informer()
61+
coInformer := configInformers.Config().V1().ClusterOperators().Informer()
5262

5363
controller := factory.New().
5464
// call sync on ClusterVersion changes
5565
WithInformersQueueKeysFunc(configApiQueueKeys, cvInformer).
66+
// call sync on ClusterOperator changes with a filter
67+
WithFilteredEventsInformersQueueKeysFunc(configApiQueueKeys, clusterOperatorEventFilterFunc, coInformer).
5668
WithSync(c.sync).
5769
ToController("ControlPlaneInformer", c.recorder)
5870

5971
return controller
6072
}
6173

74+
func clusterOperatorEventFilterFunc(obj interface{}) bool {
75+
co, ok := obj.(*configv1.ClusterOperator)
76+
if ok {
77+
for annotation := range co.Annotations {
78+
if strings.HasPrefix(annotation, "exclude.release.openshift.io/") ||
79+
strings.HasPrefix(annotation, "include.release.openshift.io/") {
80+
return true
81+
}
82+
}
83+
}
84+
return false
85+
}
86+
87+
// Kind names used as the first component of the "<kind>/<name>" queue keys.
const (
	// clusterVersionKindName prefixes queue keys of ClusterVersion resources
	clusterVersionKindName = "ClusterVersion"
	// clusterOperatorKindName prefixes queue keys of ClusterOperator resources
	clusterOperatorKindName = "ClusterOperator"
)
91+
6292
// sync is called for any controller event. It will assess the state and health of the control plane, indicated by
6393
// the changed resource (ClusterVersion), produce insights, and send them to the update status controller. Status
6494
// insights are not stored between calls, so every call produces a fresh insight. This means some fields do not follow
6595
// conventions, like LastTransitionTime in the Updating condition. Proper continuous insight maintenance will need to
6696
// be added later (not yet sure whether on consumer or producer side).
67-
func (c *controlPlaneInformerController) sync(_ context.Context, syncCtx factory.SyncContext) error {
97+
func (c *controlPlaneInformerController) sync(ctx context.Context, syncCtx factory.SyncContext) error {
6898
queueKey := syncCtx.QueueKey()
6999

70-
clusterVersion, err := c.clusterVersions.Get(queueKey)
100+
t, name, err := parseQueueKey(queueKey)
71101
if err != nil {
72-
if errors.IsNotFound(err) {
73-
// TODO: Handle deletes by deleting the status insight
74-
return nil
75-
}
76-
return err
102+
return fmt.Errorf("failed to parse queue key: %w", err)
77103
}
78104

79-
now := c.now()
80-
insight := assessClusterVersion(clusterVersion, now)
81-
msg := makeInsightMsgForClusterVersion(insight, now)
105+
var msg informerMsg
106+
switch t {
107+
case clusterVersionKindName:
108+
clusterVersion, err := c.clusterVersions.Get(name)
109+
if err != nil {
110+
if kerrors.IsNotFound(err) {
111+
// TODO: Handle deletes by deleting the status insight
112+
return nil
113+
}
114+
return err
115+
}
116+
117+
now := c.now()
118+
insight := assessClusterVersion(clusterVersion, now)
119+
msg = makeInsightMsgForClusterVersion(insight, now)
120+
121+
case clusterOperatorKindName:
122+
clusterVersion, err := c.clusterVersions.Get("version")
123+
if err != nil {
124+
return err
125+
}
126+
targetVersion := clusterVersion.Status.Desired.Version
127+
128+
clusterOperator, err := c.clusterOperators.Get(name)
129+
if err != nil {
130+
if kerrors.IsNotFound(err) {
131+
// TODO: Handle deletes by deleting the status insight
132+
return nil
133+
}
134+
return err
135+
}
136+
137+
now := c.now()
138+
insight, err := assessClusterOperator(ctx, clusterOperator, targetVersion, c.appsClient, now)
139+
if err != nil {
140+
return fmt.Errorf("failed to assess cluster operator %s: %w", name, err)
141+
}
142+
msg = makeInsightMsgForClusterOperator(insight, now)
143+
default:
144+
return fmt.Errorf("invalid queue key %s with unexpected type %s", queueKey, t)
145+
}
82146
var msgForLog string
83147
if klog.V(4).Enabled() {
84148
msgForLog = fmt.Sprintf(" | msg=%s", string(msg.insight))
85149
}
86-
klog.V(2).Infof("CPI :: Syncing ClusterVersion %s%s", clusterVersion.Name, msgForLog)
150+
klog.V(2).Infof("CPI :: Syncing %s %s%s", t, name, msgForLog)
87151
c.sendInsight(msg)
88152

89153
return nil
90154
}
91155

156+
func makeInsightMsgForClusterOperator(coInsight *ClusterOperatorStatusInsight, acquiredAt metav1.Time) informerMsg {
157+
uid := fmt.Sprintf("usc-co-%s", coInsight.Name)
158+
insight := Insight{
159+
UID: uid,
160+
AcquiredAt: acquiredAt,
161+
InsightUnion: InsightUnion{
162+
Type: ClusterOperatorStatusInsightType,
163+
ClusterOperatorStatusInsight: coInsight,
164+
},
165+
}
166+
// Should handle errors, but ultimately we will have a proper API and won’t need to serialize ourselves
167+
rawInsight, _ := yaml.Marshal(insight)
168+
return informerMsg{
169+
uid: uid,
170+
insight: rawInsight,
171+
}
172+
}
173+
174+
func assessClusterOperator(ctx context.Context, operator *configv1.ClusterOperator, targetVersion string, appsClient appsv1client.AppsV1Interface, now metav1.Time) (*ClusterOperatorStatusInsight, error) {
175+
updating := metav1.Condition{
176+
Type: string(ClusterOperatorStatusInsightUpdating),
177+
Status: metav1.ConditionUnknown,
178+
Reason: string(ClusterOperatorUpdatingCannotDetermine),
179+
LastTransitionTime: now,
180+
}
181+
182+
imagePullSpec, err := getImagePullSpec(ctx, operator.Name, appsClient)
183+
if err != nil && !errors.Is(err, operatorImageNotImplemented) {
184+
return nil, err
185+
}
186+
187+
noOperatorImageVersion := true
188+
var operatorImageUpdated, versionUpdated bool
189+
for _, version := range operator.Status.Versions {
190+
if version.Name == "operator-image" {
191+
noOperatorImageVersion = false
192+
if imagePullSpec != "" && imagePullSpec == version.Version {
193+
operatorImageUpdated = true
194+
}
195+
}
196+
if version.Name == "operator" && version.Version == targetVersion {
197+
versionUpdated = true
198+
}
199+
}
200+
201+
// "operator-image" might not be implemented by every cluster operator
202+
updated := (noOperatorImageVersion || operatorImageUpdated) && versionUpdated
203+
if updated {
204+
updating.Status = metav1.ConditionFalse
205+
updating.Reason = string(ClusterOperatorUpdatingReasonUpdated)
206+
}
207+
208+
var available *configv1.ClusterOperatorStatusCondition
209+
var degraded *configv1.ClusterOperatorStatusCondition
210+
var progressing *configv1.ClusterOperatorStatusCondition
211+
212+
for _, condition := range operator.Status.Conditions {
213+
condition := condition
214+
switch {
215+
case condition.Type == configv1.OperatorAvailable:
216+
available = &condition
217+
case condition.Type == configv1.OperatorDegraded:
218+
degraded = &condition
219+
case condition.Type == configv1.OperatorProgressing:
220+
progressing = &condition
221+
}
222+
}
223+
224+
if !updated && progressing != nil {
225+
if progressing.Status == configv1.ConditionTrue {
226+
updating.Status = metav1.ConditionTrue
227+
updating.Reason = string(ClusterOperatorUpdatingReasonProgressing)
228+
updating.Message = progressing.Message
229+
}
230+
if progressing.Status == configv1.ConditionFalse {
231+
updating.Status = metav1.ConditionFalse
232+
updating.Reason = string(ClusterOperatorUpdatingReasonPending)
233+
updating.Message = progressing.Message
234+
}
235+
}
236+
237+
health := metav1.Condition{
238+
Type: string(ClusterOperatorStatusInsightHealthy),
239+
Status: metav1.ConditionTrue,
240+
Reason: string(ClusterOperatorHealthyReasonAsExpected),
241+
LastTransitionTime: now,
242+
}
243+
244+
if available == nil {
245+
health.Status = metav1.ConditionUnknown
246+
health.Reason = string(ClusterOperatorHealthyReasonUnavailable)
247+
health.Message = "The cluster operator is unavailable because the available condition is not found in the cluster operator's status"
248+
} else if available.Status != configv1.ConditionTrue {
249+
health.Status = metav1.ConditionFalse
250+
health.Reason = string(ClusterOperatorHealthyReasonUnavailable)
251+
health.Message = available.Message
252+
} else if degraded != nil && degraded.Status == configv1.ConditionTrue {
253+
health.Status = metav1.ConditionFalse
254+
health.Reason = string(ClusterOperatorHealthyReasonDegraded)
255+
health.Message = degraded.Message
256+
}
257+
258+
return &ClusterOperatorStatusInsight{
259+
Name: operator.Name,
260+
Resource: ResourceRef{
261+
Resource: "clusteroperators",
262+
Group: configv1.GroupName,
263+
Name: operator.Name,
264+
},
265+
Conditions: []metav1.Condition{updating, health},
266+
}, nil
267+
}
268+
269+
var operatorImageNotImplemented = errors.New("operator-image not implemented in the versions from cluster operator's status")
270+
271+
func getImagePullSpec(ctx context.Context, name string, appsClient appsv1client.AppsV1Interface) (string, error) {
272+
// It is known that the image pull spec for co/machine-config can be accessed from the deployment
273+
if name == "machine-config" {
274+
if appsClient == nil {
275+
return "", errors.New("apps client is nil")
276+
}
277+
mcoDeployment, err := appsClient.Deployments("openshift-machine-config-operator").Get(ctx, "machine-config-operator", metav1.GetOptions{})
278+
if err != nil {
279+
return "", err
280+
}
281+
for _, c := range mcoDeployment.Spec.Template.Spec.Containers {
282+
if c.Name == "machine-config-operator" {
283+
return c.Image, nil
284+
}
285+
}
286+
return "", errors.New("machine-config-operator container not found")
287+
}
288+
// We may add here retrieval of the image pull spec for other COs when they implement "operator-image" in the status.versions
289+
return "", operatorImageNotImplemented
290+
}
291+
92292
// makeInsightMsgForClusterVersion creates an informerMsg for the given ClusterVersionStatusInsight. It defines an uid
93293
// name and serializes the insight as YAML. Serialization is convenient because it prevents any data sharing issues
94294
// between controllers.
@@ -270,16 +470,27 @@ func versionsFromHistory(history []configv1.UpdateHistory) ControlPlaneUpdateVer
270470
return versions
271471
}
272472

473+
// parseQueueKey splits a queue key of the form "<kind>/<name>" into its kind and name. It returns an error unless
// the key consists of exactly two non-empty parts separated by a single "/", so malformed keys such as "/name" or
// "Kind/" are reported instead of being passed on with an empty component.
func parseQueueKey(queueKey string) (string, string, error) {
	parts := strings.Split(queueKey, "/")
	if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
		return "", "", fmt.Errorf("invalid queue key: %s", queueKey)
	}
	return parts[0], parts[1], nil
}
480+
273481
func configApiQueueKeys(object runtime.Object) []string {
274482
if object == nil {
275483
return nil
276484
}
277485

278486
switch o := object.(type) {
279487
case *configv1.ClusterVersion:
280-
return []string{o.Name}
488+
return []string{fmt.Sprintf("%s/%s", clusterVersionKindName, o.Name)}
489+
case *configv1.ClusterOperator:
490+
return []string{fmt.Sprintf("%s/%s", clusterOperatorKindName, o.Name)}
281491
}
282492

283-
klog.Fatalf("USC :: Unknown object type: %T", object)
284-
return nil
493+
msg := fmt.Sprintf("USC :: Unknown object type: %T", object)
494+
klog.Error(msg)
495+
panic(msg)
285496
}

0 commit comments

Comments
 (0)