
Commit 3a5722f

Merge pull request #2017 from wking/oc-adm-upgrade-recommend-accept
OTA-1532: pkg/cli/admin/upgrade/recommend: New, feature-gated --accept
2 parents (49459b5 + 2d7e004), commit 3a5722f

6 files changed: +128 -45 lines changed
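The new --accept flag is only registered when its feature gate is enabled (see the recommend.go hunk below). A possible invocation, assuming the gates are switched on via environment variables of the same name (the gate plumbing itself is outside this diff), using issue names taken from the 4.16.27-degraded-monitoring example output:

  OC_ENABLE_CMD_UPGRADE_RECOMMEND=true \
  OC_ENABLE_CMD_UPGRADE_RECOMMEND_ACCEPT=true \
  oc adm upgrade recommend --version 4.16.32 --accept ClusterOperatorDown,Failing,PodDisruptionBudgetAtLimit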

pkg/cli/admin/upgrade/recommend/alerts.go

Lines changed: 50 additions & 35 deletions
@@ -18,15 +18,18 @@ import (
 // Conditions that are True for happy signals, False for sad signals,
 // and Unknown when we do not have enough information to make a
 // happy-or-sad determination.
-func (o *options) alerts(ctx context.Context) ([]metav1.Condition, error) {
+func (o *options) alerts(ctx context.Context) ([]acceptableCondition, error) {
   var alertsBytes []byte
   if o.mockData.alertsPath != "" {
     if len(o.mockData.alerts) == 0 {
-      return []metav1.Condition{{
-        Type: "recommended/Alert",
-        Status: metav1.ConditionUnknown,
-        Reason: "NoTestData",
-        Message: fmt.Sprintf("This --mock-clusterversion run did not have alert data available at %v", o.mockData.alertsPath),
+      return []acceptableCondition{{
+        Condition: metav1.Condition{
+          Type: "recommended/Alert",
+          Status: metav1.ConditionUnknown,
+          Reason: "NoTestData",
+          Message: fmt.Sprintf("This --mock-clusterversion run did not have alert data available at %v", o.mockData.alertsPath),
+        },
+        acceptanceName: "AlertNoTestData",
       }}, nil
     }
     alertsBytes = o.mockData.alerts
@@ -51,7 +54,7 @@ func (o *options) alerts(ctx context.Context) ([]metav1.Condition, error) {
     return nil, fmt.Errorf("parsing alerts: %w", err)
   }

-  var conditions []metav1.Condition
+  var conditions []acceptableCondition
   i := 0
   haveCritical := false
   havePodDisruptionBudget := false
@@ -100,11 +103,14 @@ func (o *options) alerts(ctx context.Context) ([]metav1.Condition, error) {
         // ideally the upstream PodDisruptionBudget*Limit alerts templated in the namespace and PDB, but until they do, inject those ourselves
         details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels.Namespace, alert.Labels.PodDisruptionBudget, details)
       }
-      conditions = append(conditions, metav1.Condition{
-        Type: fmt.Sprintf("recommended/CriticalAlerts/%s/%d", alertName, i),
-        Status: metav1.ConditionFalse,
-        Reason: fmt.Sprintf("Alert:%s", alert.State),
-        Message: fmt.Sprintf("%s alert %s %s, suggesting significant cluster issues worth investigating. %s", alert.Labels.Severity, alert.Labels.AlertName, alert.State, details),
+      conditions = append(conditions, acceptableCondition{
+        Condition: metav1.Condition{
+          Type: fmt.Sprintf("recommended/CriticalAlerts/%s/%d", alertName, i),
+          Status: metav1.ConditionFalse,
+          Reason: fmt.Sprintf("Alert:%s", alert.State),
+          Message: fmt.Sprintf("%s alert %s %s, suggesting significant cluster issues worth investigating. %s", alert.Labels.Severity, alert.Labels.AlertName, alert.State, details),
+        },
+        acceptanceName: alertName,
       })
       i += 1
       continue
@@ -114,75 +120,84 @@ func (o *options) alerts(ctx context.Context) ([]metav1.Condition, error) {
       havePodDisruptionBudget = true
       // ideally the upstream PodDisruptionBudget*Limit alerts templated in the namespace and PDB, but until they do, inject those ourselves
       details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels.Namespace, alert.Labels.PodDisruptionBudget, details)
-      conditions = append(conditions, metav1.Condition{
-        Type: fmt.Sprintf("recommended/PodDisruptionBudgetAlerts/%s/%d", alertName, i),
-        Status: metav1.ConditionFalse,
-        Reason: fmt.Sprintf("Alert:%s", alert.State),
-        Message: fmt.Sprintf("%s alert %s %s, which might slow node drains. %s", alert.Labels.Severity, alert.Labels.AlertName, alert.State, details),
+      conditions = append(conditions, acceptableCondition{
+        Condition: metav1.Condition{
+          Type: fmt.Sprintf("recommended/PodDisruptionBudgetAlerts/%s/%d", alertName, i),
+          Status: metav1.ConditionFalse,
+          Reason: fmt.Sprintf("Alert:%s", alert.State),
+          Message: fmt.Sprintf("%s alert %s %s, which might slow node drains. %s", alert.Labels.Severity, alert.Labels.AlertName, alert.State, details),
+        },
+        acceptanceName: alertName,
       })
       i += 1
       continue
     }

     if alertName == "KubeContainerWaiting" {
       havePullWaiting = true
-      conditions = append(conditions, metav1.Condition{
-        Type: fmt.Sprintf("recommended/PodImagePullAlerts/%s/%d", alertName, i),
-        Status: metav1.ConditionFalse,
-        Reason: fmt.Sprintf("Alert:%s", alert.State),
-        Message: fmt.Sprintf("%s alert %s %s, which may slow workload redistribution during rolling node updates. %s", alert.Labels.Severity, alert.Labels.AlertName, alert.State, details),
+      conditions = append(conditions, acceptableCondition{
+        Condition: metav1.Condition{
+          Type: fmt.Sprintf("recommended/PodImagePullAlerts/%s/%d", alertName, i),
+          Status: metav1.ConditionFalse,
+          Reason: fmt.Sprintf("Alert:%s", alert.State),
+          Message: fmt.Sprintf("%s alert %s %s, which may slow workload redistribution during rolling node updates. %s", alert.Labels.Severity, alert.Labels.AlertName, alert.State, details),
+        },
+        acceptanceName: alertName,
       })
       i += 1
       continue
     }

     if alertName == "KubeNodeNotReady" || alertName == "KubeNodeReadinessFlapping" || alertName == "KubeNodeUnreachable" {
       haveNodes = true
-      conditions = append(conditions, metav1.Condition{
-        Type: fmt.Sprintf("recommended/NodeAlerts/%s/%d", alertName, i),
-        Status: metav1.ConditionFalse,
-        Reason: fmt.Sprintf("Alert:%s", alert.State),
-        Message: fmt.Sprintf("%s alert %s %s, which may slow workload redistribution during rolling node updates. %s", alert.Labels.Severity, alert.Labels.AlertName, alert.State, details),
+      conditions = append(conditions, acceptableCondition{
+        Condition: metav1.Condition{
+          Type: fmt.Sprintf("recommended/NodeAlerts/%s/%d", alertName, i),
+          Status: metav1.ConditionFalse,
+          Reason: fmt.Sprintf("Alert:%s", alert.State),
+          Message: fmt.Sprintf("%s alert %s %s, which may slow workload redistribution during rolling node updates. %s", alert.Labels.Severity, alert.Labels.AlertName, alert.State, details),
+        },
+        acceptanceName: alertName,
       })
       i += 1
       continue
     }
   }

   if !haveCritical {
-    conditions = append(conditions, metav1.Condition{
+    conditions = append(conditions, acceptableCondition{Condition: metav1.Condition{
       Type: "recommended/CriticalAlerts",
       Status: metav1.ConditionTrue,
       Reason: "AsExpected",
       Message: "No critical alerts firing.",
-    })
+    }})
   }

   if !havePodDisruptionBudget {
-    conditions = append(conditions, metav1.Condition{
+    conditions = append(conditions, acceptableCondition{Condition: metav1.Condition{
       Type: "recommended/PodDisruptionBudgetAlerts",
       Status: metav1.ConditionTrue,
       Reason: "AsExpected",
       Message: "No PodDisruptionBudget alerts firing.",
-    })
+    }})
   }

   if !havePullWaiting {
-    conditions = append(conditions, metav1.Condition{
+    conditions = append(conditions, acceptableCondition{Condition: metav1.Condition{
       Type: "recommended/PodImagePullAlerts",
       Status: metav1.ConditionTrue,
       Reason: "AsExpected",
       Message: "No Pod container image pull alerts firing.",
-    })
+    }})
   }

   if !haveNodes {
-    conditions = append(conditions, metav1.Condition{
+    conditions = append(conditions, acceptableCondition{Condition: metav1.Condition{
       Type: "recommended/NodeAlerts",
       Status: metav1.ConditionTrue,
       Reason: "AsExpected",
       Message: "No Node alerts firing.",
-    })
+    }})
   }

   return conditions, nil

pkg/cli/admin/upgrade/recommend/examples/4.12.16-longest-not-recommended.version-4.12.51-output

Lines changed: 2 additions & 0 deletions
@@ -15,3 +15,5 @@ Reason: MultipleReasons
 Message: An unintended reversion to the default kubelet nodeStatusReportFrequency can cause significant load on the control plane. https://issues.redhat.com/browse/MCO-1094

 After rebooting into kernel-4.18.0-372.88.1.el8_6 or later, kernel nodes experience high load average and io_wait times. The nodes might fail to start or stop pods and probes may fail. Workload and host processes may become unresponsive and workload may be disrupted. https://issues.redhat.com/browse/COS-2705
+
+error: issues that apply to this cluster but which were not included in --accept: AlertNoTestData,ConditionalUpdateRisk

pkg/cli/admin/upgrade/recommend/examples/4.12.16-longest-recommended.version-4.12.51-output

Lines changed: 2 additions & 0 deletions
@@ -15,3 +15,5 @@ Reason: MultipleReasons
 Message: An unintended reversion to the default kubelet nodeStatusReportFrequency can cause significant load on the control plane. https://issues.redhat.com/browse/MCO-1094

 After rebooting into kernel-4.18.0-372.88.1.el8_6 or later, kernel nodes experience high load average and io_wait times. The nodes might fail to start or stop pods and probes may fail. Workload and host processes may become unresponsive and workload may be disrupted. https://issues.redhat.com/browse/COS-2705
+
+error: issues that apply to this cluster but which were not included in --accept: AlertNoTestData,ConditionalUpdateRisk

pkg/cli/admin/upgrade/recommend/examples/4.16.27-degraded-monitoring.version-4.16.32-output

Lines changed: 2 additions & 0 deletions
@@ -23,3 +23,5 @@ Channel: candidate-4.16 (available channels: candidate-4.16, candidate-4.17, can
 Update to 4.16.32 has no known issues relevant to this cluster.
 Image: quay.io/openshift-release-dev/ocp-release@sha256:0e71cb61694473b40e8d95f530eaf250a62616debb98199f31b4034808687dae
 Release URL: https://access.redhat.com/errata/RHSA-2025:0650
+
+error: issues that apply to this cluster but which were not included in --accept: ClusterOperatorDown,Failing,PodDisruptionBudgetAtLimit

pkg/cli/admin/upgrade/recommend/precheck.go

Lines changed: 19 additions & 5 deletions
@@ -7,14 +7,28 @@ import (
   metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )

-type check func(ctx context.Context) ([]metav1.Condition, error)
+// acceptableCondition extends metav1.Condition and a risk-acceptance
+// name, to give users a way to identify unhappy conditions they are
+// willing to accept.
+type acceptableCondition struct {
+  metav1.Condition

-// precheck runs a set of Condition-generating checkers, and
-// aggregates their results. The Conditions must use True for happy
+  // acceptanceName should be set for Status != True
+  // conditions, to give users a way to identify unhappy conditions
+  // they are willing to accept. The value does not need to be
+  // unique among conditions, for example, multiple alerts with
+  // the same name could all use the same risk acceptance name.
+  acceptanceName string
+}
+
+type check func(ctx context.Context) ([]acceptableCondition, error)
+
+// precheck runs a set of condition-generating checkers, and
+// aggregates their results. The conditions must use True for happy
 // signals, False for sad signals, and Unknown when we do not have enough
 // information to make a happy-or-sad determination.
-func (o *options) precheck(ctx context.Context) ([]metav1.Condition, error) {
-  var conditions []metav1.Condition
+func (o *options) precheck(ctx context.Context) ([]acceptableCondition, error) {
+  var conditions []acceptableCondition
   var errs []error
   for _, check := range []check{
     o.alerts,
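Because metav1.Condition is embedded in acceptableCondition, its fields are promoted, so the loop in recommend.go below can keep reading condition.Type and condition.Status unchanged while the acceptance name rides alongside. A minimal, self-contained Go sketch of that promotion (the condition values here are illustrative, not taken from a real cluster):

  package main

  import (
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  )

  type acceptableCondition struct {
    metav1.Condition
    acceptanceName string
  }

  func main() {
    c := acceptableCondition{
      Condition: metav1.Condition{
        Type:   "recommended/PodDisruptionBudgetAlerts/PodDisruptionBudgetAtLimit/0",
        Status: metav1.ConditionFalse,
        Reason: "Alert:firing",
      },
      acceptanceName: "PodDisruptionBudgetAtLimit",
    }
    // Type and Status are read through the embedded Condition; no c.Condition.Type needed.
    fmt.Println(c.Type, c.Status, c.acceptanceName)
  }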

pkg/cli/admin/upgrade/recommend/recommend.go

Lines changed: 53 additions & 5 deletions
@@ -14,6 +14,7 @@ import (

   apierrors "k8s.io/apimachinery/pkg/api/errors"
   metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+  "k8s.io/apimachinery/pkg/util/sets"
   "k8s.io/cli-runtime/pkg/genericiooptions"
   "k8s.io/client-go/rest"
   kcmdutil "k8s.io/kubectl/pkg/cmd/util"
@@ -61,6 +62,10 @@ func New(f kcmdutil.Factory, streams genericiooptions.IOStreams) *cobra.Command
   flags.BoolVar(&o.showOutdatedReleases, "show-outdated-releases", o.showOutdatedReleases, "Display additional older releases. These releases may be exposed to known issues which have been fixed in more recent releases. But all updates will contain fixes not present in your current release.")
   flags.StringVar(&o.rawVersion, "version", o.rawVersion, "Select a particular target release to display by version.")

+  if kcmdutil.FeatureGate("OC_ENABLE_CMD_UPGRADE_RECOMMEND_ACCEPT").IsEnabled() {
+    flags.StringSliceVar(&o.accept, "accept", o.accept, "Comma-delimited names for issues that you find acceptable. With --version, any unaccepted issues will result in a non-zero exit code.")
+  }
+
   // TODO: We can remove this flag once the idea about `oc adm upgrade recommend` stabilizes and the command
   // is promoted out of the OC_ENABLE_CMD_UPGRADE_RECOMMEND feature gate
   flags.StringVar(&o.mockData.cvPath, "mock-clusterversion", "", "Path to a YAML ClusterVersion object to use for testing (will be removed later).")
@@ -81,6 +86,9 @@ type options struct {
   // version is the parsed form of rawVersion. Consumers after options.Complete should prefer this property.
   version *semver.Version

+  // accept is a slice of acceptable issue names.
+  accept []string
+
   RESTConfig *rest.Config
   Client configv1client.Interface
 }
@@ -125,6 +133,9 @@ func (o *options) Complete(f kcmdutil.Factory, cmd *cobra.Command, args []string
 }

 func (o *options) Run(ctx context.Context) error {
+  issues := sets.New[string]()
+  accept := sets.New[string](o.accept...)
+
   var cv *configv1.ClusterVersion
   if cv = o.mockData.clusterVersion; cv == nil {
     var err error
@@ -139,34 +150,60 @@ func (o *options) Run(ctx context.Context) error {

   if c := findClusterOperatorStatusCondition(cv.Status.Conditions, clusterStatusFailing); c != nil {
     if c.Status != configv1.ConditionFalse {
-      fmt.Fprintf(o.Out, "%s=%s:\n\n Reason: %s\n Message: %s\n\n", c.Type, c.Status, c.Reason, strings.ReplaceAll(c.Message, "\n", "\n "))
+      acceptContext := ""
+      if accept.Has(string(clusterStatusFailing)) {
+        acceptContext = "accepted "
+      }
+      fmt.Fprintf(o.Out, "%s%s=%s:\n\n Reason: %s\n Message: %s\n\n", acceptContext, c.Type, c.Status, c.Reason, strings.ReplaceAll(c.Message, "\n", "\n "))
+      issues.Insert(string(clusterStatusFailing))
     }
   } else {
-    fmt.Fprintf(o.ErrOut, "warning: No current %s info, see `oc describe clusterversion` for more details.\n", clusterStatusFailing)
+    acceptContext := ""
+    if accept.Has(string(clusterStatusFailing)) {
+      acceptContext = "accepted "
+    }
+    fmt.Fprintf(o.ErrOut, "%swarning: No current %s info, see `oc describe clusterversion` for more details.\n", acceptContext, clusterStatusFailing)
+    issues.Insert(string(clusterStatusFailing))
   }

   if c := findClusterOperatorStatusCondition(cv.Status.Conditions, configv1.OperatorProgressing); c != nil && c.Status == configv1.ConditionTrue && len(c.Message) > 0 {
-    fmt.Fprintf(o.Out, "info: An update is in progress. You may wish to let this update complete before requesting a new update.\n %s\n\n", strings.ReplaceAll(c.Message, "\n", "\n "))
+    acceptContext := ""
+    if accept.Has(string(configv1.OperatorProgressing)) {
+      acceptContext = "accepted "
+    }
+    fmt.Fprintf(o.Out, "%sinfo: An update is in progress. You may wish to let this update complete before requesting a new update.\n %s\n\n", acceptContext, strings.ReplaceAll(c.Message, "\n", "\n "))
+    issues.Insert(string(configv1.OperatorProgressing))
   }

   if o.precheckEnabled {
     conditions, err := o.precheck(ctx)
     if err != nil {
       fmt.Fprintf(o.Out, "Failed to check for at least some preconditions: %v\n", err)
+      issues.Insert("FailedToCompletePrecheck")
     }
     var happyConditions []string
+    var acceptedConditions []string
     var unhappyConditions []string
     for _, condition := range conditions {
       if condition.Status == metav1.ConditionTrue {
         happyConditions = append(happyConditions, fmt.Sprintf("%s (%s)", condition.Type, condition.Reason))
       } else {
-        unhappyConditions = append(unhappyConditions, condition.Type)
+        issues.Insert(condition.acceptanceName)
+        if accept.Has(condition.acceptanceName) {
+          acceptedConditions = append(acceptedConditions, condition.Type)
+        } else {
+          unhappyConditions = append(unhappyConditions, condition.Type)
+        }
       }
     }
     if len(happyConditions) > 0 {
       sort.Strings(happyConditions)
       fmt.Fprintf(o.Out, "The following conditions found no cause for concern in updating this cluster to later releases: %s\n\n", strings.Join(happyConditions, ", "))
     }
+    if len(acceptedConditions) > 0 {
+      sort.Strings(acceptedConditions)
+      fmt.Fprintf(o.Out, "The following conditions found cause for concern in updating this cluster to later releases, but were explicitly accepted via --accept: %s\n\n", strings.Join(acceptedConditions, ", "))
+    }
     if len(unhappyConditions) > 0 {
       sort.Strings(unhappyConditions)
       fmt.Fprintf(o.Out, "The following conditions found cause for concern in updating this cluster to later releases: %s\n\n", strings.Join(unhappyConditions, ", "))
@@ -261,7 +298,18 @@ func (o *options) Run(ctx context.Context) error {
   if c := notRecommendedCondition(update); c == nil {
     fmt.Fprintf(o.Out, "Update to %s has no known issues relevant to this cluster.\nImage: %s\nRelease URL: %s\n", update.Release.Version, update.Release.Image, update.Release.URL)
   } else {
-    fmt.Fprintf(o.Out, "Update to %s %s=%s:\nImage: %s\nRelease URL: %s\nReason: %s\nMessage: %s\n", update.Release.Version, c.Type, c.Status, update.Release.Image, update.Release.URL, c.Reason, strings.ReplaceAll(c.Message, "\n", "\n "))
+    acceptContext := ""
+    if accept.Has("ConditionalUpdateRisk") {
+      acceptContext = "accepted "
+    }
+    fmt.Fprintf(o.Out, "Update to %s %s=%s:\nImage: %s\nRelease URL: %s\nReason: %s%s\nMessage: %s\n", update.Release.Version, c.Type, c.Status, update.Release.Image, update.Release.URL, acceptContext, c.Reason, strings.ReplaceAll(c.Message, "\n", "\n "))
+    issues.Insert("ConditionalUpdateRisk")
+  }
+  unaccepted := issues.Difference(accept)
+  if unaccepted.Len() > 0 {
+    return fmt.Errorf("issues that apply to this cluster but which were not included in --accept: %s", strings.Join(sets.List(unaccepted), ","))
+  } else if issues.Len() > 0 {
+    fmt.Fprintf(o.Out, "Update to %s has no known issues relevant to this cluster other than the accepted %s.\n", update.Release.Version, strings.Join(sets.List(issues), ","))
   }
   return nil
 }
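The non-zero exit shown in the example outputs above reduces to a set difference between the issues observed during the run and the names passed via --accept. A minimal, self-contained Go sketch of that gate, using illustrative issue names rather than values from a real cluster:

  package main

  import (
    "fmt"
    "os"
    "strings"

    "k8s.io/apimachinery/pkg/util/sets"
  )

  func main() {
    issues := sets.New[string]("Failing", "ConditionalUpdateRisk") // gathered while printing status
    accept := sets.New[string]("ConditionalUpdateRisk")            // parsed from --accept

    unaccepted := issues.Difference(accept)
    if unaccepted.Len() > 0 {
      // mirrors the error that makes --version runs exit non-zero
      fmt.Fprintf(os.Stderr, "error: issues that apply to this cluster but which were not included in --accept: %s\n", strings.Join(sets.List(unaccepted), ","))
      os.Exit(1)
    }
    fmt.Printf("no unaccepted issues; accepted: %s\n", strings.Join(sets.List(issues), ","))
  }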
