Commit e64f453
test/e2e/upgrade/monitor: --abort-at=100 requires a complete update
There's not much difference between "the machine-config ClusterOperator has finished updating, so 100% of operators have updated" and "ClusterVersion claims the update complete". The manifests the cluster-version operator reconciles after the machine-config ClusterOperator are mostly related to monitoring and resource-deletion:

  $ oc adm release extract --to manifests quay.io/openshift-release-dev/ocp-release:4.15.0-ec.2-x86_64
  $ grep -r10 'name: machine-config' manifests | grep 'kind: ClusterOperator'
  manifests/0000_80_machine-config-operator_06_clusteroperator.yaml-kind: ClusterOperator
  $ ls manifests/0000_8* | sort | tail -n1
  manifests/0000_80_machine-config-operator_06_clusteroperator.yaml
  $ for X in manifests/0000_9*; do yaml2json < "${X}" | jq -c '.[] | {deletion: .metadata.annotations["release.openshift.io/delete"], kind, dashboard: .metadata.labels["console.openshift.io/dashboard"]}'; done | sort | uniq -c
        2 {"deletion":null,"kind":"ClusterRoleBinding","dashboard":null}
        2 {"deletion":null,"kind":"ClusterRole","dashboard":null}
        1 {"deletion":null,"kind":"ConfigMap","dashboard":null}
       15 {"deletion":null,"kind":"ConfigMap","dashboard":"true"}  # these ConfigMaps are console.openshift.io/dashboard, so a monitoring-related resource
        1 {"deletion":null,"kind":"OperatorGroup","dashboard":null}
       13 {"deletion":null,"kind":"PrometheusRule","dashboard":null}  # PrometheusRules are monitoring-related resources (alerts and recording rules)
       25 {"deletion":null,"kind":"RoleBinding","dashboard":null}
       25 {"deletion":null,"kind":"Role","dashboard":null}
        1 {"deletion":null,"kind":"Service","dashboard":null}
       34 {"deletion":null,"kind":"ServiceMonitor","dashboard":null}  # ServiceMonitors are monitoring-related resources (what Prometheus should scrape)
        2 {"deletion":null,"kind":"StorageVersionMigration","dashboard":null}
        1 {"deletion":"true","kind":"ClusterRoleBinding","dashboard":null}  # this and later are release.openshift.io/delete manifests
        1 {"deletion":"true","kind":"ConfigMap","dashboard":null}
        1 {"deletion":"true","kind":"ConfigMap","dashboard":"true"}
        2 {"deletion":"true","kind":"PrometheusRule","dashboard":null}
  $ for X in manifests/0000_9*; do yaml2json < "${X}" | jq -c '.[] | select(.metadata.annotations["release.openshift.io/delete"] == null and (.kind | contains("Role") or . == "Service")) | {kind, name: .metadata.name}'; done | sort | uniq -c
        1 {"kind":"ClusterRoleBinding","name":"prometheus-k8s-scheduler-resources"}  # these ClusterRoles and ClusterRoleBindings mention either Prometheus or monitoring in their names, so monitoring-related resources
        1 {"kind":"ClusterRoleBinding","name":"registry-monitoring"}
        1 {"kind":"ClusterRole","name":"prometheus-k8s-scheduler-resources"}
        1 {"kind":"ClusterRole","name":"registry-monitoring"}
        1 {"kind":"RoleBinding","name":"operator-lifecycle-manager-metrics"}  # these Roles and RoleBindings grant Prometheus authority to scrape the Service(Monitor)s, so monitoring-related resources
        1 {"kind":"RoleBinding","name":"prometheus"}
       23 {"kind":"RoleBinding","name":"prometheus-k8s"}
        1 {"kind":"Role","name":"operator-lifecycle-manager-metrics"}
        1 {"kind":"Role","name":"prometheus"}
       23 {"kind":"Role","name":"prometheus-k8s"}
        1 {"kind":"Service","name":"check-endpoints"}  # not related to Prometheus scraping, but still monitoring
  $ for X in manifests/0000_9*; do yaml2json < "${X}" | jq -c '.[] | .kind as $k | select((.kind | . == "ConfigMap" or . == "OperatorGroup") and .metadata.annotations["release.openshift.io/delete"] == null and .metadata.labels["console.openshift.io/dashboard"] == null).metadata | {kind: $k, namespace, name}'; done
  {"kind":"OperatorGroup","namespace":"openshift-monitoring","name":"openshift-cluster-monitoring"}  # monitoring-related resource
  {"kind":"ConfigMap","namespace":"openshift-config-managed","name":"release-verification"}  # https://github.com/openshift/cluster-update-keys/blob/804dfeceb7caa41a6212fb405bbcd3f3139b8a6f/manifests.rhel/0000_90_cluster-update-keys_configmap.yaml#L67

So outside of monitoring and resource-deletion, the only manifest here is cluster-update-keys' release-verification ConfigMap, where the in-cluster representation is just informative, and the cluster-version operator uses its local manifest copy (not the in-cluster resource) to perform verification [1].

Completing the update (which this commit delivers) is worthwhile, because these monitoring resources are quick to reconcile, and it is comforting to see the deletions complete and to know that the cluster can happily roll back even with a 'Completed' entry in its history.

[1]: https://github.com/openshift/cluster-version-operator/blob/1f35807ba567b01f289b68e2bc3cbd2f2ff0eed3/pkg/cvo/cvo.go#L364
1 parent 0a65559 commit e64f453

File tree

3 files changed (+17, -3 lines)


pkg/cmd/openshift-tests/run-upgrade/command.go

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,8 @@ func NewRunUpgradeCommand(streams genericclioptions.IOStreams) *cobra.Command {
 Supported options:
 
 * abort-at=NUMBER - Set to a number between 0 and 100 to control the percent of operators
-at which to stop the current upgrade and roll back to the current version.
+at which to stop the current upgrade and roll back to the current version (100 to require a
+complete update).
 * disrupt-reboot=POLICY - During upgrades, periodically reboot master nodes. If set to 'graceful'
 the reboot will allow the node to shut down services in an orderly fashion. If set to 'force' the
 machine will terminate immediately without clean shutdown.
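
Not part of this commit, but as a hedged sketch of the option's contract: abort-at is a whole-number percentage, 0 disables the abort, and with this change 100 is a meaningful value rather than a near no-op. Helper names below are hypothetical:

  import (
      "fmt"
      "strconv"
  )

  // parseAbortAt is a hypothetical validator for the abort-at=NUMBER option: it
  // accepts integers in [0, 100]; 0 means never abort, and 100 now means "abort
  // (and roll back) only after ClusterVersion reports the update Completed".
  func parseAbortAt(value string) (int, error) {
      abortAt, err := strconv.Atoi(value)
      if err != nil {
          return 0, fmt.Errorf("abort-at must be an integer: %w", err)
      }
      if abortAt < 0 || abortAt > 100 {
          return 0, fmt.Errorf("abort-at must be between 0 and 100, got %d", abortAt)
      }
      return abortAt, nil
  }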

test/e2e/upgrade/monitor.go

Lines changed: 14 additions & 1 deletion
@@ -107,10 +107,23 @@ func (m *versionMonitor) ShouldReboot() []string {
 	return nil
 }
 
-func (m *versionMonitor) ShouldUpgradeAbort(abortAt int) bool {
+func (m *versionMonitor) ShouldUpgradeAbort(abortAt int, desired configv1.Update) bool {
 	if abortAt == 0 {
 		return false
 	}
+
+	if abortAt == 100 {
+		if m.lastCV == nil {
+			return false // wait for m.Check() to populate a ClusterVersion
+		}
+
+		if len(m.lastCV.Status.History) == 0 {
+			return false // wait for the cluster-version operator to populate history
+		}
+		history := m.lastCV.Status.History[0]
+		return history.Image == desired.Image && history.State == configv1.CompletedUpdate
+	}
+
 	coList, err := m.client.ConfigV1().ClusterOperators().List(context.Background(), metav1.ListOptions{})
 	if err != nil {
 		framework.Logf("Unable to retrieve cluster operators, cannot check completion percentage")
test/e2e/upgrade/upgrade.go

Lines changed: 1 addition & 1 deletion
@@ -544,7 +544,7 @@ func clusterUpgrade(f *framework.Framework, c configv1client.Interface, dc dynam
 			return false, err
 		}
 
-		if !aborted && monitor.ShouldUpgradeAbort(abortAt) {
+		if !aborted && monitor.ShouldUpgradeAbort(abortAt, desired) {
 			framework.Logf("Instructing the cluster to return to %s / %s", original.Status.Desired.Version, original.Status.Desired.Image)
 			desired = configv1.Update{
 				Image: original.Status.Desired.Image,
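
With the signature change, callers now hand the monitor the Update they are steering toward, so it can compare that against the newest ClusterVersion history entry. An illustrative call with values borrowed from the release used in the commit message (the real test derives them from its target payload rather than literals):

  desired := configv1.Update{
      Version: "4.15.0-ec.2",
      Image:   "quay.io/openshift-release-dev/ocp-release:4.15.0-ec.2-x86_64",
  }
  if !aborted && monitor.ShouldUpgradeAbort(abortAt, desired) {
      // instruct the cluster to roll back to the original release
  }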
