Commit e64f453
test/e2e/upgrade/monitor: --abort-at=100 requires a complete update
There's not much difference between "the machine-config ClusterOperator has finished updating, so 100% of operators have updated" and "ClusterVersion claims the update complete". The manifests the cluster-version operator reconciles after the machine-config ClusterOperator are mostly related to monitoring and resource-deletion:

  $ oc adm release extract --to manifests quay.io/openshift-release-dev/ocp-release:4.15.0-ec.2-x86_64
  $ grep -r10 'name: machine-config' manifests | grep 'kind: ClusterOperator'
  manifests/0000_80_machine-config-operator_06_clusteroperator.yaml-kind: ClusterOperator
  $ ls manifests/0000_8* | sort | tail -n1
  manifests/0000_80_machine-config-operator_06_clusteroperator.yaml
  $ for X in manifests/0000_9*; do yaml2json < "${X}" | jq -c '.[] | {deletion: .metadata.annotations["release.openshift.io/delete"], kind, dashboard: .metadata.labels["console.openshift.io/dashboard"]}'; done | sort | uniq -c
        2 {"deletion":null,"kind":"ClusterRoleBinding","dashboard":null}
        2 {"deletion":null,"kind":"ClusterRole","dashboard":null}
        1 {"deletion":null,"kind":"ConfigMap","dashboard":null}
       15 {"deletion":null,"kind":"ConfigMap","dashboard":"true"}  # these ConfigMaps are console.openshift.io/dashboard, so a monitoring-related resource
        1 {"deletion":null,"kind":"OperatorGroup","dashboard":null}
       13 {"deletion":null,"kind":"PrometheusRule","dashboard":null}  # PrometheusRules are monitoring-related resources (alerts and recording rules)
       25 {"deletion":null,"kind":"RoleBinding","dashboard":null}
       25 {"deletion":null,"kind":"Role","dashboard":null}
        1 {"deletion":null,"kind":"Service","dashboard":null}
       34 {"deletion":null,"kind":"ServiceMonitor","dashboard":null}  # ServiceMonitors are monitoring-related resources (what Prometheus should scrape)
        2 {"deletion":null,"kind":"StorageVersionMigration","dashboard":null}
        1 {"deletion":"true","kind":"ClusterRoleBinding","dashboard":null}  # this and later are release.openshift.io/delete manifests
        1 {"deletion":"true","kind":"ConfigMap","dashboard":null}
        1 {"deletion":"true","kind":"ConfigMap","dashboard":"true"}
        2 {"deletion":"true","kind":"PrometheusRule","dashboard":null}
  $ for X in manifests/0000_9*; do yaml2json < "${X}" | jq -c '.[] | select(.metadata.annotations["release.openshift.io/delete"] == null and (.kind | contains("Role") or . == "Service")) | {kind, name: .metadata.name}'; done | sort | uniq -c
        1 {"kind":"ClusterRoleBinding","name":"prometheus-k8s-scheduler-resources"}  # these ClusterRoles and ClusterRoleBindings mention either Prometheus or monitoring in their names, so monitoring-related resources
        1 {"kind":"ClusterRoleBinding","name":"registry-monitoring"}
        1 {"kind":"ClusterRole","name":"prometheus-k8s-scheduler-resources"}
        1 {"kind":"ClusterRole","name":"registry-monitoring"}
        1 {"kind":"RoleBinding","name":"operator-lifecycle-manager-metrics"}  # these Roles and RoleBindings grant Prometheus authority to scrape the Service(Monitor)s, so monitoring-related resources
        1 {"kind":"RoleBinding","name":"prometheus"}
       23 {"kind":"RoleBinding","name":"prometheus-k8s"}
        1 {"kind":"Role","name":"operator-lifecycle-manager-metrics"}
        1 {"kind":"Role","name":"prometheus"}
       23 {"kind":"Role","name":"prometheus-k8s"}
        1 {"kind":"Service","name":"check-endpoints"}  # not related to Prometheus scraping, but still monitoring
  $ for X in manifests/0000_9*; do yaml2json < "${X}" | jq -c '.[] | .kind as $k | select((.kind | . == "ConfigMap" or . == "OperatorGroup") and .metadata.annotations["release.openshift.io/delete"] == null and .metadata.labels["console.openshift.io/dashboard"] == null).metadata | {kind: $k, namespace, name}'; done
  {"kind":"OperatorGroup","namespace":"openshift-monitoring","name":"openshift-cluster-monitoring"}  # monitoring-related resource
  {"kind":"ConfigMap","namespace":"openshift-config-managed","name":"release-verification"}  # https://github.com/openshift/cluster-update-keys/blob/804dfeceb7caa41a6212fb405bbcd3f3139b8a6f/manifests.rhel/0000_90_cluster-update-keys_configmap.yaml#L67

So outside of monitoring and resource-deletion, the only manifest here is cluster-update-keys' release-verification ConfigMap, where the in-cluster representation is just informative, and the cluster-version operator uses its local manifest copy (not the in-cluster resource) to perform verification [1].

Completing the update (which this commit delivers) is worthwhile, because these monitoring resources are quick to reconcile, and it is comforting to see the deletions complete and to know that the cluster can happily roll back even with a 'Completed' entry in its history.

[1]: https://github.com/openshift/cluster-version-operator/blob/1f35807ba567b01f289b68e2bc3cbd2f2ff0eed3/pkg/cvo/cvo.go#L364
1 parent 0a65559 commit e64f453

File tree

3 files changed (+17, -3 lines)


pkg/cmd/openshift-tests/run-upgrade/command.go

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,8 @@ func NewRunUpgradeCommand(streams genericclioptions.IOStreams) *cobra.Command {
 Supported options:
 
 * abort-at=NUMBER - Set to a number between 0 and 100 to control the percent of operators
-at which to stop the current upgrade and roll back to the current version.
+at which to stop the current upgrade and roll back to the current version (100 to require a
+complete update).
 * disrupt-reboot=POLICY - During upgrades, periodically reboot master nodes. If set to 'graceful'
 the reboot will allow the node to shut down services in an orderly fashion. If set to 'force' the
 machine will terminate immediately without clean shutdown.
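
Not part of this commit, but as a hedged sketch of the option's contract: abort-at is a whole-number percentage, 0 disables the abort, and with this change 100 is a meaningful value rather than a near no-op. Helper names below are hypothetical:

  import (
      "fmt"
      "strconv"
  )

  // parseAbortAt is a hypothetical validator for the abort-at=NUMBER option: it
  // accepts integers in [0, 100]; 0 means never abort, and 100 now means "abort
  // (and roll back) only after ClusterVersion reports the update Completed".
  func parseAbortAt(value string) (int, error) {
      abortAt, err := strconv.Atoi(value)
      if err != nil {
          return 0, fmt.Errorf("abort-at must be an integer: %w", err)
      }
      if abortAt < 0 || abortAt > 100 {
          return 0, fmt.Errorf("abort-at must be between 0 and 100, got %d", abortAt)
      }
      return abortAt, nil
  }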

test/e2e/upgrade/monitor.go

Lines changed: 14 additions & 1 deletion
@@ -107,10 +107,23 @@ func (m *versionMonitor) ShouldReboot() []string {
 	return nil
 }
 
-func (m *versionMonitor) ShouldUpgradeAbort(abortAt int) bool {
+func (m *versionMonitor) ShouldUpgradeAbort(abortAt int, desired configv1.Update) bool {
 	if abortAt == 0 {
 		return false
 	}
+
+	if abortAt == 100 {
+		if m.lastCV == nil {
+			return false // wait for m.Check() to populate a ClusterVersion
+		}
+
+		if len(m.lastCV.Status.History) == 0 {
+			return false // wait for the cluster-version operator to populate history
+		}
+		history := m.lastCV.Status.History[0]
+		return history.Image == desired.Image && history.State == configv1.CompletedUpdate
+	}
+
 	coList, err := m.client.ConfigV1().ClusterOperators().List(context.Background(), metav1.ListOptions{})
 	if err != nil {
 		framework.Logf("Unable to retrieve cluster operators, cannot check completion percentage")
test/e2e/upgrade/upgrade.go

Lines changed: 1 addition & 1 deletion
@@ -544,7 +544,7 @@ func clusterUpgrade(f *framework.Framework, c configv1client.Interface, dc dynam
 			return false, err
 		}
 
-		if !aborted && monitor.ShouldUpgradeAbort(abortAt) {
+		if !aborted && monitor.ShouldUpgradeAbort(abortAt, desired) {
 			framework.Logf("Instructing the cluster to return to %s / %s", original.Status.Desired.Version, original.Status.Desired.Image)
 			desired = configv1.Update{
 				Image: original.Status.Desired.Image,
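
With the signature change, callers now hand the monitor the Update they are steering toward, so it can compare that against the newest ClusterVersion history entry. An illustrative call with values borrowed from the release used in the commit message (the real test derives them from its target payload rather than literals):

  desired := configv1.Update{
      Version: "4.15.0-ec.2",
      Image:   "quay.io/openshift-release-dev/ocp-release:4.15.0-ec.2-x86_64",
  }
  if !aborted && monitor.ShouldUpgradeAbort(abortAt, desired) {
      // instruct the cluster to roll back to the original release
  }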
