
Commit c06c08e

Merge pull request kubernetes#71828 from yuexiao-wang/cleanup-upgrad-etcd-left
kubeadm: fixed cleanup upgrade from no-TLS etcd to TLS etcd
2 parents f62b530 + 39f7124 commit c06c08e
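
At a glance, the change is an API cleanup: the initial-delay parameter is dropped from the etcd client's availability check, so callers pass only a retry count and a retry interval. The old and new signatures of the ClusterInterrogator method, exactly as they appear in the diff below:

-WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error)
+WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error)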

4 files changed: 10 additions, 34 deletions

cmd/kubeadm/app/phases/upgrade/compute_test.go

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ type fakeEtcdClient struct {
 
 func (f fakeEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
 
-func (f fakeEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (f fakeEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }
 

cmd/kubeadm/app/phases/upgrade/staticpods.go

Lines changed: 4 additions & 26 deletions
@@ -174,20 +174,6 @@ func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticP
 		recoverEtcd = true
 	}
 
-	// We currently depend on getting the Etcd mirror Pod hash from the KubeAPIServer;
-	// Upgrading the Etcd protocol takes down the apiserver, so we can't verify component restarts if we restart Etcd independently.
-	// Skip waiting for Etcd to restart and immediately move on to updating the apiserver.
-	if component == constants.Etcd {
-		waitForComponentRestart = false
-	}
-	// Normally, if an Etcd upgrade is successful, but the apiserver upgrade fails, Etcd is not rolled back.
-	// In the case of a TLS upgrade, the old KubeAPIServer config is incompatible with the new Etcd confg, so we rollback Etcd
-	// if the APIServer upgrade fails.
-	if component == constants.KubeAPIServer {
-		recoverEtcd = true
-		fmt.Printf("[upgrade/staticpods] The %s manifest will be restored if component %q fails to upgrade\n", constants.Etcd, component)
-	}
-
 	if err := renewCerts(cfg, component); err != nil {
 		return errors.Wrapf(err, "failed to renew certificates for component %q", component)
 	}
@@ -311,14 +297,6 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 		return true, errors.Wrap(err, "error creating local etcd static pod manifest file")
 	}
 
-	// Waiter configurations for checking etcd status
-	// If we are upgrading TLS we need to wait for old static pod to be removed.
-	// This is needed because we are not able to currently verify that the static pod
-	// has been updated through the apiserver across an etcd TLS upgrade.
-	// This value is arbitrary but seems to be long enough in manual testing.
-	noDelay := 0 * time.Second
-	podRestartDelay := 30 * time.Second
-
 	retries := 10
 	retryInterval := 15 * time.Second
 
@@ -328,7 +306,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 		// Since upgrade component failed, the old etcd manifest has either been restored or was never touched
 		// Now we need to check the health of etcd cluster if it is up with old manifest
 		fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-		if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+		if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 			fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
 
 			// At this point we know that etcd cluster is dead and it is safe to copy backup datastore and to rollback old etcd manifest
@@ -341,7 +319,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 
 			// Now that we've rolled back the data, let's check if the cluster comes up
 			fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-			if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+			if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
				fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
				// Nothing else left to try to recover etcd cluster
				return true, errors.Wrapf(err, "fatal error rolling back local etcd cluster manifest, the backup of etcd database is stored here:(%s)", backupEtcdDir)
@@ -366,7 +344,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 
 	// Checking health state of etcd after the upgrade
 	fmt.Println("[upgrade/etcd] Waiting for etcd to become available")
-	if _, err = newEtcdClient.WaitForClusterAvailable(podRestartDelay, retries, retryInterval); err != nil {
+	if _, err = newEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 		fmt.Printf("[upgrade/etcd] Failed to healthcheck etcd: %v\n", err)
 		// Despite the fact that upgradeComponent was successful, there is something wrong with the etcd cluster
 		// First step is to restore back up of datastore
@@ -384,7 +362,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 
 	// Assuming rollback of the old etcd manifest was successful, check the status of etcd cluster again
 	fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-	if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+	if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 		fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
 		// Nothing else left to try to recover etcd cluster
 		return true, errors.Wrapf(err, "fatal error rolling back local etcd cluster manifest, the backup of etcd database is stored here:(%s)", backupEtcdDir)
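
With the delay gone, every health check in performEtcdStaticPodUpgrade is driven by the same two knobs. A minimal, self-contained sketch of the new calling convention, using a trivial fake in the spirit of the test files below (the fake type and the main function are illustrative, not part of the commit):

package main

import (
	"fmt"
	"time"
)

// fakeEtcdClient mirrors the test fakes updated by this commit; only the
// method needed for the example is implemented, not the full
// ClusterInterrogator interface from cmd/kubeadm/app/util/etcd.
type fakeEtcdClient struct{}

func (f fakeEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
	return true, nil
}

func main() {
	client := fakeEtcdClient{}

	// Same retry settings performEtcdStaticPodUpgrade uses after this change.
	retries := 10
	retryInterval := 15 * time.Second

	if _, err := client.WaitForClusterAvailable(retries, retryInterval); err != nil {
		fmt.Printf("[upgrade/etcd] Failed to healthcheck etcd: %v\n", err)
		return
	}
	fmt.Println("[upgrade/etcd] etcd is available")
}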

cmd/kubeadm/app/phases/upgrade/staticpods_test.go

Lines changed: 2 additions & 2 deletions
@@ -230,7 +230,7 @@ type fakeTLSEtcdClient struct{ TLS bool }
 
 func (c fakeTLSEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
 
-func (c fakeTLSEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (c fakeTLSEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }
 
@@ -261,7 +261,7 @@ type fakePodManifestEtcdClient struct{ ManifestDir, CertificatesDir string }
 
 func (c fakePodManifestEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
 
-func (c fakePodManifestEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (c fakePodManifestEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }
 

cmd/kubeadm/app/util/etcd/etcd.go

Lines changed: 3 additions & 5 deletions
@@ -43,7 +43,7 @@ type ClusterInterrogator interface {
 	GetClusterStatus() (map[string]*clientv3.StatusResponse, error)
 	GetClusterVersions() (map[string]string, error)
 	GetVersion() (string, error)
-	WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error)
+	WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error)
 	Sync() error
 	AddMember(name string, peerAddrs string) ([]Member, error)
 }
@@ -328,10 +328,8 @@ func (c Client) GetClusterStatus() (map[string]*clientv3.StatusResponse, error)
 	return clusterStatus, nil
 }
 
-// WaitForClusterAvailable returns true if all endpoints in the cluster are available after an initial delay and retry attempts, an error is returned otherwise
-func (c Client) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
-	fmt.Printf("[util/etcd] Waiting %v for initial delay\n", delay)
-	time.Sleep(delay)
+// WaitForClusterAvailable returns true if all endpoints in the cluster are available after retry attempts, an error is returned otherwise
+func (c Client) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	for i := 0; i < retries; i++ {
 		if i > 0 {
 			fmt.Printf("[util/etcd] Waiting %v until next retry\n", retryInterval)
