
Commit c06c08e

Merge pull request kubernetes#71828 from yuexiao-wang/cleanup-upgrad-etcd-left
kubeadm: fixed cleanup upgrade from no-TLS etcd to TLS etcd
2 parents f62b530 + 39f7124 commit c06c08e
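
At a glance, the change is an API cleanup: the initial-delay parameter is dropped from the etcd client's availability check, so callers pass only a retry count and a retry interval. The old and new signatures of the ClusterInterrogator method, exactly as they appear in the diff below:

-WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error)
+WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error)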

4 files changed: 10 additions, 34 deletions

cmd/kubeadm/app/phases/upgrade/compute_test.go

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ type fakeEtcdClient struct {
 
 func (f fakeEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
 
-func (f fakeEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (f fakeEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }
 

cmd/kubeadm/app/phases/upgrade/staticpods.go

Lines changed: 4 additions & 26 deletions
@@ -174,20 +174,6 @@ func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticP
 		recoverEtcd = true
 	}
 
-	// We currently depend on getting the Etcd mirror Pod hash from the KubeAPIServer;
-	// Upgrading the Etcd protocol takes down the apiserver, so we can't verify component restarts if we restart Etcd independently.
-	// Skip waiting for Etcd to restart and immediately move on to updating the apiserver.
-	if component == constants.Etcd {
-		waitForComponentRestart = false
-	}
-	// Normally, if an Etcd upgrade is successful, but the apiserver upgrade fails, Etcd is not rolled back.
-	// In the case of a TLS upgrade, the old KubeAPIServer config is incompatible with the new Etcd confg, so we rollback Etcd
-	// if the APIServer upgrade fails.
-	if component == constants.KubeAPIServer {
-		recoverEtcd = true
-		fmt.Printf("[upgrade/staticpods] The %s manifest will be restored if component %q fails to upgrade\n", constants.Etcd, component)
-	}
-
 	if err := renewCerts(cfg, component); err != nil {
 		return errors.Wrapf(err, "failed to renew certificates for component %q", component)
 	}
@@ -311,14 +297,6 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 		return true, errors.Wrap(err, "error creating local etcd static pod manifest file")
 	}
 
-	// Waiter configurations for checking etcd status
-	// If we are upgrading TLS we need to wait for old static pod to be removed.
-	// This is needed because we are not able to currently verify that the static pod
-	// has been updated through the apiserver across an etcd TLS upgrade.
-	// This value is arbitrary but seems to be long enough in manual testing.
-	noDelay := 0 * time.Second
-	podRestartDelay := 30 * time.Second
-
 	retries := 10
 	retryInterval := 15 * time.Second
 
@@ -328,7 +306,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 		// Since upgrade component failed, the old etcd manifest has either been restored or was never touched
 		// Now we need to check the health of etcd cluster if it is up with old manifest
 		fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-		if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+		if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 			fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
 
 			// At this point we know that etcd cluster is dead and it is safe to copy backup datastore and to rollback old etcd manifest
@@ -341,7 +319,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 
 			// Now that we've rolled back the data, let's check if the cluster comes up
 			fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-			if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+			if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
				fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
				// Nothing else left to try to recover etcd cluster
				return true, errors.Wrapf(err, "fatal error rolling back local etcd cluster manifest, the backup of etcd database is stored here:(%s)", backupEtcdDir)
@@ -366,7 +344,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 
 	// Checking health state of etcd after the upgrade
 	fmt.Println("[upgrade/etcd] Waiting for etcd to become available")
-	if _, err = newEtcdClient.WaitForClusterAvailable(podRestartDelay, retries, retryInterval); err != nil {
+	if _, err = newEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 		fmt.Printf("[upgrade/etcd] Failed to healthcheck etcd: %v\n", err)
 		// Despite the fact that upgradeComponent was successful, there is something wrong with the etcd cluster
 		// First step is to restore back up of datastore
@@ -384,7 +362,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 
 	// Assuming rollback of the old etcd manifest was successful, check the status of etcd cluster again
 	fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-	if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+	if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 		fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
 		// Nothing else left to try to recover etcd cluster
 		return true, errors.Wrapf(err, "fatal error rolling back local etcd cluster manifest, the backup of etcd database is stored here:(%s)", backupEtcdDir)
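
With the delay gone, every health check in performEtcdStaticPodUpgrade is driven by the same two knobs. A minimal, self-contained sketch of the new calling convention, using a trivial fake in the spirit of the test files below (the fake type and the main function are illustrative, not part of the commit):

package main

import (
	"fmt"
	"time"
)

// fakeEtcdClient mirrors the test fakes updated by this commit; only the
// method needed for the example is implemented, not the full
// ClusterInterrogator interface from cmd/kubeadm/app/util/etcd.
type fakeEtcdClient struct{}

func (f fakeEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
	return true, nil
}

func main() {
	client := fakeEtcdClient{}

	// Same retry settings performEtcdStaticPodUpgrade uses after this change.
	retries := 10
	retryInterval := 15 * time.Second

	if _, err := client.WaitForClusterAvailable(retries, retryInterval); err != nil {
		fmt.Printf("[upgrade/etcd] Failed to healthcheck etcd: %v\n", err)
		return
	}
	fmt.Println("[upgrade/etcd] etcd is available")
}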

cmd/kubeadm/app/phases/upgrade/staticpods_test.go

Lines changed: 2 additions & 2 deletions
@@ -230,7 +230,7 @@ type fakeTLSEtcdClient struct{ TLS bool }
 
 func (c fakeTLSEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
 
-func (c fakeTLSEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (c fakeTLSEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }
 
@@ -261,7 +261,7 @@ type fakePodManifestEtcdClient struct{ ManifestDir, CertificatesDir string }
 
 func (c fakePodManifestEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
 
-func (c fakePodManifestEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (c fakePodManifestEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }
 

cmd/kubeadm/app/util/etcd/etcd.go

Lines changed: 3 additions & 5 deletions
@@ -43,7 +43,7 @@ type ClusterInterrogator interface {
 	GetClusterStatus() (map[string]*clientv3.StatusResponse, error)
 	GetClusterVersions() (map[string]string, error)
 	GetVersion() (string, error)
-	WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error)
+	WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error)
 	Sync() error
 	AddMember(name string, peerAddrs string) ([]Member, error)
 }
@@ -328,10 +328,8 @@ func (c Client) GetClusterStatus() (map[string]*clientv3.StatusResponse, error)
 	return clusterStatus, nil
 }
 
-// WaitForClusterAvailable returns true if all endpoints in the cluster are available after an initial delay and retry attempts, an error is returned otherwise
-func (c Client) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
-	fmt.Printf("[util/etcd] Waiting %v for initial delay\n", delay)
-	time.Sleep(delay)
+// WaitForClusterAvailable returns true if all endpoints in the cluster are available after retry attempts, an error is returned otherwise
+func (c Client) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	for i := 0; i < retries; i++ {
 		if i > 0 {
 			fmt.Printf("[util/etcd] Waiting %v until next retry\n", retryInterval)
