Skip to content

Commit 3e59a06

Browse files
committed
kubeadm: optimize the upgrade path from ClusterStatus to annotations
When doing the very first upgrade of a cluster whose source of truth is still the ClusterStatus struct, the new kubeadm logic will first try to retrieve this information from Pod annotations.

This changeset adds a special case to both the etcd and the API server endpoint retrieval so that they do not retry when the annotations are simply missing. The logic will still retry on any unknown error, but will not retry in the following cases:

- The etcd annotations do not contain etcd endpoints, but the overall number of listed etcd pods is greater than 0. This means that we listed at least one etcd pod, but they are missing the annotation.
- The API server annotation is not found on the API server pod for a given node name, but no errors aside from that one were found. This means that the API server pod is present, but is missing the annotation.

In both cases there is no point in retrying, so failing fast speeds up the upgrade path when coming from a previously existing cluster.
1 parent b140c5d commit 3e59a06

File tree

3 files changed

+48
-10
lines changed

3 files changed

+48
-10
lines changed

cmd/kubeadm/app/util/config/cluster.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,25 @@ import (
4141
"k8s.io/kubernetes/cmd/kubeadm/app/util/apiclient"
4242
)
4343

44+
// unretriableError is an error used temporarily while we are migrating from the
45+
// ClusterStatus struct to information based on Pod annotations. When performing
46+
// the upgrade of all control plane nodes with `kubeadm upgrade apply` and
47+
// `kubeadm upgrade node` we don't want to retry as if we were hitting connectivity
48+
// issues when the pod annotation is missing on the API server pods. This error will
49+
// be used in that scenario, to fail fast and fall back to the ClusterStatus
50+
// retrieval in those cases.
51+
type unretriableError struct {
52+
err error
53+
}
54+
55+
func newUnretriableError(err error) *unretriableError {
56+
return &unretriableError{err: err}
57+
}
58+
59+
func (ue *unretriableError) Error() string {
60+
return fmt.Sprintf("unretriable error: %s", ue.err.Error())
61+
}
62+
4463
// FetchInitConfigurationFromCluster fetches configuration from a ConfigMap in the cluster
4564
func FetchInitConfigurationFromCluster(client clientset.Interface, w io.Writer, logPrefix string, newControlPlane bool) (*kubeadmapi.InitConfiguration, error) {
4665
fmt.Fprintf(w, "[%s] Reading configuration from the cluster...\n", logPrefix)
@@ -216,6 +235,13 @@ func getAPIEndpointFromPodAnnotation(client clientset.Interface, nodeName string
216235
// static pods were not yet mirrored into the API server we want to wait for this propagation.
217236
err := wait.ExponentialBackoff(backoff, func() (bool, error) {
218237
rawAPIEndpoint, lastErr = getRawAPIEndpointFromPodAnnotationWithoutRetry(client, nodeName)
238+
// TODO (ereslibre): this logic will need tweaking once we get rid of the ClusterStatus, since we won't have
239+
// the ClusterStatus safety net, we will want to remove the unretriableError and not make the distinction here
240+
// anymore.
241+
if _, ok := lastErr.(*unretriableError); ok {
242+
// Fail fast scenario, to be removed once we get rid of the ClusterStatus
243+
return true, errors.Wrapf(lastErr, "API server Pods exist, but no API endpoint annotations were found")
244+
}
219245
return lastErr == nil, nil
220246
})
221247
if err != nil {
@@ -246,7 +272,7 @@ func getRawAPIEndpointFromPodAnnotationWithoutRetry(client clientset.Interface,
246272
if apiServerEndpoint, ok := podList.Items[0].Annotations[constants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey]; ok {
247273
return apiServerEndpoint, nil
248274
}
249-
return "", errors.Errorf("API server pod for node name %q hasn't got a %q annotation, cannot retrieve API endpoint", nodeName, constants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey)
275+
return "", newUnretriableError(errors.Errorf("API server pod for node name %q hasn't got a %q annotation, cannot retrieve API endpoint", nodeName, constants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey))
250276
}
251277

252278
// TODO: remove after 1.20, when the ClusterStatus struct is removed from the kubeadm-config ConfigMap.

cmd/kubeadm/app/util/etcd/etcd.go

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -147,12 +147,20 @@ func getRawEtcdEndpointsFromPodAnnotation(client clientset.Interface, backoff wa
147147
// Let's tolerate some unexpected transient failures from the API server or load balancers. Also, if
148148
// static pods were not yet mirrored into the API server we want to wait for this propagation.
149149
err := wait.ExponentialBackoff(backoff, func() (bool, error) {
150-
if etcdEndpoints, lastErr = getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client); lastErr != nil {
150+
var overallEtcdPodCount int
151+
if etcdEndpoints, overallEtcdPodCount, lastErr = getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client); lastErr != nil {
151152
return false, nil
152153
}
153-
// If the list of etcd endpoints is empty we want to retry: this can happen if joining a secondary
154-
// control plane while the primary control plane didn't mirror its static pods yet.
155-
return len(etcdEndpoints) > 0, nil
154+
// TODO (ereslibre): this logic will need tweaking once we get rid of the ClusterStatus, since we won't have
155+
// the ClusterStatus safety net, we will have to retry in both cases.
156+
if len(etcdEndpoints) == 0 {
157+
if overallEtcdPodCount == 0 {
158+
return false, nil
159+
}
160+
// Fail fast scenario, to be removed once we get rid of the ClusterStatus
161+
return true, errors.New("etcd Pods exist, but no etcd endpoint annotations were found")
162+
}
163+
return true, nil
156164
})
157165
if err != nil {
158166
if lastErr != nil {
@@ -163,7 +171,10 @@ func getRawEtcdEndpointsFromPodAnnotation(client clientset.Interface, backoff wa
163171
return etcdEndpoints, nil
164172
}
165173

166-
func getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client clientset.Interface) ([]string, error) {
174+
// getRawEtcdEndpointsFromPodAnnotationWithoutRetry returns the list of etcd endpoints as reported by etcd Pod annotations,
175+
// along with the total number of etcd Pods that were listed. This allows callers to tell the difference between "no endpoints found",
176+
// and "no endpoints found and pods were listed", so they can skip retrying.
177+
func getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client clientset.Interface) ([]string, int, error) {
167178
klog.V(3).Infof("retrieving etcd endpoints from %q annotation in etcd Pods", constants.EtcdAdvertiseClientUrlsAnnotationKey)
168179
podList, err := client.CoreV1().Pods(metav1.NamespaceSystem).List(
169180
context.TODO(),
@@ -172,17 +183,18 @@ func getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client clientset.Interface
172183
},
173184
)
174185
if err != nil {
175-
return []string{}, err
186+
return []string{}, 0, err
176187
}
177188
etcdEndpoints := []string{}
178189
for _, pod := range podList.Items {
179190
etcdEndpoint, ok := pod.ObjectMeta.Annotations[constants.EtcdAdvertiseClientUrlsAnnotationKey]
180191
if !ok {
181-
return []string{}, errors.Errorf("etcd Pod %q is missing the %q annotation; cannot infer etcd advertise client URL", pod.ObjectMeta.Name, constants.EtcdAdvertiseClientUrlsAnnotationKey)
192+
klog.V(3).Infof("etcd Pod %q is missing the %q annotation; cannot infer etcd advertise client URL using the Pod annotation", pod.ObjectMeta.Name, constants.EtcdAdvertiseClientUrlsAnnotationKey)
193+
continue
182194
}
183195
etcdEndpoints = append(etcdEndpoints, etcdEndpoint)
184196
}
185-
return etcdEndpoints, nil
197+
return etcdEndpoints, len(podList.Items), nil
186198
}
187199

188200
// TODO: remove after 1.20, when the ClusterStatus struct is removed from the kubeadm-config ConfigMap.

cmd/kubeadm/app/util/etcd/etcd_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ func TestGetRawEtcdEndpointsFromPodAnnotationWithoutRetry(t *testing.T) {
312312
if rt.clientSetup != nil {
313313
rt.clientSetup(client)
314314
}
315-
endpoints, err := getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client)
315+
endpoints, _, err := getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client)
316316
if err != nil && !rt.expectedErr {
317317
t.Errorf("got error %v, but wasn't expecting any error", err)
318318
return

0 commit comments

Comments
 (0)