Skip to content

Commit 2bc3804

Browse files
authored
Merge pull request kubernetes#81319 from neolit123/1.16-kubeadm-upgrade-health-check
kubeadm: add an upgrade health check that deploys a Job
2 parents 036cf78 + 906d315 commit 2bc3804

File tree

3 files changed

+136
-26
lines changed

3 files changed

+136
-26
lines changed

cmd/kubeadm/app/cmd/upgrade/common.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ func enforceRequirements(flags *applyPlanFlags, dryRun bool, newK8sVersion strin
126126
}
127127

128128
// Run healthchecks against the cluster
129-
if err := upgrade.CheckClusterHealth(client, ignorePreflightErrorsSet); err != nil {
129+
if err := upgrade.CheckClusterHealth(client, &cfg.ClusterConfiguration, ignorePreflightErrorsSet); err != nil {
130130
return nil, nil, nil, errors.Wrap(err, "[upgrade/health] FATAL")
131131
}
132132

cmd/kubeadm/app/phases/upgrade/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,15 @@ go_library(
3636
"//cmd/kubeadm/app/util/etcd:go_default_library",
3737
"//cmd/kubeadm/app/util/staticpod:go_default_library",
3838
"//staging/src/k8s.io/api/apps/v1:go_default_library",
39+
"//staging/src/k8s.io/api/batch/v1:go_default_library",
3940
"//staging/src/k8s.io/api/core/v1:go_default_library",
4041
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
4142
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
4243
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
4344
"//staging/src/k8s.io/apimachinery/pkg/util/errors:go_default_library",
4445
"//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
4546
"//staging/src/k8s.io/apimachinery/pkg/util/version:go_default_library",
47+
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
4648
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
4749
"//staging/src/k8s.io/component-base/version:go_default_library",
4850
"//vendor/github.com/coredns/corefile-migration/migration:go_default_library",

cmd/kubeadm/app/phases/upgrade/health.go

Lines changed: 133 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,32 +18,39 @@ package upgrade
1818

1919
import (
2020
"fmt"
21-
"net/http"
2221
"os"
22+
"time"
2323

2424
"github.com/pkg/errors"
2525

2626
apps "k8s.io/api/apps/v1"
27-
"k8s.io/api/core/v1"
27+
batchv1 "k8s.io/api/batch/v1"
28+
v1 "k8s.io/api/core/v1"
2829
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2930
"k8s.io/apimachinery/pkg/labels"
3031
"k8s.io/apimachinery/pkg/util/sets"
32+
"k8s.io/apimachinery/pkg/util/wait"
3133
clientset "k8s.io/client-go/kubernetes"
34+
"k8s.io/klog"
35+
kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
3236
"k8s.io/kubernetes/cmd/kubeadm/app/constants"
37+
"k8s.io/kubernetes/cmd/kubeadm/app/images"
3338
"k8s.io/kubernetes/cmd/kubeadm/app/preflight"
39+
utilpointer "k8s.io/utils/pointer"
3440
)
3541

3642
// healthCheck is a helper struct for easily performing healthchecks against the cluster and printing the output
3743
type healthCheck struct {
3844
name string
3945
client clientset.Interface
40-
// f is invoked with a k8s client passed to it. Should return an optional error
41-
f func(clientset.Interface) error
46+
cfg *kubeadmapi.ClusterConfiguration
47+
// f is invoked with a k8s client and a kubeadm ClusterConfiguration passed to it. Should return an optional error
48+
f func(clientset.Interface, *kubeadmapi.ClusterConfiguration) error
4249
}
4350

4451
// Check is part of the preflight.Checker interface
4552
func (c *healthCheck) Check() (warnings, errors []error) {
46-
if err := c.f(c.client); err != nil {
53+
if err := c.f(c.client, c.cfg); err != nil {
4754
return nil, []error{err}
4855
}
4956
return nil, nil
@@ -59,49 +66,150 @@ func (c *healthCheck) Name() string {
5966
// - all control-plane Nodes are Ready
6067
// - (if self-hosted) that there are DaemonSets with at least one Pod for all control plane components
6168
// - (if static pod-hosted) that all required Static Pod manifests exist on disk
62-
func CheckClusterHealth(client clientset.Interface, ignoreChecksErrors sets.String) error {
63-
fmt.Println("[upgrade] Making sure the cluster is healthy:")
69+
func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.String) error {
70+
fmt.Println("[upgrade] Running cluster health checks")
6471

6572
healthChecks := []preflight.Checker{
6673
&healthCheck{
67-
name: "APIServerHealth",
74+
name: "CreateJob",
6875
client: client,
69-
f: apiServerHealthy,
76+
cfg: cfg,
77+
f: createJob,
7078
},
7179
&healthCheck{
7280
name: "ControlPlaneNodesReady",
7381
client: client,
7482
f: controlPlaneNodesReady,
7583
},
76-
// TODO: Add a check for ComponentStatuses here?
84+
&healthCheck{
85+
name: "StaticPodManifest",
86+
client: client,
87+
cfg: cfg,
88+
f: staticPodManifestHealth,
89+
},
7790
}
7891

79-
healthChecks = append(healthChecks, &healthCheck{
80-
name: "StaticPodManifest",
81-
client: client,
82-
f: staticPodManifestHealth,
83-
})
84-
8592
return preflight.RunChecks(healthChecks, os.Stderr, ignoreChecksErrors)
8693
}
8794

88-
// apiServerHealthy checks whether the API server's /healthz endpoint is healthy
89-
func apiServerHealthy(client clientset.Interface) error {
90-
healthStatus := 0
95+
// createJob is a check that verifies that a Job can be created in the cluster
96+
func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) {
97+
const (
98+
jobName = "upgrade-health-check"
99+
ns = metav1.NamespaceSystem
100+
timeout = 15 * time.Second
101+
)
91102

92-
// If client.Discovery().RESTClient() is nil, the fake client is used, and that means we are dry-running. Just proceed
103+
// If client.Discovery().RESTClient() is nil, the fake client is used.
104+
// Return early because the kubeadm dryrun dynamic client only handles the core/v1 GroupVersion.
93105
if client.Discovery().RESTClient() == nil {
106+
fmt.Printf("[dryrun] Would create the Job %q in namespace %q and wait until it completes\n", jobName, ns)
94107
return nil
95108
}
96-
client.Discovery().RESTClient().Get().AbsPath("/healthz").Do().StatusCode(&healthStatus)
97-
if healthStatus != http.StatusOK {
98-
return errors.Errorf("the API Server is unhealthy; /healthz didn't return %q", "ok")
109+
110+
// Prepare Job
111+
job := &batchv1.Job{
112+
ObjectMeta: metav1.ObjectMeta{
113+
Name: jobName,
114+
Namespace: ns,
115+
},
116+
Spec: batchv1.JobSpec{
117+
BackoffLimit: utilpointer.Int32Ptr(0),
118+
Template: v1.PodTemplateSpec{
119+
Spec: v1.PodSpec{
120+
RestartPolicy: v1.RestartPolicyNever,
121+
SecurityContext: &v1.PodSecurityContext{
122+
RunAsUser: utilpointer.Int64Ptr(999),
123+
RunAsGroup: utilpointer.Int64Ptr(999),
124+
RunAsNonRoot: utilpointer.BoolPtr(true),
125+
},
126+
Tolerations: []v1.Toleration{
127+
{
128+
Key: "node-role.kubernetes.io/master",
129+
Effect: v1.TaintEffectNoSchedule,
130+
},
131+
},
132+
Containers: []v1.Container{
133+
{
134+
Name: jobName,
135+
Image: images.GetPauseImage(cfg),
136+
Args: []string{"-v"},
137+
},
138+
},
139+
},
140+
},
141+
},
142+
}
143+
144+
// Check if the Job already exists and delete it
145+
if _, err := client.BatchV1().Jobs(ns).Get(jobName, metav1.GetOptions{}); err == nil {
146+
if err = deleteHealthCheckJob(client, ns, jobName); err != nil {
147+
return err
148+
}
149+
}
150+
151+
// Cleanup the Job on exit
152+
defer func() {
153+
lastError = deleteHealthCheckJob(client, ns, jobName)
154+
}()
155+
156+
// Create the Job, but retry in case it is being currently deleted
157+
klog.V(2).Infof("Creating Job %q in the namespace %q", jobName, ns)
158+
err := wait.PollImmediate(time.Second*1, timeout, func() (bool, error) {
159+
if _, err := client.BatchV1().Jobs(ns).Create(job); err != nil {
160+
klog.V(2).Infof("Could not create Job %q in the namespace %q, retrying: %v", jobName, ns, err)
161+
lastError = err
162+
return false, nil
163+
}
164+
return true, nil
165+
})
166+
if err != nil {
167+
return errors.Wrapf(lastError, "could not create Job %q in the namespace %q", jobName, ns)
168+
}
169+
170+
// Waiting and manually deleting the Job is a workaround to not enabling the TTL controller.
171+
// TODO: refactor this if the TTL controller is enabled in kubeadm once it goes Beta.
172+
173+
// Wait for the Job to complete
174+
err = wait.PollImmediate(time.Second*1, timeout, func() (bool, error) {
175+
job, err := client.BatchV1().Jobs(ns).Get(jobName, metav1.GetOptions{})
176+
if err != nil {
177+
lastError = err
178+
klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
179+
return false, nil
180+
}
181+
for _, cond := range job.Status.Conditions {
182+
if cond.Type == batchv1.JobComplete {
183+
return true, nil
184+
}
185+
}
186+
lastError = errors.Errorf("no condition of type %v", batchv1.JobComplete)
187+
klog.V(2).Infof("Job %q in the namespace %q is not yet complete, retrying", jobName, ns)
188+
return false, nil
189+
})
190+
if err != nil {
191+
return errors.Wrapf(lastError, "Job %q in the namespace %q did not complete in %v", jobName, ns, timeout)
192+
}
193+
194+
klog.V(2).Infof("Job %q in the namespace %q completed", jobName, ns)
195+
196+
return nil
197+
}
198+
199+
func deleteHealthCheckJob(client clientset.Interface, ns, jobName string) error {
200+
klog.V(2).Infof("Deleting Job %q in the namespace %q", jobName, ns)
201+
propagation := metav1.DeletePropagationForeground
202+
deleteOptions := &metav1.DeleteOptions{
203+
PropagationPolicy: &propagation,
204+
}
205+
if err := client.BatchV1().Jobs(ns).Delete(jobName, deleteOptions); err != nil {
206+
return errors.Wrapf(err, "could not delete Job %q in the namespace %q", jobName, ns)
99207
}
100208
return nil
101209
}
102210

103211
// controlPlaneNodesReady checks whether all control-plane Nodes in the cluster are in the Ready state
104-
func controlPlaneNodesReady(client clientset.Interface) error {
212+
func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
105213
selector := labels.SelectorFromSet(labels.Set(map[string]string{
106214
constants.LabelNodeRoleMaster: "",
107215
}))
@@ -124,7 +232,7 @@ func controlPlaneNodesReady(client clientset.Interface) error {
124232
}
125233

126234
// staticPodManifestHealth makes sure the required static pods are present
127-
func staticPodManifestHealth(_ clientset.Interface) error {
235+
func staticPodManifestHealth(_ clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
128236
nonExistentManifests := []string{}
129237
for _, component := range constants.ControlPlaneComponents {
130238
manifestFile := constants.GetStaticPodFilepath(component, constants.GetStaticPodDirectory())

0 commit comments

Comments
 (0)