@@ -18,32 +18,39 @@ package upgrade
 
 import (
 	"fmt"
-	"net/http"
 	"os"
+	"time"
 
 	"github.com/pkg/errors"
 
 	apps "k8s.io/api/apps/v1"
-	"k8s.io/api/core/v1"
+	batchv1 "k8s.io/api/batch/v1"
+	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/apimachinery/pkg/util/wait"
 	clientset "k8s.io/client-go/kubernetes"
+	"k8s.io/klog"
+	kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
 	"k8s.io/kubernetes/cmd/kubeadm/app/constants"
+	"k8s.io/kubernetes/cmd/kubeadm/app/images"
 	"k8s.io/kubernetes/cmd/kubeadm/app/preflight"
+	utilpointer "k8s.io/utils/pointer"
 )
 
 // healthCheck is a helper struct for easily performing healthchecks against the cluster and printing the output
 type healthCheck struct {
 	name   string
 	client clientset.Interface
-	// f is invoked with a k8s client passed to it. Should return an optional error
-	f func(clientset.Interface) error
+	cfg    *kubeadmapi.ClusterConfiguration
+	// f is invoked with a k8s client and a kubeadm ClusterConfiguration passed to it. Should return an optional error
+	f func(clientset.Interface, *kubeadmapi.ClusterConfiguration) error
 }
 
 // Check is part of the preflight.Checker interface
 func (c *healthCheck) Check() (warnings, errors []error) {
-	if err := c.f(c.client); err != nil {
+	if err := c.f(c.client, c.cfg); err != nil {
 		return nil, []error{err}
 	}
 	return nil, nil
@@ -59,49 +66,150 @@ func (c *healthCheck) Name() string {
 // - all control-plane Nodes are Ready
 // - (if self-hosted) that there are DaemonSets with at least one Pod for all control plane components
 // - (if static pod-hosted) that all required Static Pod manifests exist on disk
-func CheckClusterHealth(client clientset.Interface, ignoreChecksErrors sets.String) error {
-	fmt.Println("[upgrade] Making sure the cluster is healthy:")
+func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.String) error {
+	fmt.Println("[upgrade] Running cluster health checks")
 
 	healthChecks := []preflight.Checker{
 		&healthCheck{
-			name:   "APIServerHealth",
+			name:   "CreateJob",
 			client: client,
-			f:      apiServerHealthy,
+			cfg:    cfg,
+			f:      createJob,
 		},
 		&healthCheck{
 			name:   "ControlPlaneNodesReady",
 			client: client,
 			f:      controlPlaneNodesReady,
 		},
-		// TODO: Add a check for ComponentStatuses here?
+		&healthCheck{
+			name:   "StaticPodManifest",
+			client: client,
+			cfg:    cfg,
+			f:      staticPodManifestHealth,
+		},
 	}
 
-	healthChecks = append(healthChecks, &healthCheck{
-		name:   "StaticPodManifest",
-		client: client,
-		f:      staticPodManifestHealth,
-	})
-
 	return preflight.RunChecks(healthChecks, os.Stderr, ignoreChecksErrors)
 }
 
-// apiServerHealthy checks whether the API server's /healthz endpoint is healthy
-func apiServerHealthy(client clientset.Interface) error {
-	healthStatus := 0
+// createJob is a check that verifies that a Job can be created in the cluster
+func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) {
+	const (
+		jobName = "upgrade-health-check"
+		ns      = metav1.NamespaceSystem
+		timeout = 15 * time.Second
+	)
 
-	// If client.Discovery().RESTClient() is nil, the fake client is used, and that means we are dry-running. Just proceed
+	// If client.Discovery().RESTClient() is nil, the fake client is used.
+	// Return early because the kubeadm dryrun dynamic client only handles the core/v1 GroupVersion.
 	if client.Discovery().RESTClient() == nil {
+		fmt.Printf("[dryrun] Would create the Job %q in namespace %q and wait until it completes\n", jobName, ns)
 		return nil
 	}
-	client.Discovery().RESTClient().Get().AbsPath("/healthz").Do().StatusCode(&healthStatus)
-	if healthStatus != http.StatusOK {
-		return errors.Errorf("the API Server is unhealthy; /healthz didn't return %q", "ok")
+
+	// Prepare Job
+	job := &batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      jobName,
+			Namespace: ns,
+		},
+		Spec: batchv1.JobSpec{
+			BackoffLimit: utilpointer.Int32Ptr(0),
+			Template: v1.PodTemplateSpec{
+				Spec: v1.PodSpec{
+					RestartPolicy: v1.RestartPolicyNever,
+					SecurityContext: &v1.PodSecurityContext{
+						RunAsUser:    utilpointer.Int64Ptr(999),
+						RunAsGroup:   utilpointer.Int64Ptr(999),
+						RunAsNonRoot: utilpointer.BoolPtr(true),
+					},
+					Tolerations: []v1.Toleration{
+						{
+							Key:    "node-role.kubernetes.io/master",
+							Effect: v1.TaintEffectNoSchedule,
+						},
+					},
+					Containers: []v1.Container{
+						{
+							Name:  jobName,
+							Image: images.GetPauseImage(cfg),
+							Args:  []string{"-v"},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	// Check if the Job already exists and delete it
+	if _, err := client.BatchV1().Jobs(ns).Get(jobName, metav1.GetOptions{}); err == nil {
+		if err = deleteHealthCheckJob(client, ns, jobName); err != nil {
+			return err
+		}
+	}
+
+	// Cleanup the Job on exit
+	defer func() {
+		lastError = deleteHealthCheckJob(client, ns, jobName)
+	}()
+
+	// Create the Job, but retry in case it is being currently deleted
+	klog.V(2).Infof("Creating Job %q in the namespace %q", jobName, ns)
+	err := wait.PollImmediate(time.Second*1, timeout, func() (bool, error) {
+		if _, err := client.BatchV1().Jobs(ns).Create(job); err != nil {
+			klog.V(2).Infof("Could not create Job %q in the namespace %q, retrying: %v", jobName, ns, err)
+			lastError = err
+			return false, nil
+		}
+		return true, nil
+	})
+	if err != nil {
+		return errors.Wrapf(lastError, "could not create Job %q in the namespace %q", jobName, ns)
+	}
+
+	// Waiting and manually deleting the Job is a workaround for not enabling the TTL controller.
+	// TODO: refactor this if the TTL controller is enabled in kubeadm once it goes Beta.
+
+	// Wait for the Job to complete
+	err = wait.PollImmediate(time.Second*1, timeout, func() (bool, error) {
+		job, err := client.BatchV1().Jobs(ns).Get(jobName, metav1.GetOptions{})
+		if err != nil {
+			lastError = err
+			klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
+			return false, nil
+		}
+		for _, cond := range job.Status.Conditions {
+			if cond.Type == batchv1.JobComplete {
+				return true, nil
+			}
+		}
+		lastError = errors.Errorf("no condition of type %v", batchv1.JobComplete)
+		klog.V(2).Infof("Job %q in the namespace %q is not yet complete, retrying", jobName, ns)
+		return false, nil
+	})
+	if err != nil {
+		return errors.Wrapf(lastError, "Job %q in the namespace %q did not complete in %v", jobName, ns, timeout)
+	}
+
+	klog.V(2).Infof("Job %q in the namespace %q completed", jobName, ns)
+
+	return nil
+}
+
+func deleteHealthCheckJob(client clientset.Interface, ns, jobName string) error {
+	klog.V(2).Infof("Deleting Job %q in the namespace %q", jobName, ns)
+	propagation := metav1.DeletePropagationForeground
+	deleteOptions := &metav1.DeleteOptions{
+		PropagationPolicy: &propagation,
+	}
+	if err := client.BatchV1().Jobs(ns).Delete(jobName, deleteOptions); err != nil {
+		return errors.Wrapf(err, "could not delete Job %q in the namespace %q", jobName, ns)
 	}
 	return nil
 }
 
 // controlPlaneNodesReady checks whether all control-plane Nodes in the cluster are in the Ready state
-func controlPlaneNodesReady(client clientset.Interface) error {
+func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
 	selector := labels.SelectorFromSet(labels.Set(map[string]string{
 		constants.LabelNodeRoleMaster: "",
 	}))
@@ -124,7 +232,7 @@ func controlPlaneNodesReady(client clientset.Interface) error {
 }
 
 // staticPodManifestHealth makes sure the required static pods are present
-func staticPodManifestHealth(_ clientset.Interface) error {
+func staticPodManifestHealth(_ clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
 	nonExistentManifests := []string{}
 	for _, component := range constants.ControlPlaneComponents {
 		manifestFile := constants.GetStaticPodFilepath(component, constants.GetStaticPodDirectory())