
Commit e2be7c9

Merge pull request kubernetes#70344 from andrewsykim/consolidate-node-delete
consolidate node deletion logic between kube-controller-manager and cloud-controller-manager
2 parents 7696325 + 5329f09 commit e2be7c9

File tree

18 files changed: +750 −702 lines

cmd/cloud-controller-manager/app/controllermanager.go

Lines changed: 13 additions & 2 deletions
@@ -226,12 +226,23 @@ func startControllers(c *cloudcontrollerconfig.CompletedConfig, stop <-chan stru
 	nodeController := cloudcontrollers.NewCloudNodeController(
 		c.SharedInformers.Core().V1().Nodes(),
 		client("cloud-node-controller"), cloud,
-		c.ComponentConfig.KubeCloudShared.NodeMonitorPeriod.Duration,
 		c.ComponentConfig.NodeStatusUpdateFrequency.Duration)

-	nodeController.Run(stop)
+	go nodeController.Run(stop)
 	time.Sleep(wait.Jitter(c.ComponentConfig.Generic.ControllerStartInterval.Duration, ControllerStartJitter))

+	cloudNodeLifecycleController, err := cloudcontrollers.NewCloudNodeLifecycleController(
+		c.SharedInformers.Core().V1().Nodes(),
+		client("cloud-node-lifecycle-controller"), cloud,
+		c.ComponentConfig.KubeCloudShared.NodeMonitorPeriod.Duration,
+	)
+	if err != nil {
+		klog.Errorf("failed to start cloud node lifecycle controller: %s", err)
+	} else {
+		go cloudNodeLifecycleController.Run(stop)
+		time.Sleep(wait.Jitter(c.ComponentConfig.Generic.ControllerStartInterval.Duration, ControllerStartJitter))
+	}
+
 	// Start the PersistentVolumeLabelController
 	pvlController := cloudcontrollers.NewPersistentVolumeLabelController(client("pvl-controller"), cloud)
 	go pvlController.Run(5, stop)
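
As a point of reference, here is a minimal sketch of wiring the new controller up from a custom manager binary, using only the constructor signature visible in the hunk above (node informer, client, cloud provider, node monitor period). The helper name startNodeLifecycle, the rest.Config plumbing, and the 5-second period are illustrative assumptions, not part of this change.

// Illustrative sketch only; mirrors the call added above.
package example

import (
	"time"

	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	cloudprovider "k8s.io/cloud-provider"
	"k8s.io/klog"
	cloudcontrollers "k8s.io/kubernetes/pkg/controller/cloud"
)

func startNodeLifecycle(cfg *rest.Config, cloud cloudprovider.Interface, stop <-chan struct{}) {
	client := kubernetes.NewForConfigOrDie(cfg)
	factory := informers.NewSharedInformerFactory(client, 0)

	ctrl, err := cloudcontrollers.NewCloudNodeLifecycleController(
		factory.Core().V1().Nodes(), // node informer
		client,                      // client used to taint and delete nodes
		cloud,                       // cloud provider; must offer Instances()
		5*time.Second,               // node monitor period (illustrative value)
	)
	if err != nil {
		klog.Errorf("failed to start cloud node lifecycle controller: %v", err)
		return
	}

	factory.Start(stop) // start informers before the controller consumes them
	go ctrl.Run(stop)   // Run blocks, so launch it on its own goroutine
}

The kube-controller-manager gains an equivalent call in cmd/kube-controller-manager/app/core.go below, which is the consolidation the PR title describes.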

cmd/kube-controller-manager/app/BUILD

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ go_library(
         "//pkg/controller/certificates/cleaner:go_default_library",
         "//pkg/controller/certificates/rootcacertpublisher:go_default_library",
         "//pkg/controller/certificates/signer:go_default_library",
+        "//pkg/controller/cloud:go_default_library",
         "//pkg/controller/clusterroleaggregation:go_default_library",
         "//pkg/controller/cronjob:go_default_library",
         "//pkg/controller/daemon:go_default_library",

cmd/kube-controller-manager/app/controllermanager.go

Lines changed: 2 additions & 2 deletions
@@ -383,13 +383,13 @@ func NewControllerInitializers(loopMode ControllerLoopMode) map[string]InitFunc
 	controllers["bootstrapsigner"] = startBootstrapSignerController
 	controllers["tokencleaner"] = startTokenCleanerController
 	controllers["nodeipam"] = startNodeIpamController
+	controllers["nodelifecycle"] = startNodeLifecycleController
 	if loopMode == IncludeCloudLoops {
 		controllers["service"] = startServiceController
 		controllers["route"] = startRouteController
+		controllers["cloudnodelifecycle"] = startCloudNodeLifecycleController
 		// TODO: volume controller into the IncludeCloudLoops only set.
-		// TODO: Separate cluster in cloud check from node lifecycle controller.
 	}
-	controllers["nodelifecycle"] = startNodeLifecycleController
 	controllers["persistentvolume-binder"] = startPersistentVolumeBinderController
 	controllers["attachdetach"] = startAttachDetachController
 	controllers["persistentvolume-expander"] = startVolumeExpandController

cmd/kube-controller-manager/app/core.go

Lines changed: 19 additions & 1 deletion
@@ -38,6 +38,7 @@ import (
 	clientset "k8s.io/client-go/kubernetes"
 	csiclientset "k8s.io/csi-api/pkg/client/clientset/versioned"
 	"k8s.io/kubernetes/pkg/controller"
+	cloudcontroller "k8s.io/kubernetes/pkg/controller/cloud"
 	endpointcontroller "k8s.io/kubernetes/pkg/controller/endpoint"
 	"k8s.io/kubernetes/pkg/controller/garbagecollector"
 	namespacecontroller "k8s.io/kubernetes/pkg/controller/namespace"
@@ -125,7 +126,6 @@ func startNodeLifecycleController(ctx ControllerContext) (http.Handler, bool, er
 		ctx.InformerFactory.Core().V1().Pods(),
 		ctx.InformerFactory.Core().V1().Nodes(),
 		ctx.InformerFactory.Extensions().V1beta1().DaemonSets(),
-		ctx.Cloud,
 		ctx.ClientBuilder.ClientOrDie("node-controller"),
 		ctx.ComponentConfig.KubeCloudShared.NodeMonitorPeriod.Duration,
 		ctx.ComponentConfig.NodeLifecycleController.NodeStartupGracePeriod.Duration,
@@ -146,6 +146,24 @@ func startNodeLifecycleController(ctx ControllerContext) (http.Handler, bool, er
 	return nil, true, nil
 }

+func startCloudNodeLifecycleController(ctx ControllerContext) (http.Handler, bool, error) {
+	cloudNodeLifecycleController, err := cloudcontroller.NewCloudNodeLifecycleController(
+		ctx.InformerFactory.Core().V1().Nodes(),
+		ctx.ClientBuilder.ClientOrDie("cloud-node-lifecycle-controller"),
+		ctx.Cloud,
+		ctx.ComponentConfig.KubeCloudShared.NodeMonitorPeriod.Duration,
+	)
+	if err != nil {
+		// the controller manager should continue to run if the "Instances" interface is not
+		// supported, though it's unlikely for a cloud provider to not support it
+		klog.Errorf("failed to start cloud node lifecycle controller: %v", err)
+		return nil, false, nil
+	}
+
+	go cloudNodeLifecycleController.Run(ctx.Stop)
+	return nil, true, nil
+}
+
 func startRouteController(ctx ControllerContext) (http.Handler, bool, error) {
 	if !ctx.ComponentConfig.KubeCloudShared.AllocateNodeCIDRs || !ctx.ComponentConfig.KubeCloudShared.ConfigureCloudRoutes {
 		klog.Infof("Will not configure cloud provider routes for allocate-node-cidrs: %v, configure-cloud-routes: %v.", ctx.ComponentConfig.KubeCloudShared.AllocateNodeCIDRs, ctx.ComponentConfig.KubeCloudShared.ConfigureCloudRoutes)

pkg/controller/BUILD

Lines changed: 0 additions & 1 deletion
@@ -55,7 +55,6 @@ go_library(
         "//pkg/apis/core:go_default_library",
         "//pkg/apis/core/install:go_default_library",
         "//pkg/apis/core/validation:go_default_library",
-        "//pkg/scheduler/api:go_default_library",
         "//pkg/serviceaccount:go_default_library",
         "//pkg/util/hash:go_default_library",
         "//pkg/util/taints:go_default_library",

pkg/controller/cloud/BUILD

Lines changed: 4 additions & 2 deletions
@@ -10,14 +10,14 @@ go_library(
     name = "go_default_library",
     srcs = [
         "node_controller.go",
+        "node_lifecycle_controller.go",
         "pvlcontroller.go",
     ],
     importpath = "k8s.io/kubernetes/pkg/controller/cloud",
     deps = [
         "//pkg/api/v1/node:go_default_library",
         "//pkg/apis/core/v1/helper:go_default_library",
         "//pkg/controller:go_default_library",
-        "//pkg/controller/util/node:go_default_library",
         "//pkg/features:go_default_library",
         "//pkg/kubelet/apis:go_default_library",
         "//pkg/scheduler/api:go_default_library",
@@ -26,6 +26,7 @@ go_library(
         "//staging/src/k8s.io/api/core/v1:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
+        "//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/runtime:go_default_library",
@@ -51,6 +52,7 @@ go_test(
     name = "go_default_test",
     srcs = [
         "node_controller_test.go",
+        "node_lifecycle_controller_test.go",
         "pvlcontroller_test.go",
     ],
     embed = [":go_default_library"],
@@ -67,10 +69,10 @@ go_test(
         "//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
-        "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
         "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
         "//staging/src/k8s.io/apiserver/pkg/util/feature/testing:go_default_library",
         "//staging/src/k8s.io/client-go/informers:go_default_library",
+        "//staging/src/k8s.io/client-go/informers/core/v1:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes/fake:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes/scheme:go_default_library",
         "//staging/src/k8s.io/client-go/testing:go_default_library",

pkg/controller/cloud/node_controller.go

Lines changed: 4 additions & 118 deletions
@@ -37,9 +37,6 @@ import (
 	"k8s.io/client-go/tools/record"
 	clientretry "k8s.io/client-go/util/retry"
 	cloudprovider "k8s.io/cloud-provider"
-	nodeutilv1 "k8s.io/kubernetes/pkg/api/v1/node"
-	"k8s.io/kubernetes/pkg/controller"
-	nodectrlutil "k8s.io/kubernetes/pkg/controller/util/node"
 	kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
 	schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
 	nodeutil "k8s.io/kubernetes/pkg/util/node"
@@ -58,11 +55,6 @@ type CloudNodeController struct {

 	cloud cloudprovider.Interface

-	// Value controlling NodeController monitoring period, i.e. how often does NodeController
-	// check node status posted from kubelet. This value should be lower than nodeMonitorGracePeriod
-	// set in controller-manager
-	nodeMonitorPeriod time.Duration
-
 	nodeStatusUpdateFrequency time.Duration
 }

@@ -79,7 +71,6 @@ func NewCloudNodeController(
 	nodeInformer coreinformers.NodeInformer,
 	kubeClient clientset.Interface,
 	cloud cloudprovider.Interface,
-	nodeMonitorPeriod time.Duration,
 	nodeStatusUpdateFrequency time.Duration) *CloudNodeController {

 	eventBroadcaster := record.NewBroadcaster()
@@ -97,7 +88,6 @@
 		kubeClient:                kubeClient,
 		recorder:                  recorder,
 		cloud:                     cloud,
-		nodeMonitorPeriod:         nodeMonitorPeriod,
 		nodeStatusUpdateFrequency: nodeStatusUpdateFrequency,
 	}

@@ -111,8 +101,9 @@
 	return cnc
 }

-// This controller deletes a node if kubelet is not reporting
-// and the node is gone from the cloud provider.
+// This controller updates newly registered nodes with information
+// from the cloud provider. This call is blocking so should be called
+// via a goroutine
 func (cnc *CloudNodeController) Run(stopCh <-chan struct{}) {
 	defer utilruntime.HandleCrash()

@@ -121,10 +112,7 @@ func (cnc *CloudNodeController) Run(stopCh <-chan struct{}) {
 	// very infrequently. DO NOT MODIFY this to perform frequent operations.

 	// Start a loop to periodically update the node addresses obtained from the cloud
-	go wait.Until(cnc.UpdateNodeStatus, cnc.nodeStatusUpdateFrequency, stopCh)
-
-	// Start a loop to periodically check if any nodes have been deleted from cloudprovider
-	go wait.Until(cnc.MonitorNode, cnc.nodeMonitorPeriod, stopCh)
+	wait.Until(cnc.UpdateNodeStatus, cnc.nodeStatusUpdateFrequency, stopCh)
 }

 // UpdateNodeStatus updates the node status, such as node addresses
@@ -210,108 +198,6 @@ func (cnc *CloudNodeController) updateNodeAddress(node *v1.Node, instances cloud
 	}
 }

-// Monitor node queries the cloudprovider for non-ready nodes and deletes them
-// if they cannot be found in the cloud provider
-func (cnc *CloudNodeController) MonitorNode() {
-	instances, ok := cnc.cloud.Instances()
-	if !ok {
-		utilruntime.HandleError(fmt.Errorf("failed to get instances from cloud provider"))
-		return
-	}
-
-	nodes, err := cnc.kubeClient.CoreV1().Nodes().List(metav1.ListOptions{ResourceVersion: "0"})
-	if err != nil {
-		klog.Errorf("Error monitoring node status: %v", err)
-		return
-	}
-
-	for i := range nodes.Items {
-		var currentReadyCondition *v1.NodeCondition
-		node := &nodes.Items[i]
-		// Try to get the current node status
-		// If node status is empty, then kubelet has not posted ready status yet. In this case, process next node
-		for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
-			_, currentReadyCondition = nodeutilv1.GetNodeCondition(&node.Status, v1.NodeReady)
-			if currentReadyCondition != nil {
-				break
-			}
-			name := node.Name
-			node, err = cnc.kubeClient.CoreV1().Nodes().Get(name, metav1.GetOptions{})
-			if err != nil {
-				klog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
-				break
-			}
-			time.Sleep(retrySleepTime)
-		}
-		if currentReadyCondition == nil {
-			klog.Errorf("Update status of Node %v from CloudNodeController exceeds retry count or the Node was deleted.", node.Name)
-			continue
-		}
-		// If the known node status says that Node is NotReady, then check if the node has been removed
-		// from the cloud provider. If node cannot be found in cloudprovider, then delete the node immediately
-		if currentReadyCondition != nil {
-			if currentReadyCondition.Status != v1.ConditionTrue {
-				// we need to check this first to get taint working in similar in all cloudproviders
-				// current problem is that shutdown nodes are not working in similar way ie. all cloudproviders
-				// does not delete node from kubernetes cluster when instance it is shutdown see issue #46442
-				shutdown, err := nodectrlutil.ShutdownInCloudProvider(context.TODO(), cnc.cloud, node)
-				if err != nil {
-					klog.Errorf("Error checking if node %s is shutdown: %v", node.Name, err)
-				}
-
-				if shutdown && err == nil {
-					// if node is shutdown add shutdown taint
-					err = controller.AddOrUpdateTaintOnNode(cnc.kubeClient, node.Name, controller.ShutdownTaint)
-					if err != nil {
-						klog.Errorf("Error patching node taints: %v", err)
-					}
-					// Continue checking the remaining nodes since the current one is shutdown.
-					continue
-				}
-
-				// Check with the cloud provider to see if the node still exists. If it
-				// doesn't, delete the node immediately.
-				exists, err := ensureNodeExistsByProviderID(instances, node)
-				if err != nil {
-					klog.Errorf("Error checking if node %s exists: %v", node.Name, err)
-					continue
-				}
-
-				if exists {
-					// Continue checking the remaining nodes since the current one is fine.
-					continue
-				}
-
-				klog.V(2).Infof("Deleting node since it is no longer present in cloud provider: %s", node.Name)
-
-				ref := &v1.ObjectReference{
-					Kind:      "Node",
-					Name:      node.Name,
-					UID:       types.UID(node.UID),
-					Namespace: "",
-				}
-				klog.V(2).Infof("Recording %s event message for node %s", "DeletingNode", node.Name)
-
-				cnc.recorder.Eventf(ref, v1.EventTypeNormal, fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name), "Node %s event: %s", node.Name, "DeletingNode")
-
-				go func(nodeName string) {
-					defer utilruntime.HandleCrash()
-					if err := cnc.kubeClient.CoreV1().Nodes().Delete(nodeName, nil); err != nil {
-						klog.Errorf("unable to delete node %q: %v", nodeName, err)
-					}
-				}(node.Name)
-
-			} else {
-				// if taint exist remove taint
-				err = controller.RemoveTaintOffNode(cnc.kubeClient, node.Name, node, controller.ShutdownTaint)
-				if err != nil {
-					klog.Errorf("Error patching node taints: %v", err)
-				}
-			}
-		}
-	}
-}
-
 func (cnc *CloudNodeController) UpdateCloudNode(_, newObj interface{}) {
 	if _, ok := newObj.(*v1.Node); !ok {
 		utilruntime.HandleError(fmt.Errorf("unexpected object type: %v", newObj))
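
The monitoring logic removed above is not gone: per the commit message and the node_lifecycle_controller.go entries added to pkg/controller/cloud/BUILD, it moves into the new CloudNodeLifecycleController shared by both managers. That file is not part of this excerpt, so the snippet below is only a rough sketch of the decision the consolidated loop makes for a NotReady node, reassembled from the checks deleted above; the helper classifyNotReadyNode and the nodeAction type are invented for illustration.

// Rough sketch, not the real node_lifecycle_controller.go: for a node whose
// Ready condition is not True, taint it if the instance is shut down and
// delete it if the instance no longer exists in the cloud provider.
package sketch

import (
	"context"

	v1 "k8s.io/api/core/v1"
	cloudprovider "k8s.io/cloud-provider"
)

type nodeAction int

const (
	keepNode          nodeAction = iota // instance still exists and is running
	taintNodeShutdown                   // add the shutdown taint
	deleteNode                          // remove the Node object from the cluster
)

// classifyNotReadyNode mirrors the deleted MonitorNode checks above.
func classifyNotReadyNode(ctx context.Context, instances cloudprovider.Instances, node *v1.Node) (nodeAction, error) {
	shutdown, err := instances.InstanceShutdownByProviderID(ctx, node.Spec.ProviderID)
	if err != nil && err != cloudprovider.NotImplemented {
		return keepNode, err
	}
	if shutdown {
		return taintNodeShutdown, nil
	}

	exists, err := instances.InstanceExistsByProviderID(ctx, node.Spec.ProviderID)
	if err != nil {
		return keepNode, err
	}
	if !exists {
		return deleteNode, nil
	}
	return keepNode, nil
}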
