Skip to content

Commit 0360916

Browse files
authored
Merge pull request kubernetes#120615 from mkowalski/OCPBUGS-18641
cloud-node-lifecycle controller: add fallback for empty providerID in shutdown
2 parents ebf46ce + 60a602f commit 0360916

File tree

5 files changed

+368
-26
lines changed

5 files changed

+368
-26
lines changed

staging/src/k8s.io/cloud-provider/cloud.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ func DefaultLoadBalancerName(service *v1.Service) string {
9898
}
9999

100100
// GetInstanceProviderID builds a ProviderID for a node in a cloud.
101+
// Note that if the instance does not exist, we must return ("", cloudprovider.InstanceNotFound)
102+
// cloudprovider.InstanceNotFound should NOT be returned for instances that exist but are stopped/sleeping
101103
func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.NodeName) (string, error) {
102104
instances, ok := cloud.Instances()
103105
if !ok {
@@ -108,8 +110,11 @@ func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.
108110
if err == NotImplemented {
109111
return "", err
110112
}
113+
if err == InstanceNotFound {
114+
return "", err
115+
}
111116

112-
return "", fmt.Errorf("failed to get instance ID from cloud provider: %v", err)
117+
return "", fmt.Errorf("failed to get instance ID from cloud provider: %w", err)
113118
}
114119
return cloud.ProviderName() + "://" + instanceID, nil
115120
}

staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go

Lines changed: 44 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
152152

153153
// At this point the node has NotReady status, we need to check if the node has been removed
154154
// from the cloud provider. If node cannot be found in cloudprovider, then delete the node
155-
exists, err := ensureNodeExistsByProviderID(ctx, c.cloud, node)
155+
exists, err := c.ensureNodeExistsByProviderID(ctx, node)
156156
if err != nil {
157157
klog.Errorf("error checking if node %s exists: %v", node.Name, err)
158158
continue
@@ -180,7 +180,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
180180
// Node exists. We need to check this to get taint working in similar in all cloudproviders
181181
// current problem is that shutdown nodes are not working in similar way ie. all cloudproviders
182182
// does not delete node from kubernetes cluster when instance it is shutdown see issue #46442
183-
shutdown, err := shutdownInCloudProvider(ctx, c.cloud, node)
183+
shutdown, err := c.shutdownInCloudProvider(ctx, node)
184184
if err != nil {
185185
klog.Errorf("error checking if node %s is shutdown: %v", node.Name, err)
186186
}
@@ -196,18 +196,49 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
196196
}
197197
}
198198

199+
// getProviderID returns the provider ID for the node. If Node CR has no provider ID,
200+
// it will be the one from the cloud provider.
201+
func (c *CloudNodeLifecycleController) getProviderID(ctx context.Context, node *v1.Node) (string, error) {
202+
if node.Spec.ProviderID != "" {
203+
return node.Spec.ProviderID, nil
204+
}
205+
206+
if instanceV2, ok := c.cloud.InstancesV2(); ok {
207+
metadata, err := instanceV2.InstanceMetadata(ctx, node)
208+
if err != nil {
209+
return "", err
210+
}
211+
return metadata.ProviderID, nil
212+
}
213+
214+
providerID, err := cloudprovider.GetInstanceProviderID(ctx, c.cloud, types.NodeName(node.Name))
215+
if err != nil {
216+
return "", err
217+
}
218+
219+
return providerID, nil
220+
}
221+
199222
// shutdownInCloudProvider returns true if the node is shutdown on the cloud provider
200-
func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
201-
if instanceV2, ok := cloud.InstancesV2(); ok {
223+
func (c *CloudNodeLifecycleController) shutdownInCloudProvider(ctx context.Context, node *v1.Node) (bool, error) {
224+
if instanceV2, ok := c.cloud.InstancesV2(); ok {
202225
return instanceV2.InstanceShutdown(ctx, node)
203226
}
204227

205-
instances, ok := cloud.Instances()
228+
instances, ok := c.cloud.Instances()
206229
if !ok {
207230
return false, errors.New("cloud provider does not support instances")
208231
}
209232

210-
shutdown, err := instances.InstanceShutdownByProviderID(ctx, node.Spec.ProviderID)
233+
providerID, err := c.getProviderID(ctx, node)
234+
if err != nil {
235+
if err == cloudprovider.InstanceNotFound {
236+
return false, nil
237+
}
238+
return false, err
239+
}
240+
241+
shutdown, err := instances.InstanceShutdownByProviderID(ctx, providerID)
211242
if err == cloudprovider.NotImplemented {
212243
return false, nil
213244
}
@@ -216,32 +247,22 @@ func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface,
216247
}
217248

218249
// ensureNodeExistsByProviderID checks if the instance exists by the provider id,
219-
// If provider id in spec is empty it calls instanceId with node name to get provider id
220-
func ensureNodeExistsByProviderID(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
221-
if instanceV2, ok := cloud.InstancesV2(); ok {
250+
func (c *CloudNodeLifecycleController) ensureNodeExistsByProviderID(ctx context.Context, node *v1.Node) (bool, error) {
251+
if instanceV2, ok := c.cloud.InstancesV2(); ok {
222252
return instanceV2.InstanceExists(ctx, node)
223253
}
224254

225-
instances, ok := cloud.Instances()
255+
instances, ok := c.cloud.Instances()
226256
if !ok {
227257
return false, errors.New("instances interface not supported in the cloud provider")
228258
}
229259

230-
providerID := node.Spec.ProviderID
231-
if providerID == "" {
232-
var err error
233-
providerID, err = instances.InstanceID(ctx, types.NodeName(node.Name))
234-
if err != nil {
235-
if err == cloudprovider.InstanceNotFound {
236-
return false, nil
237-
}
238-
return false, err
239-
}
240-
241-
if providerID == "" {
242-
klog.Warningf("Cannot find valid providerID for node name %q, assuming non existence", node.Name)
260+
providerID, err := c.getProviderID(ctx, node)
261+
if err != nil {
262+
if err == cloudprovider.InstanceNotFound {
243263
return false, nil
244264
}
265+
return false, err
245266
}
246267

247268
return instances.InstanceExistsByProviderID(ctx, providerID)

0 commit comments

Comments
 (0)