Skip to content

Commit e178cac

Browse files
authored
Merge pull request kubernetes#89589 from jsafrane/fix-node-startup
Wait for APIServer 'ok' forever during CSINode initialization during Kubelet init
2 parents 8d257ad + 8bdbd4d commit e178cac

File tree

3 files changed

+49
-18
lines changed

3 files changed

+49
-18
lines changed

pkg/volume/csi/csi_plugin.go

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,14 @@ limitations under the License.
1717
package csi
1818

1919
import (
20+
"context"
2021
"errors"
2122
"fmt"
2223
"os"
2324
"path/filepath"
2425
"strings"
2526
"time"
2627

27-
"context"
28-
2928
"k8s.io/klog"
3029

3130
api "k8s.io/api/core/v1"
@@ -227,10 +226,10 @@ func (p *csiPlugin) Init(host volume.VolumeHost) error {
227226

228227
if utilfeature.DefaultFeatureGate.Enabled(features.CSINodeInfo) &&
229228
utilfeature.DefaultFeatureGate.Enabled(features.CSIMigration) {
230-
// This function prevents Kubelet from posting Ready status until CSINodeInfo
229+
// This function prevents Kubelet from posting Ready status until CSINode
231230
// is both installed and initialized
232231
if err := initializeCSINode(host); err != nil {
233-
return errors.New(log("failed to initialize CSINodeInfo: %v", err))
232+
return errors.New(log("failed to initialize CSINode: %v", err))
234233
}
235234
}
236235

@@ -240,21 +239,28 @@ func (p *csiPlugin) Init(host volume.VolumeHost) error {
240239
func initializeCSINode(host volume.VolumeHost) error {
241240
kvh, ok := host.(volume.KubeletVolumeHost)
242241
if !ok {
243-
klog.V(4).Info("Cast from VolumeHost to KubeletVolumeHost failed. Skipping CSINodeInfo initialization, not running on kubelet")
242+
klog.V(4).Info("Cast from VolumeHost to KubeletVolumeHost failed. Skipping CSINode initialization, not running on kubelet")
244243
return nil
245244
}
246245
kubeClient := host.GetKubeClient()
247246
if kubeClient == nil {
248-
// Kubelet running in standalone mode. Skip CSINodeInfo initialization
249-
klog.Warning("Skipping CSINodeInfo initialization, kubelet running in standalone mode")
247+
// Kubelet running in standalone mode. Skip CSINode initialization
248+
klog.Warning("Skipping CSINode initialization, kubelet running in standalone mode")
250249
return nil
251250
}
252251

253-
kvh.SetKubeletError(errors.New("CSINodeInfo is not yet initialized"))
252+
kvh.SetKubeletError(errors.New("CSINode is not yet initialized"))
254253

255254
go func() {
256255
defer utilruntime.HandleCrash()
257256

257+
// First wait indefinitely to talk to Kube APIServer
258+
nodeName := host.GetNodeName()
259+
err := waitForAPIServerForever(kubeClient, nodeName)
260+
if err != nil {
261+
klog.Fatalf("Failed to initialize CSINode while waiting for API server to report ok: %v", err)
262+
}
263+
258264
// Backoff parameters tuned to retry over 140 seconds. Will fail and restart the Kubelet
259265
// after max retry steps.
260266
initBackoff := wait.Backoff{
@@ -263,12 +269,12 @@ func initializeCSINode(host volume.VolumeHost) error {
263269
Factor: 6.0,
264270
Jitter: 0.1,
265271
}
266-
err := wait.ExponentialBackoff(initBackoff, func() (bool, error) {
267-
klog.V(4).Infof("Initializing migrated drivers on CSINodeInfo")
272+
err = wait.ExponentialBackoff(initBackoff, func() (bool, error) {
273+
klog.V(4).Infof("Initializing migrated drivers on CSINode")
268274
err := nim.InitializeCSINodeWithAnnotation()
269275
if err != nil {
270-
kvh.SetKubeletError(fmt.Errorf("Failed to initialize CSINodeInfo: %v", err))
271-
klog.Errorf("Failed to initialize CSINodeInfo: %v", err)
276+
kvh.SetKubeletError(fmt.Errorf("Failed to initialize CSINode: %v", err))
277+
klog.Errorf("Failed to initialize CSINode: %v", err)
272278
return false, nil
273279
}
274280

@@ -282,7 +288,7 @@ func initializeCSINode(host volume.VolumeHost) error {
282288
// using CSI for all Migrated volume plugins. Then all the CSINode initialization
283289
// code can be dropped from Kubelet.
284290
// Kill the Kubelet process and allow it to restart to retry initialization
285-
klog.Fatalf("Failed to initialize CSINodeInfo after retrying")
291+
klog.Fatalf("Failed to initialize CSINode after retrying: %v", err)
286292
}
287293
}()
288294
return nil
@@ -914,3 +920,28 @@ func highestSupportedVersion(versions []string) (*utilversion.Version, error) {
914920
}
915921
return highestSupportedVersion, nil
916922
}
923+
924+
// waitForAPIServerForever waits forever to get a CSINode instance as a proxy
925+
// for a healthy APIServer.
//
// It repeatedly issues a Get for the CSINode named nodeName, using a one-second
// poll interval. A nil error and a NotFound error both end the wait
// successfully: either one shows that the API server was reached and answered
// the request. Any other error is remembered in lastErr, logged at verbosity 2,
// and the Get is retried.
//
// Returns nil once the API server has answered. A non-nil result is produced
// only if the poll helper itself returns an error; that error is then combined
// with the last error seen from the Get call.
//
// NOTE(review): presumably wait.PollImmediateInfinite runs the condition right
// away and never times out, as its name suggests - confirm against the wait
// package documentation.
926+
func waitForAPIServerForever(client clientset.Interface, nodeName types.NodeName) error {
927+
// lastErr is written by the poll closure so the most recent Get failure is
// still available after the poll returns.
var lastErr error
928+
err := wait.PollImmediateInfinite(time.Second, func() (bool, error) {
929+
// Get a CSINode from API server to make sure 1) kubelet can reach API server
930+
// and 2) it has enough permissions. Kubelet may have restricted permissions
931+
// when it's bootstrapping TLS.
932+
// https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet-tls-bootstrapping/
933+
_, lastErr = client.StorageV1().CSINodes().Get(context.TODO(), string(nodeName), meta.GetOptions{})
934+
// The returned object is discarded; only the error matters. NotFound is
// accepted because the CSINode does not have to exist for this check.
if lastErr == nil || apierrors.IsNotFound(lastErr) {
935+
// API server contacted
936+
return true, nil
937+
}
938+
klog.V(2).Infof("Failed to contact API server when waiting for CSINode publishing: %s", lastErr)
939+
// The failure is reported as "not done yet" with a nil error, so it is not
// handed to the poll helper (a non-nil error would presumably stop the
// poll - confirm against the wait package).
return false, nil
940+
})
941+
if err != nil {
942+
// In theory this is unreachable, but just in case:
943+
return fmt.Errorf("%v: %v", err, lastErr)
944+
}
945+
946+
return nil
947+
}

pkg/volume/csi/nodeinfomanager/nodeinfomanager.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -397,16 +397,16 @@ func (nim *nodeInfoManager) InitializeCSINodeWithAnnotation() error {
397397
return goerrors.New("error getting CSI client")
398398
}
399399

400-
var updateErrs []error
400+
var lastErr error
401401
err := wait.ExponentialBackoff(updateBackoff, func() (bool, error) {
402-
if err := nim.tryInitializeCSINodeWithAnnotation(csiKubeClient); err != nil {
403-
updateErrs = append(updateErrs, err)
402+
if lastErr = nim.tryInitializeCSINodeWithAnnotation(csiKubeClient); lastErr != nil {
403+
klog.V(2).Infof("Failed to publish CSINode: %v", lastErr)
404404
return false, nil
405405
}
406406
return true, nil
407407
})
408408
if err != nil {
409-
return fmt.Errorf("error updating CSINode annotation: %v; caused by: %v", err, utilerrors.NewAggregate(updateErrs))
409+
return fmt.Errorf("error updating CSINode annotation: %v; caused by: %v", err, lastErr)
410410
}
411411

412412
return nil

pkg/volume/testing/testing.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1870,7 +1870,7 @@ func (f *fakeVolumeHost) WaitForCacheSync() error {
18701870
}
18711871

18721872
func (f *fakeVolumeHost) WaitForKubeletErrNil() error {
1873-
return wait.PollImmediate(100*time.Millisecond, 10*time.Second, func() (bool, error) {
1873+
return wait.PollImmediate(10*time.Millisecond, 10*time.Second, func() (bool, error) {
18741874
f.mux.Lock()
18751875
defer f.mux.Unlock()
18761876
return f.kubeletErr == nil, nil

0 commit comments

Comments
 (0)