Skip to content

Commit f3194ac

Browse files
dhananjay-ngGouthamML
authored andcommitted
Adding retry with exponential backoff to fetch NodeMetadata during node driver startup
1 parent 1b795bd commit f3194ac

File tree

9 files changed

+638
-131
lines changed

9 files changed

+638
-131
lines changed

pkg/csi-util/utils.go

Lines changed: 80 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ import (
2727
"sync"
2828
"time"
2929

30+
"k8s.io/apimachinery/pkg/util/wait"
31+
3032
"github.com/container-storage-interface/spec/lib/go/csi"
3133
"github.com/oracle/oci-go-sdk/v65/core"
3234
"go.uber.org/zap"
@@ -103,15 +105,19 @@ type FSSVolumeHandler struct {
103105
FsExportPath string
104106
}
105107

106-
type NodeIpFamily struct {
107-
PreferredNodeIpFamily string
108-
Ipv4Enabled bool
109-
Ipv6Enabled bool
108+
type NodeMetadata struct {
109+
PreferredNodeIpFamily string
110+
Ipv4Enabled bool
111+
Ipv6Enabled bool
112+
AvailabilityDomain string
113+
FullAvailabilityDomain string
114+
IsNodeMetadataLoaded bool
110115
}
111116

112117
// CSIConfig represents the structure of the ConfigMap data.
113118
type CSIConfig struct {
114119
Lustre *DriverConfig `yaml:"lustre"`
120+
IsLoaded bool
115121
}
116122

117123
// DriverConfig represents driver-specific configurations.
@@ -134,25 +140,75 @@ func (u *Util) LookupNodeID(k kubernetes.Interface, nodeName string) (string, er
134140
return n.Spec.ProviderID, nil
135141
}
136142

137-
func (u *Util) LookupNodeAvailableDomain(k kubernetes.Interface, nodeID string) (string, string, error) {
138-
n, err := k.CoreV1().Nodes().Get(context.Background(), nodeID, metav1.GetOptions{})
143+
func (u *Util) WaitForKubeApiServerToBeReachableWithContext(ctx context.Context, k kubernetes.Interface, backOffCap time.Duration) {
144+
145+
waitForKubeApiServerCtx, waitForKubeApiServerCtxCancel := context.WithTimeout(ctx, time.Second * 45)
146+
defer waitForKubeApiServerCtxCancel()
147+
148+
backoff := wait.Backoff{
149+
Duration: 1 * time.Second,
150+
Factor: 2.0,
151+
Steps: 5,
152+
Cap: backOffCap,
153+
}
154+
155+
wait.ExponentialBackoffWithContext(
156+
waitForKubeApiServerCtx,
157+
backoff,
158+
func(waitForKubeApiServerCtx context.Context) (bool, error) {
159+
attemptCtx, attemptCancel := context.WithTimeout(waitForKubeApiServerCtx, backoff.Step())
160+
defer attemptCancel()
161+
_, err := k.CoreV1().RESTClient().Get().AbsPath("/version").Do(attemptCtx).Raw()
162+
if err != nil {
163+
u.Logger.With(zap.Error(err)).Errorf("Waiting for kube api server to be reachable, Retrying..")
164+
return false, nil
165+
}
166+
u.Logger.Infof("Kube Api Server is Reachable")
167+
return true, nil
168+
},
169+
)
170+
}
171+
172+
func (u *Util) LoadNodeMetadataFromApiServer(ctx context.Context, k kubernetes.Interface, nodeID string, nodeMetadata *NodeMetadata) (error) {
173+
174+
u.WaitForKubeApiServerToBeReachableWithContext(ctx, k, time.Second * 30)
175+
176+
node, err := k.CoreV1().Nodes().Get(ctx, nodeID, metav1.GetOptions{})
177+
139178
if err != nil {
140-
u.Logger.With(zap.Error(err)).With("nodeId", nodeID).Error("Failed to get Node by name.")
141-
return "", "", fmt.Errorf("failed to get node %s", nodeID)
179+
u.Logger.With(zap.Error(err)).With("nodeId", nodeID).Error("Failed to get Node information from kube api server, Please check if kube api server is accessible.")
180+
return fmt.Errorf("Failed to get node information from kube api server, please check if kube api server is accessible.")
142181
}
143-
if n.Labels != nil {
144-
ad, ok := n.Labels[kubeAPI.LabelTopologyZone]
182+
183+
var ok bool
184+
if node.Labels != nil {
185+
nodeMetadata.AvailabilityDomain, ok = node.Labels[kubeAPI.LabelTopologyZone]
145186
if !ok {
146-
ad, ok = n.Labels[kubeAPI.LabelZoneFailureDomain]
187+
nodeMetadata.AvailabilityDomain, ok = node.Labels[kubeAPI.LabelZoneFailureDomain]
147188
}
148189
if ok {
149-
fullAdName, _ := n.Labels[AvailabilityDomainLabel]
150-
return ad, fullAdName, nil
190+
nodeMetadata.FullAvailabilityDomain, _ = node.Labels[AvailabilityDomainLabel]
191+
}
192+
193+
if preferredIpFamily, ok := node.Labels[LabelIpFamilyPreferred]; ok {
194+
nodeMetadata.PreferredNodeIpFamily = FormatValidIpStackInK8SConvention(preferredIpFamily)
195+
}
196+
if ipv4Enabled, ok := node.Labels[LabelIpFamilyIpv4]; ok && strings.EqualFold(ipv4Enabled, "true") {
197+
nodeMetadata.Ipv4Enabled = true
198+
}
199+
if ipv6Enabled, ok := node.Labels[LabelIpFamilyIpv6]; ok && strings.EqualFold(ipv6Enabled, "true") {
200+
nodeMetadata.Ipv6Enabled = true
151201
}
152202
}
153-
errMsg := fmt.Sprintf("Did not find the label for the fault domain. Checked Topology Labels: %s, %s", kubeAPI.LabelTopologyZone, kubeAPI.LabelZoneFailureDomain)
154-
u.Logger.With("nodeId", nodeID).Error(errMsg)
155-
return "", "", fmt.Errorf(errMsg)
203+
if !nodeMetadata.Ipv4Enabled && !nodeMetadata.Ipv6Enabled {
204+
nodeMetadata.PreferredNodeIpFamily = Ipv4Stack
205+
nodeMetadata.Ipv4Enabled = true
206+
u.Logger.With("nodeId", nodeID, "nodeMetadata", nodeMetadata).Info("No IP family labels identified on node, defaulting to ipv4.")
207+
} else {
208+
u.Logger.With("nodeId", nodeID, "nodeMetadata", nodeMetadata).Info("Node IP family identified.")
209+
}
210+
nodeMetadata.IsNodeMetadataLoaded = true
211+
return nil
156212
}
157213

158214
// waitForPathToExist waits for for a given filesystem path to exist.
@@ -522,37 +578,6 @@ func GetIsFeatureEnabledFromEnv(logger *zap.SugaredLogger, featureName string, d
522578
return enableFeature
523579
}
524580

525-
func GetNodeIpFamily(k kubernetes.Interface, nodeID string, logger *zap.SugaredLogger) (*NodeIpFamily, error) {
526-
n, err := k.CoreV1().Nodes().Get(context.Background(), nodeID, metav1.GetOptions{})
527-
528-
if err != nil {
529-
logger.With(zap.Error(err)).With("nodeId", nodeID).Error("Failed to get Node information from kube api server, Please check if kube api server is accessible.")
530-
return nil, fmt.Errorf("Failed to get node information from kube api server, please check if kube api server is accessible.")
531-
}
532-
533-
nodeIpFamily := &NodeIpFamily{}
534-
535-
if n.Labels != nil {
536-
if preferredIpFamily, ok := n.Labels[LabelIpFamilyPreferred]; ok {
537-
nodeIpFamily.PreferredNodeIpFamily = FormatValidIpStackInK8SConvention(preferredIpFamily)
538-
}
539-
if ipv4Enabled, ok := n.Labels[LabelIpFamilyIpv4]; ok && strings.EqualFold(ipv4Enabled, "true") {
540-
nodeIpFamily.Ipv4Enabled = true
541-
}
542-
if ipv6Enabled, ok := n.Labels[LabelIpFamilyIpv6]; ok && strings.EqualFold(ipv6Enabled, "true") {
543-
nodeIpFamily.Ipv6Enabled = true
544-
}
545-
}
546-
if !nodeIpFamily.Ipv4Enabled && !nodeIpFamily.Ipv6Enabled {
547-
nodeIpFamily.PreferredNodeIpFamily = Ipv4Stack
548-
nodeIpFamily.Ipv4Enabled = true
549-
logger.With("nodeId", nodeID, "nodeIpFamily", *nodeIpFamily).Info("No IP family labels identified on node, defaulting to ipv4.")
550-
} else {
551-
logger.With("nodeId", nodeID, "nodeIpFamily", *nodeIpFamily).Info("Node IP family identified.")
552-
}
553-
554-
return nodeIpFamily, nil
555-
}
556581
func ConvertIscsiIpFromIpv4ToIpv6(ipv4IscsiIp string) (string, error) {
557582
ipv4IscsiIP := net.ParseIP(ipv4IscsiIp).To4()
558583
if ipv4IscsiIP == nil {
@@ -607,30 +632,27 @@ func IsValidIpFamilyPresentInClusterIpFamily(clusterIpFamily string) bool {
607632
return len(clusterIpFamily) > 0 && (strings.Contains(clusterIpFamily, Ipv4Stack) || strings.Contains(clusterIpFamily, Ipv6Stack))
608633
}
609634

610-
func IsIpv6SingleStackNode(nodeIpFamily *NodeIpFamily) bool {
611-
if nodeIpFamily == nil {
635+
func IsIpv6SingleStackNode(nodeMetadata *NodeMetadata) bool {
636+
if nodeMetadata == nil {
612637
return false
613638
}
614-
return nodeIpFamily.Ipv6Enabled == true && nodeIpFamily.Ipv4Enabled == false
639+
return nodeMetadata.Ipv6Enabled == true && nodeMetadata.Ipv4Enabled == false
615640
}
616641

617-
func LoadCSIConfigFromConfigMap(k kubernetes.Interface, configMapName string, logger *zap.SugaredLogger) *CSIConfig {
642+
func LoadCSIConfigFromConfigMap(csiConfig *CSIConfig, k kubernetes.Interface, configMapName string, logger *zap.SugaredLogger) {
618643
// Get the ConfigMap
619644
// Parse the configuration for each driver
620-
config := &CSIConfig{}
621645
cm, err := k.CoreV1().ConfigMaps("kube-system").Get(context.Background(), configMapName, metav1.GetOptions{})
622646
if err != nil {
623647
logger.Debugf("Failed to load ConfigMap %v due to error %v. Using default configuration.", configMapName, err)
624-
return config
648+
return
625649
}
626650

627651
if lustreConfig, exists := cm.Data["lustre"]; exists {
628-
if err := yaml.Unmarshal([]byte(lustreConfig), &config.Lustre); err != nil {
629-
logger.Debugf("Failed to parse lustre key in config map %v. Error: %v", configMapName, err)
630-
return config
652+
if err := yaml.Unmarshal([]byte(lustreConfig), &csiConfig.Lustre); err != nil {
653+
logger.Debugf("Failed to parse lustre key in config map %v. Error: %v",configMapName, err)
654+
return
631655
}
632656
logger.Infof("Successfully loaded ConfigMap %v. Using customized configuration for csi driver.", configMapName)
633657
}
634-
635-
return config
636658
}

pkg/csi-util/utils_test.go

Lines changed: 81 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,20 @@
1515
package csi_util
1616

1717
import (
18+
"context"
1819
"fmt"
20+
"log"
21+
"os"
1922
"reflect"
2023
"regexp"
2124
"strings"
2225
"testing"
26+
"time"
2327

2428
"github.com/oracle/oci-cloud-controller-manager/pkg/util"
2529
"github.com/oracle/oci-go-sdk/v65/core"
2630
"go.uber.org/zap"
31+
"k8s.io/client-go/kubernetes"
2732
"k8s.io/utils/pointer"
2833
)
2934

@@ -344,18 +349,21 @@ func Test_DiskByPathPatternForPV(t *testing.T) {
344349
}
345350
}
346351

347-
func Test_GetNodeIpFamily(t *testing.T) {
352+
func Test_LoadNodeMetadataFromApiServer(t *testing.T) {
348353

349354
tests := []struct {
350355
name string
351356
nodeName string
352-
want *NodeIpFamily
357+
want *NodeMetadata
358+
kubeclient kubernetes.Interface
353359
err error
354360
}{
355361
{
356362
name: "should return ipv6 for ipv6 preferred node",
357363
nodeName: "ipv6Preferred",
358-
want: &NodeIpFamily{
364+
want: &NodeMetadata{
365+
FullAvailabilityDomain: "xyz:PHX-AD-3",
366+
AvailabilityDomain: "PHX-AD-3",
359367
PreferredNodeIpFamily: Ipv6Stack,
360368
Ipv4Enabled: true,
361369
Ipv6Enabled: true,
@@ -364,16 +372,18 @@ func Test_GetNodeIpFamily(t *testing.T) {
364372
{
365373
name: "should return ipv4 for ipv4 preferred node",
366374
nodeName: "ipv4Preferred",
367-
want: &NodeIpFamily{
375+
want: &NodeMetadata{
368376
PreferredNodeIpFamily: Ipv4Stack,
377+
AvailabilityDomain: "PHX-AD-3",
369378
Ipv4Enabled: true,
370379
Ipv6Enabled: true,
371380
},
372381
},
373382
{
374383
name: "should return default IPv4 family for no ip preference",
375384
nodeName: "noIpPreference",
376-
want: &NodeIpFamily{
385+
want: &NodeMetadata{
386+
AvailabilityDomain: "PHX-AD-3",
377387
PreferredNodeIpFamily: Ipv4Stack,
378388
Ipv4Enabled: true,
379389
Ipv6Enabled: false,
@@ -382,21 +392,73 @@ func Test_GetNodeIpFamily(t *testing.T) {
382392
{
383393
name: "should return error for invalid node",
384394
nodeName: "InvalidNode",
385-
want: nil,
395+
want: &NodeMetadata{},
386396
err: fmt.Errorf("Failed to get node information from kube api server, please check if kube api server is accessible."),
387397
},
398+
{
399+
name: "should return error for node with any ad labels",
400+
nodeName: "nodeWithMissingAdLabels",
401+
want: &NodeMetadata{
402+
PreferredNodeIpFamily: Ipv4Stack,
403+
Ipv4Enabled: true,
404+
Ipv6Enabled: false,
405+
},
406+
err: fmt.Errorf("Failed to get node information from kube api server, please check if kube api server is accessible."),
407+
},
408+
{
409+
name: "Call to get node info is done even if health check fails",
410+
nodeName: "ipv4Preferred",
411+
want: &NodeMetadata{
412+
PreferredNodeIpFamily: Ipv4Stack,
413+
AvailabilityDomain: "PHX-AD-3",
414+
Ipv4Enabled: true,
415+
Ipv6Enabled: true,
416+
},
417+
kubeclient: &util.MockKubeClientWithFailingRestClient{
418+
CoreClient: &util.MockCoreClientWithFailingRestClient{},
419+
},
420+
},
421+
{
422+
name: "should return error for invalid node and failing health check",
423+
nodeName: "InvalidNode",
424+
want: &NodeMetadata{},
425+
err: fmt.Errorf("Failed to get node information from kube api server, please check if kube api server is accessible."),
426+
kubeclient: &util.MockKubeClientWithFailingRestClient{
427+
CoreClient: &util.MockCoreClientWithFailingRestClient{},
428+
},
429+
},
430+
}
431+
432+
logger, _ := zap.NewDevelopment()
433+
sugar := logger.Sugar()
434+
u := &Util{
435+
Logger: sugar,
388436
}
437+
389438
for _, tt := range tests {
390439
t.Run(tt.name, func(t *testing.T) {
391-
got, err := GetNodeIpFamily(&util.MockKubeClient{
392-
CoreClient: &util.MockCoreClient{},
393-
}, tt.nodeName, zap.S())
394-
if (tt.want != got) && (tt.want.PreferredNodeIpFamily != got.PreferredNodeIpFamily ||
395-
tt.want.Ipv6Enabled != got.Ipv6Enabled || tt.want.Ipv4Enabled != got.Ipv4Enabled) {
396-
t.Errorf("GetNodeIpFamily() = %v, want %v", got, tt.want)
440+
441+
442+
log.SetOutput(os.Stdout)
443+
nodeMetadata := &NodeMetadata{}
444+
ctx, cancel := context.WithTimeout(context.Background(), 10 * time.Second)
445+
defer cancel()
446+
447+
448+
var k kubernetes.Interface
449+
if tt.kubeclient != nil {
450+
k = tt.kubeclient
451+
} else {
452+
k = &util.MockKubeClient{CoreClient: &util.MockCoreClient{}}
453+
}
454+
455+
err := u.LoadNodeMetadataFromApiServer(ctx, k, tt.nodeName, nodeMetadata)
456+
if (tt.want != nodeMetadata) && (tt.want.PreferredNodeIpFamily != nodeMetadata.PreferredNodeIpFamily ||
457+
tt.want.Ipv6Enabled != nodeMetadata.Ipv6Enabled || tt.want.Ipv4Enabled != nodeMetadata.Ipv4Enabled) {
458+
t.Errorf("LoadNodeMetadataFromApiServer() = %v, want %v", nodeMetadata, tt.want)
397459
}
398460
if err != nil && !strings.EqualFold(tt.err.Error(), err.Error()) {
399-
t.Errorf("GetNodeIpFamily() = %v, want %v", err, tt.err)
461+
t.Errorf("LoadNodeMetadataFromApiServer() = %v, want %v", err, tt.err)
400462
}
401463

402464
})
@@ -764,18 +826,20 @@ func Test_LoadCSIConfigFromConfigMap(t *testing.T) {
764826
{
765827
name: "Return default config if config map is not present",
766828
configMapName: "invalid",
767-
want: &CSIConfig{},
829+
want: &CSIConfig{
830+
},
768831
},
769832
}
770833

771834
for _, tt := range tests {
772835
t.Run(tt.name, func(t *testing.T) {
773-
got := LoadCSIConfigFromConfigMap(&util.MockKubeClient{
836+
csiConfig := &CSIConfig{}
837+
LoadCSIConfigFromConfigMap(csiConfig, &util.MockKubeClient{
774838
CoreClient: &util.MockCoreClient{},
775839
}, tt.configMapName, zap.S())
776840

777-
if !reflect.DeepEqual(tt.want, got) {
778-
t.Errorf("LoadCSIConfigFromConfigMap() => got : %v, want : %v", got, tt.want)
841+
if !reflect.DeepEqual(tt.want, csiConfig) {
842+
t.Errorf("LoadCSIConfigFromConfigMap() => got : %v, want : %v", csiConfig, tt.want)
779843
}
780844
})
781845
}

0 commit comments

Comments
 (0)