
Commit 723acf5

hungnguyen243 authored and cemakd committed
update test setup to avoid running Datacache setup on machines not supporting LSSDs
1 parent 4212f84 commit 723acf5


6 files changed: 64 additions, 61 deletions


cmd/gce-pd-csi-driver/main.go

Lines changed: 20 additions & 25 deletions
@@ -71,7 +71,7 @@ var (
 	formatAndMountTimeout = flag.Duration("format-and-mount-timeout", 1*time.Minute, "The maximum duration of a format and mount operation before another such operation will be started. Used only if --serialize-format-and-mount")
 	fallbackRequisiteZonesFlag = flag.String("fallback-requisite-zones", "", "Comma separated list of requisite zones that will be used if there are not sufficient zones present in requisite topologies when provisioning a disk")
 	enableStoragePoolsFlag = flag.Bool("enable-storage-pools", false, "If set to true, the CSI Driver will allow volumes to be provisioned in Storage Pools")
-	enableDataCacheFlag = flag.Bool("enable-data-cache", false, "If set to true, the CSI Driver will allow volumes to be provisioned with data cache configuration")
+	enableDataCacheFlag = flag.Bool("enable-data-cache", false, "If set to true, the CSI Driver will allow volumes to be provisioned with Data Cache configuration")
 	nodeName = flag.String("node-name", "", "The node this driver is running on")
 
 	multiZoneVolumeHandleDiskTypesFlag = flag.String("multi-zone-volume-handle-disk-types", "", "Comma separated list of allowed disk types that can use the multi-zone volumeHandle. Used only if --multi-zone-volume-handle-enable")

@@ -123,7 +123,7 @@ func handle() {
 	if version == "" {
 		klog.Fatalf("version must be set at compile time")
 	}
-	klog.V(2).Infof("Driver vendor version %v", version)
+	klog.V(4).Infof("Driver vendor version %v", version)
 
 	// Start tracing as soon as possible
 	if *enableOtelTracing {

@@ -235,14 +235,14 @@ func handle() {
 		if *maxConcurrentFormatAndMount > 0 {
 			nodeServer = nodeServer.WithSerializedFormatAndMount(*formatAndMountTimeout, *maxConcurrentFormatAndMount)
 		}
-		if *enableDataCacheFlag {
-			if nodeName == nil || *nodeName == "" {
-				klog.Errorf("Data Cache enabled, but --node-name not passed")
-			}
-			if err := setupDataCache(ctx, *nodeName, nodeServer.MetadataService.GetName()); err != nil {
-				klog.Errorf("DataCache setup failed: %v", err)
-			}
-			go driver.StartWatcher(*nodeName)
+	}
+
+	if *enableDataCacheFlag {
+		if nodeName == nil || *nodeName == "" {
+			klog.Errorf("Data Cache enabled, but --node-name not passed")
+		}
+		if err := setupDataCache(ctx, *nodeName); err != nil {
+			klog.Errorf("Data Cache setup failed: %v", err)
 		}
 	}
 

@@ -351,7 +351,7 @@ func fetchLssdsForRaiding(lssdCount int) ([]string, error) {
 		return nil, fmt.Errorf("Error listing LSSDs with empty mountpoint: %v", err)
 	}
 
-	// We need to ensure the disks to be used for Datacache are both unRAIDed & not containing mountpoints for ephemeral storage already
+	// We need to ensure the disks to be used for Data Cache are both unRAIDed & not containing mountpoints for ephemeral storage already
 	availableLssds := slices.Filter(nil, unRaidedLssds, func(e string) bool {
 		return slices.Contains(LSSDsWithEmptyMountPoint, e)
 	})

@@ -369,36 +369,31 @@ func fetchLssdsForRaiding(lssdCount int) ([]string, error) {
 func setupDataCache(ctx context.Context, nodeName string) error {
 	isAlreadyRaided, err := driver.IsRaided()
 	if err != nil {
-		klog.V(2).Infof("Errored while scanning for available LocalSSDs err:%v; continuing Raiding", err)
+		klog.V(4).Infof("Errored while scanning for available LocalSSDs err:%v; continuing Raiding", err)
 	} else if isAlreadyRaided {
-		klog.V(2).Infof("Local SSDs are already RAIDed. Skipping Datacache setup.")
+		klog.V(4).Infof("Local SSDs are already RAIDed. Skipping Data Cache setup.")
 		return nil
 	}
 
 	lssdCount := common.LocalSSDCountForDataCache
 	if nodeName != common.TestNode {
 		var err error
 		lssdCount, err = driver.GetDataCacheCountFromNodeLabel(ctx, nodeName)
-		if lssdCount == 0 {
-			klog.Infof("Datacache is not enabled on node %v", nodeName)
-			return nil
-		}
 		if err != nil {
 			return err
 		}
+		if lssdCount == 0 {
+			klog.V(4).Infof("Data Cache is not enabled on node %v, so skipping caching setup", nodeName)
+			return nil
+		}
 	}
 	lssdNames, err := fetchLssdsForRaiding(lssdCount)
 	if err != nil {
-		klog.Fatalf("Failed to get sufficient SSDs for Datacache's caching setup: %v", err)
+		klog.Fatalf("Failed to get sufficient SSDs for Data Cache's caching setup: %v", err)
 	}
-	klog.V(2).Infof("Raiding local ssds to setup data cache: %v", lssdNames)
+	klog.V(4).Infof("Raiding local ssds to setup Data Cache: %v", lssdNames)
 	if err := driver.RaidLocalSsds(lssdNames); err != nil {
-		return fmt.Errorf("Failed to Raid local SSDs, unable to setup data caching, got error %v", err)
-	}
-
-	// Initializing data cache node (VG checks w/ raided lssd)
-	if err := driver.InitializeDataCacheNode(nodeId); err != nil {
-		return err
+		return fmt.Errorf("Failed to Raid local SSDs, unable to setup Data Cache, got error %v", err)
 	}
 
 	klog.V(4).Infof("LSSD caching is setup for the Data Cache enabled node %s", nodeName)

pkg/common/constants.go

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ const (
 	// Default LSSD count for datacache E2E tests
 	LocalSSDCountForDataCache = 2
 
-	// Node label for datacache
+	// Node label for Data Cache (only applicable to GKE nodes)
 	NodeLabelPrefix = "cloud.google.com/%s"
 	DataCacheLssdCountLabel = "gke-data-cache-disk"
 )
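For context, a small sketch of how these two constants compose into the node label key that GetDataCacheCountFromNodeLabel reads (see the cache.go changes below). The label value "2" here is only an example.

package main

import (
	"fmt"
	"strconv"
)

const (
	NodeLabelPrefix         = "cloud.google.com/%s"
	DataCacheLssdCountLabel = "gke-data-cache-disk"
)

func main() {
	// Composes to "cloud.google.com/gke-data-cache-disk".
	key := fmt.Sprintf(NodeLabelPrefix, DataCacheLssdCountLabel)

	// Hypothetical labels of a GKE node in a Data Cache node pool.
	labels := map[string]string{key: "2"}

	if val, found := labels[key]; found {
		count, err := strconv.Atoi(val)
		if err != nil {
			fmt.Println("bad label value:", err)
			return
		}
		fmt.Printf("local SSDs requested for Data Cache: %d\n", count)
	} else {
		// A non-Data-Cache node pool has no label; the count is treated as 0.
		fmt.Println("no Data Cache label on this node")
	}
}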

pkg/gce-pd-csi-driver/cache.go

Lines changed: 33 additions & 32 deletions
@@ -16,11 +16,10 @@ import (
 )
 
 const (
-	cacheSuffix               = "csi-fast"
-	mainLvSuffix              = "csi-main"
-	raidedLocalSsdName        = "csi-driver-data-cache"
-	raidMode                  = "0"
-	initialRaidedLocalSsdPath = "/dev/md0"
+	cacheSuffix        = "csi-fast"
+	mainLvSuffix       = "csi-main"
+	raidedLocalSsdName = "csi-driver-data-cache"
+	raidMode           = "0"
 )
 
 func fetchRAIDedLocalSsdPath() (string, error) {

@@ -30,12 +29,13 @@ func fetchRAIDedLocalSsdPath() (string, error) {
 	}
 	info, err := common.RunCommand("grep", []string{raidedLocalSsdName}, "mdadm", args...)
 	if err != nil || len(info) == 0 {
-		return "", fmt.Errorf("Error getting RAIDed device path for Datacache %v, output:%v ===============", err, string(info))
+		return "", fmt.Errorf("Error getting RAIDed device path for Data Cache %v, output:%v", err, string(info))
 	}
 	infoString := strings.TrimSpace(string(info))
 	infoSlice := strings.Split(infoString, " ")
 
-	// We want to get the second element in the array, which is the path to the RAIDed device
+	// We want to get the second element in the array (sample: ARRAY /dev/md126 metadata=1.2 name=csi-driver-data-cache UUID=*),
+	// which is the path to the RAIDed device
 	return infoSlice[1], nil
 }
 

@@ -51,7 +51,7 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 	volumeGroupName := getVolumeGroupName(nodeId)
 	mainDevicePath := "/dev/" + volumeGroupName + "/" + getLvName(mainLvSuffix, volumeId)
 	mainLvName := getLvName(mainLvSuffix, volumeId)
-	klog.V(2).Infof("Volume group available on node %v ", volumeGroupName)
+	klog.V(4).Infof("Volume group available on node %v ", volumeGroupName)
 	vgExists := checkVgExists(volumeGroupName)
 	if vgExists {
 		// Clean up Volume Group before adding the PD

@@ -82,9 +82,9 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 		infoString = strings.ReplaceAll(infoString, "\"", "")
 		infoSlice := strings.Split(strings.TrimSpace(infoString), " ")
 		vgNameForPv := strings.TrimSpace(infoSlice[(len(infoSlice) - 1)])
-		klog.V(2).Infof("============================== Physical volume is part of Volume group: %v ==============================", vgNameForPv)
+		klog.V(4).Infof("Physical volume is part of Volume group: %v", vgNameForPv)
 		if vgNameForPv == volumeGroupName {
-			klog.V(2).Infof("============================== Physical Volume(PV) already exists in the Volume Group ==============================")
+			klog.V(4).Infof("Physical Volume(PV) already exists in the Volume Group")
 		} else if vgNameForPv != "VG" && vgNameForPv != "" {
 
 			info, err = common.RunCommand("" /* pipedCmd */, nil /* pipedCmdArg */, "vgchange", []string{"-an", vgNameForPv}...)

@@ -157,7 +157,7 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 	cacheLvName := getLvName(cacheSuffix, volumeId)
 	if isCached {
 		// Validate that cache is setup for required size
-		klog.V(2).Infof("Assuming valid data cache size and mode, resizing cache is not supported")
+		klog.V(4).Infof("Assuming valid data cache size and mode, resizing cache is not supported")
 	} else {
 		fastCacheSize := req.GetPublishContext()[common.ContextDataCacheSize]
 		chunkSize := "960" // Cannot use default chunk size(64KiB) as it errors on maxChunksAllowed. Unit - KiB

@@ -207,8 +207,8 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 	return mainDevicePath, nil
 }
 
-func ValidateDataCacheConfig(dataCacheMode string, datacacheSize string, ctx context.Context, nodeName string) error {
-	if dataCacheMode != "" && datacacheSize != "" {
+func ValidateDataCacheConfig(dataCacheMode string, dataCacheSize string, ctx context.Context, nodeName string) error {
+	if dataCacheMode != "" && dataCacheSize != "" {
 		isAlreadyRaided, err := IsRaided()
 		if err != nil {
 			return fmt.Errorf("Local SSDs are not setup for caching; got error: %v", err)

@@ -218,48 +218,50 @@ func ValidateDataCacheConfig(dataCacheMode string, datacacheSize string, ctx con
 		}
 		return nil
 	}
-	klog.Infof("Data cache is not enabled for PVC")
+	klog.V(4).Infof("Data Cache is not enabled for PVC (data-cache-size: %v, data-cache-mode: %v). Please set both these parameters in StorageClass to enable caching", dataCacheSize, dataCacheMode)
 	return nil
 }
 
 func GetDataCacheCountFromNodeLabel(ctx context.Context, nodeName string) (int, error) {
-	if nodeName == common.TestNode {
-		return common.LocalSSDCountForDataCache, nil
-	}
 	cfg, err := rest.InClusterConfig()
 	// We want to capture API errors with node label fetching, so return -1
 	// in those cases instead of 0.
 	if err != nil {
-		return -1, err
+		return 0, err
 	}
 	kubeClient, err := kubernetes.NewForConfig(cfg)
 	if err != nil {
-		return -1, err
+		return 0, err
 	}
 	node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
 	if err != nil {
 		// We could retry, but this error will also crashloop the driver which may be as good a way to retry as any.
-		return -1, err
+		return 0, err
 	}
 	if val, found := node.GetLabels()[fmt.Sprintf(common.NodeLabelPrefix, common.DataCacheLssdCountLabel)]; found {
 		dataCacheCount, err := strconv.Atoi(val)
 		if err != nil {
-			return -1, fmt.Errorf("Error getting Datacache's LSSD count from node label: %v", err)
+			return 0, fmt.Errorf("Error getting Data Cache's LSSD count from node label: %v", err)
 		}
-		klog.Infof("Number of local SSDs requested for Datacache: %v", dataCacheCount)
+		klog.V(4).Infof("Number of local SSDs requested for Data Cache: %v", dataCacheCount)
 		return dataCacheCount, nil
 	}
-	return 0, fmt.Errorf("Cannot get Datacache's LSSD count from node label")
+	// This will be returned for a non-Data-Cache node pool
+	return 0, nil
 }
 
 func FetchRaidedLssdCountForDatacache() (int, error) {
+	raidedPath, err := fetchRAIDedLocalSsdPath()
+	if err != nil {
+		return 0, err
+	}
 	args := []string{
 		"--detail",
-		initialRaidedLocalSsdPath,
+		raidedPath,
 	}
 	info, err := common.RunCommand("grep", []string{"Raid Devices"}, "mdadm", args...)
 	if err != nil {
-		return 0, fmt.Errorf("Error getting RAIDed devices for Datacache")
+		return 0, fmt.Errorf("Error getting RAIDed devices for Data Cache")
 	}
 	if len(info) != 0 {
 		raidedDeviceInfo := strings.Split(strings.TrimSpace(string(info)), ":")

@@ -294,7 +296,7 @@ func FetchRaidedLssds() ([]string, error) {
 		}
 	}
 
-	klog.V(2).Infof("Raided NVME list %v", raidedLssdList)
+	klog.V(4).Infof("Raided NVME list %v", raidedLssdList)
 
 	return raidedLssdList, nil
 }

@@ -309,7 +311,7 @@ func FetchAllLssds() ([]string, error) {
 	infoList := strings.Split(strings.TrimSpace(string(info)), "\n")
 	re, err := regexp.Compile("nvme_card([0-9]+)?$")
 	if err != nil {
-		klog.V(2).ErrorS(err, "Errored while compiling to check PD or LSSD")
+		klog.V(4).ErrorS(err, "Errored while compiling to check PD or LSSD")
 	}
 	for _, ssd := range infoList {
 		ssd = strings.TrimSpace(ssd)

@@ -322,7 +324,7 @@ func FetchAllLssds() ([]string, error) {
 		}
 	}
 
-	klog.V(2).Infof("NVME list %v", diskList)
+	klog.V(4).Infof("NVME list %v", diskList)
 
 	return diskList, nil
 }

@@ -358,6 +360,7 @@ func cleanupCache(volumeId string, nodeId string) error {
 		// If volume group doesn't exist then there's nothing to uncache
 		return nil
 	}
+	reduceVolumeGroup(volumeGroupName, true)
 	mainLvName := getLvName(mainLvSuffix, volumeId)
 	args := []string{
 		"-an",

@@ -404,7 +407,7 @@ func createVg(volumeGroupName string, raidedLocalSsds string) error {
 	if err != nil {
 		return fmt.Errorf("Volume group creation failed %w: %s", err, info)
 	}
-	klog.Infof("Volume group creation succeeded for %v", volumeGroupName)
+	klog.V(4).Infof("Volume group creation succeeded for %v", volumeGroupName)
 
 	args = []string{}
 	info, err = common.RunCommand("" /* pipedCmd */, nil /* pipedCmdArg */, "vgscan", args...)

@@ -431,8 +434,6 @@ func reduceVolumeGroup(volumeGroupName string, force bool) {
 func RaidLocalSsds(availableLssds []string) error {
 	args := []string{
 		"--create",
-		initialRaidedLocalSsdPath,
-		"--name",
 		raidedLocalSsdName,
 		"-l" + raidMode,
 		// Force RAIDing as sometime it might fail for caution if there is just 1 LSSD present as 1 LSSD need not be RAIDed

@@ -448,7 +449,7 @@ func RaidLocalSsds(availableLssds []string) error {
 	// Validate if Raided successfully
 	isAlreadyRaided, err := IsRaided()
 	if err != nil {
-		klog.V(2).Infof("Errored while scanning for available raided LocalSSDs err:%v=", err)
+		klog.V(4).Infof("Errored while scanning for available raided LocalSSDs err:%v=", err)
 	}
 	if !isAlreadyRaided {
 		return fmt.Errorf("failed raiding, raided device not found on scanning")

pkg/gce-pd-csi-driver/node.go

Lines changed: 1 addition & 1 deletion
@@ -338,7 +338,7 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
 		}
 		configError := ValidateDataCacheConfig(req.GetPublishContext()[common.ContextDataCacheMode], req.GetPublishContext()[common.ContextDataCacheSize], ctx, nodeId)
 		if configError != nil {
-			return nil, status.Error(codes.Internal, fmt.Sprintf("Error validate configuration for Datacache: %v", err.Error()))
+			return nil, status.Error(codes.Internal, fmt.Sprintf("Error validate configuration for Data Cache: %v", err.Error()))
 		}
 		devicePath, err = setupCaching(devFsPath, req, nodeId)
 		if err != nil {
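For orientation, a minimal sketch (not the driver's code) of the rule ValidateDataCacheConfig applies to the publish context, per the cache.go diff above: caching only applies when both data-cache-mode and data-cache-size are set, otherwise the volume is staged without caching. The parameter values used below are hypothetical examples.

package main

import (
	"errors"
	"fmt"
)

// validateDataCacheParams mirrors the rule in ValidateDataCacheConfig: both
// parameters must be present for Data Cache to apply, and the local SSDs must
// already be RAIDed on the node.
func validateDataCacheParams(mode, size string, ssdsRaided bool) error {
	if mode != "" && size != "" {
		if !ssdsRaided {
			return errors.New("local SSDs are not set up for caching")
		}
		return nil
	}
	fmt.Printf("Data Cache not enabled for PVC (data-cache-size: %q, data-cache-mode: %q)\n", size, mode)
	return nil
}

func main() {
	// Both parameters set and SSDs RAIDed: caching applies.
	if err := validateDataCacheParams("writethrough", "100Gi", true); err != nil {
		fmt.Println("unexpected:", err)
	}
	// Neither parameter set: the volume is staged without caching (no error).
	if err := validateDataCacheParams("", "", false); err != nil {
		fmt.Println("unexpected:", err)
	}
}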

test/e2e/utils/utils.go

Lines changed: 5 additions & 2 deletions
@@ -67,8 +67,11 @@ func GCEClientAndDriverSetup(instance *remote.InstanceInfo, driverConfig DriverC
 		"--use-instance-api-to-poll-attachment-disk-types=pd-ssd",
 		"--use-instance-api-to-list-volumes-published-nodes",
 		fmt.Sprintf("--fallback-requisite-zones=%s", strings.Join(driverConfig.Zones, ",")),
-		"--enable-data-cache",
-		fmt.Sprintf("--node-name=%s", utilcommon.TestNode),
+	}
+
+	if instance.GetLocalSSD() > 0 {
+		extra_flags = append(extra_flags, "--enable-data-cache")
+		extra_flags = append(extra_flags, fmt.Sprintf("--node-name=%s", utilcommon.TestNode))
 	}
 	extra_flags = append(extra_flags, fmt.Sprintf("--compute-endpoint=%s", driverConfig.ComputeEndpoint))
 	extra_flags = append(extra_flags, driverConfig.ExtraFlags...)

test/remote/instance.go

Lines changed: 4 additions & 0 deletions
@@ -80,6 +80,10 @@ func (i *InstanceInfo) GetNodeID() string {
 	return common.CreateNodeID(i.cfg.Project, i.cfg.Zone, i.cfg.Name)
 }
 
+func (i *InstanceInfo) GetLocalSSD() int64 {
+	return i.cfg.LocalSSDCount
+}
+
 func machineTypeMismatch(curInst *compute.Instance, newInst *compute.Instance) bool {
 	if !strings.Contains(curInst.MachineType, newInst.MachineType) {
 		klog.Infof("Machine type mismatch")
