
Commit 040ec58

halimsamcemakd authored and committed
Implement watcher & reboot stability for the data cache (master branch).
1 parent 8b4d9e1 commit 040ec58

34 files changed: +4241 −209 lines

cmd/gce-pd-csi-driver/main.go

Lines changed: 24 additions & 11 deletions

@@ -241,14 +241,14 @@ func handle() {
         if *maxConcurrentFormatAndMount > 0 {
             nodeServer = nodeServer.WithSerializedFormatAndMount(*formatAndMountTimeout, *maxConcurrentFormatAndMount)
         }
-    }
-
-    if *enableDataCacheFlag {
-        if nodeName == nil || *nodeName == "" {
-            klog.Errorf("Data cache enabled, but --node-name not passed")
-        }
-        if err := setupDataCache(ctx, *nodeName); err != nil {
-            klog.Errorf("DataCache setup failed: %v", err)
+        if *enableDataCacheFlag {
+            if nodeName == nil || *nodeName == "" {
+                klog.Errorf("Data Cache enabled, but --node-name not passed")
+            }
+            if err := setupDataCache(ctx, *nodeName, nodeServer.MetadataService.GetName()); err != nil {
+                klog.Errorf("DataCache setup failed: %v", err)
+            }
+            go driver.StartWatcher(*nodeName)
         }
     }

@@ -331,8 +331,16 @@ func urlFlag(target **url.URL, name string, usage string) {
     })
 }

-func setupDataCache(ctx context.Context, nodeName string) error {
-    klog.V(2).Infof("Setting up data cache for node %s", nodeName)
+func setupDataCache(ctx context.Context, nodeName string, nodeId string) error {
+    isAlreadyRaided, err := driver.IsRaided()
+    if err != nil {
+        klog.V(4).Infof("Errored while scanning for available LocalSSDs err:%v; continuing Raiding", err)
+    } else if isAlreadyRaided {
+        klog.V(4).Infof("Local SSDs are already RAIDed. Skipping Data Cache setup.")
+        return nil
+    }
+
+    lssdCount := common.LocalSSDCountForDataCache
     if nodeName != common.TestNode {
         cfg, err := rest.InClusterConfig()
         if err != nil {
@@ -357,6 +365,11 @@ func setupDataCache(ctx context.Context, nodeName string) error {
         return fmt.Errorf("Failed to Raid local SSDs, unable to setup data caching, got error %v", err)
     }

-    klog.V(2).Infof("Datacache enabled for node %s", nodeName)
+    // Initializing data cache node (VG checks w/ raided lssd)
+    if err := driver.InitializeDataCacheNode(nodeId); err != nil {
+        return err
+    }
+
+    klog.V(4).Infof("LSSD caching is setup for the Data Cache enabled node %s", nodeName)
     return nil
 }
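The reboot-stability half of this change hinges on setupDataCache now being safe to re-run: after a node reboot or driver-pod restart, the local SSDs may already be RAIDed, so the driver probes first and skips setup rather than failing. Below is a minimal, self-contained sketch of that guard pattern. It is an illustration only: the hypothetical isRaided helper, the mdadm probe, and the /dev/md127 device path are assumptions, not the actual driver.IsRaided implementation (which is not shown in this diff).

package main

import (
    "fmt"
    "os/exec"
    "strings"
)

// isRaided is a stand-in for driver.IsRaided: it asks mdadm for the known
// arrays and reports whether the expected device is already among them.
func isRaided(device string) (bool, error) {
    out, err := exec.Command("mdadm", "--detail", "--scan").CombinedOutput()
    if err != nil {
        return false, fmt.Errorf("mdadm scan failed: %v: %s", err, out)
    }
    return strings.Contains(string(out), device), nil
}

func main() {
    raided, err := isRaided("/dev/md127") // device path is a hypothetical example
    if err != nil {
        // Mirror the diff's behavior: log the probe failure and fall through to setup.
        fmt.Println("scan errored; continuing with RAID setup:", err)
    } else if raided {
        fmt.Println("local SSDs already RAIDed; skipping setup")
        return // idempotent: a reboot or pod restart becomes a no-op here
    }
    fmt.Println("would RAID local SSDs and create the volume group here")
}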

go.mod

Lines changed: 2 additions & 2 deletions

@@ -57,8 +57,8 @@ require (
     github.com/davecgh/go-spew v1.1.1 // indirect
     github.com/emicklei/go-restful v2.9.5+incompatible // indirect
     github.com/felixge/httpsnoop v1.0.4 // indirect
-    github.com/fsnotify/fsnotify v1.5.4 // indirect
-    github.com/go-logr/logr v1.4.1 // indirect
+    github.com/fsnotify/fsnotify v1.8.0 // indirect
+    github.com/go-logr/logr v1.4.2 // indirect
     github.com/go-logr/stdr v1.2.2 // indirect
     github.com/go-openapi/jsonpointer v0.20.0 // indirect
     github.com/go-openapi/jsonreference v0.19.6 // indirect

go.sum

Lines changed: 2 additions & 0 deletions

@@ -1031,6 +1031,8 @@ github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4
 github.com/fsnotify/fsnotify v1.5.1/go.mod h1:T3375wBYaZdLLcVNkcVbzGHY7f1l/uK5T5Ai1i3InKU=
 github.com/fsnotify/fsnotify v1.5.4 h1:jRbGcIw6P2Meqdwuo0H1p6JVLbL5DHKAKlYndzMwVZI=
 github.com/fsnotify/fsnotify v1.5.4/go.mod h1:OVB6XrOHzAwXMpEM7uPOzcehqUV2UqJxmVXmkdnm1bU=
+github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M=
+github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
 github.com/fsouza/fake-gcs-server v0.0.0-20180612165233-e85be23bdaa8/go.mod h1:1/HufuJ+eaDf4KTnYdS6HJMGvMRU8d4cYTuu/1QaBbI=
 github.com/fsouza/fake-gcs-server v1.19.4/go.mod h1:I0/88nHCASqJJ5M7zVF0zKODkYTcuXFW5J5yajsNJnE=
 github.com/fvbommel/sortorder v1.0.1/go.mod h1:uk88iVf1ovNn1iLfgUVU2F9o5eO30ui720w+kxuqRs0=
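These pins record the fsnotify upgrade (v1.5.4 → v1.8.0, used by the new /dev watcher in pkg/gce-pd-csi-driver/cache.go below) alongside a go-logr patch bump. A change like this is typically produced with go get github.com/fsnotify/fsnotify@v1.8.0 followed by go mod tidy and go mod vendor, which would also account for the vendored fsnotify files changing at the bottom of this commit.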

pkg/gce-pd-csi-driver/cache.go

Lines changed: 115 additions & 3 deletions

@@ -7,7 +7,10 @@ import (
     "strings"

     csi "github.com/container-storage-interface/spec/lib/go/csi"
-
+    fsnotify "github.com/fsnotify/fsnotify"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/client-go/kubernetes"
+    "k8s.io/client-go/rest"
     "k8s.io/klog/v2"

     "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
@@ -42,7 +45,7 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
         // Clean up Volume Group before adding the PD
         reduceVolumeGroup(volumeGroupName, true)
     } else {
-        err := createVg(volumeGroupName, devicePath, raidedLocalSsdPath)
+        err := createVg(volumeGroupName, raidedLocalSsdPath)
         if err != nil {
             return mainDevicePath, err
         }
@@ -241,7 +244,7 @@ func getLvName(suffix string, volumeId string) string {
     return fmt.Sprintf("%s-%s", suffix, pvcName)
 }

-func createVg(volumeGroupName string, devicePath string, raidedLocalSsds string) error {
+func createVg(volumeGroupName string, raidedLocalSsds string) error {
     args := []string{
         "--zero",
         "y",
@@ -366,3 +369,112 @@ func isCachingSetup(mainLvName string) (error, bool) {
     }
     return nil, false
 }
+
+func fetchChunkSizeKiB(cacheSize string) (string, error) {
+    var chunkSize float64
+
+    cacheSizeInt, err := common.ConvertGiStringToInt64(cacheSize)
+    if err != nil {
+        return "0", err
+    }
+    // Chunksize should be divisible by 32Kib so we need (chunksize/32*1024)*32*1024
+    chunkSize = (float64(cacheSizeInt) * GiB) / float64(maxAllowedChunks)
+    chunkSize = math.Round(chunkSize/(32*KiB)) * (32 * KiB)
+    chunkSize = math.Min(math.Max(chunkSize, minChunkSize), maxChunkSize) / KiB
+    // default chunk size unit KiB
+    return strconv.FormatInt(int64(chunkSize), 10) + "KiB", nil
+}
+
+func InitializeDataCacheNode(nodeId string) error {
+    raidedLocalSsdPath, err := fetchRAIDedLocalSsdPath()
+    if err != nil {
+        return err
+    }
+    volumeGroupName := getVolumeGroupName(nodeId)
+
+    vgExists := checkVgExists(volumeGroupName)
+    // Check if the required volume group already exists
+    if vgExists {
+        // Clean up Volume Group before adding the PD
+        reduceVolumeGroup(volumeGroupName, true)
+
+        // validate that raidedLSSD is part of VG
+        err = validateRaidedLSSDinVG(volumeGroupName, raidedLocalSsdPath)
+        if err != nil {
+            return fmt.Errorf("failed validate local ssd in vg %v: %v", volumeGroupName, err)
+        }
+    } else {
+        err := createVg(volumeGroupName, raidedLocalSsdPath)
+        if err != nil {
+            return err
+        }
+    }
+    return nil
+}
+
+func StartWatcher(nodeName string) {
+    dirToWatch := "/dev/"
+    watcher, err := fsnotify.NewWatcher()
+    if err != nil {
+        klog.V(2).ErrorS(err, "errored while creating watcher")
+    }
+    klog.V(2).Infof("Watcher started for directory %v", dirToWatch)
+    defer watcher.Close()
+
+    // out of the box fsnotify can watch a single file, or a single directory
+    if err := watcher.Add(dirToWatch); err != nil {
+        klog.V(2).ErrorS(err, "errored while adding watcher directory")
+    }
+    errorCh := make(chan error, 1)
+    // Handle the error received from the watcher goroutine
+    go watchDiskDetaches(watcher, nodeName, errorCh)

+    select {
+    case err := <-errorCh:
+        klog.Errorf("watcher encountered an error: %v", err)
+    }
+}
+
+func watchDiskDetaches(watcher *fsnotify.Watcher, nodeName string, errorCh chan error) error {
+    for {
+        select {
+        // watch for errors
+        case err := <-watcher.Errors:
+            errorCh <- fmt.Errorf("disk update event errored: %v", err)
+        // watch for events
+        case event := <-watcher.Events:
+            // In case of an event i.e. creation or deletion of any new PV, we update the VG metadata.
+            // This might include some non-LVM changes, no harm in updating metadata multiple times.
+            reduceVolumeGroup(getVolumeGroupName(nodeName), true)
+            klog.V(2).Infof("disk attach/detach event %#v\n", event)
+        }
+    }
+}
+
+func validateRaidedLSSDinVG(vgName string, lssdPath string) error {
+    args := []string{
+        "--noheadings",
+        "-o",
+        "pv_name",
+        "--select",
+        "vg_name=" + vgName,
+    }
+    info, err := common.RunCommand("" /* pipedCmd */, nil /* pipedCmdArg */, "pvs", args...)
+    if err != nil {
+        return fmt.Errorf("errored while checking physical volume details %v: %s", err, info)
+        // On error info contains the error message which we cannot use for further steps
+    }
+
+    if !strings.Contains(string(info), lssdPath) {
+        return addRaidedLSSDToVg(vgName, lssdPath)
+    }
+    return nil
+}
+
+func addRaidedLSSDToVg(vgName, lssdPath string) error {
+    info, err := common.RunCommand("" /* pipedCmd */, nil /* pipedCmdArg */, "vgextend", []string{vgName, lssdPath}...)
+    if err != nil {
+        return fmt.Errorf("errored while extending VGs %v: %s", err, info)
+    }
+    return nil
+}
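To make the fetchChunkSizeKiB rounding concrete, here is a self-contained sketch of the same arithmetic. The constants GiB, KiB, maxAllowedChunks, minChunkSize, and maxChunkSize are defined elsewhere in the driver and are not part of this diff, so the values below are assumptions for illustration only.

package main

import (
    "fmt"
    "math"
)

// Assumed values for illustration; the driver's real constants are not shown in this commit.
const (
    KiB              = 1024.0
    GiB              = 1024.0 * 1024.0 * 1024.0
    maxAllowedChunks = 1000000.0  // assumed chunk-count ceiling
    minChunkSize     = 160 * KiB  // assumed lower bound, in bytes
    maxChunkSize     = 1024 * KiB // assumed upper bound, in bytes
)

func chunkSizeKiB(cacheSizeGi int64) string {
    // Split the cache into at most maxAllowedChunks chunks...
    chunk := (float64(cacheSizeGi) * GiB) / maxAllowedChunks
    // ...round to a multiple of 32 KiB...
    chunk = math.Round(chunk/(32*KiB)) * (32 * KiB)
    // ...then clamp to the allowed range and convert bytes to KiB, as the diff does.
    chunk = math.Min(math.Max(chunk, minChunkSize), maxChunkSize) / KiB
    return fmt.Sprintf("%dKiB", int64(chunk))
}

func main() {
    // A 375Gi cache: 375GiB/1e6 ≈ 402653 bytes per chunk, rounded to 393216 bytes.
    fmt.Println(chunkSizeKiB(375)) // prints "384KiB" under the assumed constants
}

Two design notes on the rest of the new code. The watcher is deliberately coarse: StartWatcher blocks on a one-element error channel while watchDiskDetaches runs the fsnotify event loop, and every /dev event, LVM-related or not, triggers a reduceVolumeGroup call; the diff's own comment acknowledges that redundant metadata updates are harmless. The recovery path is plain LVM: pvs --noheadings -o pv_name --select vg_name=<vg> lists the physical volumes already in the volume group, and if the RAIDed local-SSD device is missing from that list, vgextend <vg> <device> re-attaches it.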

vendor/github.com/fsnotify/fsnotify/.cirrus.yml

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default.

vendor/github.com/fsnotify/fsnotify/.gitignore

Lines changed: 9 additions & 5 deletions
Some generated files are not rendered by default.

0 commit comments