Commit f61a8ab

Adding Raid LSSD logic
1 parent 8408dc6 commit f61a8ab

4 files changed (+92, -13 lines)


Dockerfile

Lines changed: 1 addition & 1 deletion
```diff
@@ -25,7 +25,7 @@ RUN GOARCH=$(echo $TARGETPLATFORM | cut -f2 -d '/') GCE_PD_CSI_STAGING_VERSION=$
 FROM gke.gcr.io/debian-base:bullseye-v1.4.3-gke.5 as debian
 # Install necessary dependencies
 # google_nvme_id script depends on the following packages: nvme-cli, xxd, bash
-RUN clean-install util-linux e2fsprogs mount ca-certificates udev xfsprogs nvme-cli xxd bash kmod lvm2
+RUN clean-install util-linux e2fsprogs mount ca-certificates udev xfsprogs nvme-cli xxd bash kmod lvm2 mdadm

 # Since we're leveraging apt to pull in dependencies, we use `gcr.io/distroless/base` because it includes glibc.
 FROM gcr.io/distroless/base-debian11 as distroless-base
```
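The added `mdadm` package provides the binary that the new RAID helpers in `pkg/gce-pd-csi-driver/cache.go` (further down) shell out to, so it must be present in the debian base stage of the image. Purely as an illustration, and not part of this commit, a startup guard could confirm the binary is on PATH; the `verifyMdadm` helper below is hypothetical:

```go
package main

import (
	"os/exec"

	"k8s.io/klog/v2"
)

// verifyMdadm is a hypothetical guard (not in this commit): it checks that the
// mdadm binary installed by the Dockerfile change is actually on PATH before
// any RAID setup is attempted.
func verifyMdadm() {
	if _, err := exec.LookPath("mdadm"); err != nil {
		klog.Warningf("mdadm not found in image; RAIDing local SSDs will fail: %v", err)
	}
}

func main() {
	verifyMdadm()
}
```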

cmd/gce-pd-csi-driver/main.go

Lines changed: 1 addition & 0 deletions
```diff
@@ -251,6 +251,7 @@ func handle() {
 	gce.WaitForOpBackoff.Steps = *waitForOpBackoffSteps
 	gce.WaitForOpBackoff.Cap = *waitForOpBackoffCap

+	driver.RaidLocalSsds()
 	gceDriver.Run(*endpoint, *grpcLogCharCap, *enableOtelTracing)
 }

```
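`RaidLocalSsds` returns an `error` (see `cache.go` below), which this call site drops. A minimal sketch of checking it at the same spot, assuming `klog` is imported in `main.go` as elsewhere in the driver:

```go
// Sketch only: log RAID setup failures instead of discarding the return value.
if err := driver.RaidLocalSsds(); err != nil {
	klog.Errorf("failed to RAID local SSDs: %v", err)
}
gceDriver.Run(*endpoint, *grpcLogCharCap, *enableOtelTracing)
```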

pkg/gce-pd-csi-driver/cache.go

Lines changed: 88 additions & 10 deletions
```diff
@@ -2,6 +2,7 @@ package gceGCEDriver

 import (
 	"fmt"
+	"strconv"
 	"strings"

 	csi "github.com/container-storage-interface/spec/lib/go/csi"
@@ -13,11 +14,15 @@ import (

 const cacheSuffix = "csi-fast"
 const mainLvSuffix = "csi-main"
+const raidedLocalSsdName = "csi-driver-data-cache"
+const raidMode = "0"
+const raidedLssdPrefix = "/dev/md/"

-func SetupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId string) (string, error) {
+func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId string) (string, error) {
 	volumeId := req.GetVolumeId()
 	volumeGroupName := getVolumeGroupName(nodeId)
 	mainDevicePath := "/dev/" + volumeGroupName + "/" + getLvName(mainLvSuffix, volumeId)
+	mainLvName := getLvName(mainLvSuffix, volumeId)
 	klog.V(2).Infof("====== Start LVM PoC NodeStageVolume Steps ======")
 	klog.V(2).Infof("====== volumeGroupName is %v ======", volumeGroupName)

@@ -34,7 +39,7 @@ func SetupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 		klog.V(2).Infof("============= VG exists, now check if PD is part of VG============")

 		// Clean up Volume Group before adding the PD
-		reduceVolumeGroup(volumeGroupName)
+		reduceVolumeGroup(volumeGroupName, true)
 	} else {
 		err := createVg(volumeGroupName, devicePath)
 		if err != nil {
@@ -89,16 +94,26 @@ func SetupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 		if err != nil {
 			klog.Errorf("Errored while deactivating VG %v: err: %v: %s", vgNameForPv, err, info)
 		}
+		// Uncache LV
+		args = []string{
+			"--uncache",
+			vgNameForPv + "/" + mainLvName,
+		}
+		info, err = common.RunCommand("lvconvert", args...)
+		if err != nil {
+			klog.Errorf("errored while uncaching main LV %v: %s", err, info)
+			// On error info contains the error message which we cannot use for further steps
+		}

-		reduceVolumeGroup(vgNameForPv)
+		reduceVolumeGroup(vgNameForPv, false)
 		klog.V(2).Infof("==========Merge VG %v to Node VG %v==========", vgNameForPv, volumeGroupName)
 		info, err = common.RunCommand("vgmerge", []string{volumeGroupName, vgNameForPv}...)
 		if err != nil {
 			klog.Errorf("Errored while merging Volume group %s into %s %v: %s", vgNameForPv, volumeGroupName, err, info)
 		}

 		klog.V(2).Infof("==========Remove VG from node %v ==========", vgNameForPv)
-		info, err = common.RunCommand("cgremove", []string{vgNameForPv, "-y"}...)
+		info, err = common.RunCommand("vgremove", []string{vgNameForPv, "-y"}...)
 		if err != nil {
 			klog.Errorf("Errored while removing Volume group %s: info:%s, error:%v", vgNameForPv, err, info)
 		}
@@ -111,7 +126,6 @@ func SetupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 		}
 	}

-	mainLvName := getLvName(mainLvSuffix, volumeId)
 	// Create LV if not already created
 	args = []string{
 		"--select",
@@ -124,7 +138,7 @@ func SetupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 		klog.V(2).Infof("====== lvs error %v: %s ======", err, info)
 		return mainDevicePath, fmt.Errorf("lv list error %w: %s", err, info)
 	}
-	klog.Info("=============== Got LVs %s on Volume group %s ============", lvList, volumeGroupName)
+	klog.Info("=============== Got LVs %s on Volume group %s ============", string(lvList), volumeGroupName)
 	if !strings.Contains(string(lvList), mainLvName) {
 		// lvcreate -n main -l 100%PVS cachegroup /dev/sdb
 		klog.V(2).Infof("====== lvcreate main cache layer ======")
@@ -215,7 +229,7 @@ func SetupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 	return mainDevicePath, nil
 }

-func CleanupCache(volumeId string, nodeId string) error {
+func cleanupCache(volumeId string, nodeId string) error {

 	volumeGroupName := getVolumeGroupName(nodeId)
 	klog.V(2).Infof("=============Deactivating volume %s/%s=====", volumeGroupName, volumeId)
@@ -236,7 +250,6 @@ func CleanupCache(volumeId string, nodeId string) error {
 		klog.Errorf("Errored while uncaching the disk %v: %s", err, info)
 		return fmt.Errorf("errored while uncaching the disk %w: %s", err, info)
 	}
-	reduceVolumeGroup(volumeGroupName)
 	return nil
 }

@@ -278,15 +291,80 @@ func createVg(volumeGroupName string, devicePath string) error {
 	return nil
 }

-func reduceVolumeGroup(volumeGroupName string) {
+func reduceVolumeGroup(volumeGroupName string, force bool) {
 	klog.V(2).Infof("=========Cleanup VG========")
 	args := []string{
 		"--removemissing",
-		"--force",
 		volumeGroupName,
 	}
+	if force {
+		args = append(args, "--force")
+	}
 	info, err := common.RunCommand("vgreduce", args...)
 	if err != nil {
 		klog.Errorf("Errored while cleaning up volume group %v: %s", err, info)
 	}
 }
+
+func RaidLocalSsds() error {
+	isRaided, err := isRaided()
+	if err != nil {
+		klog.V(2).Info("======Errored while scanning for available LocalSSDs err:%v; continuing Raiding=======", err)
+	} else if isRaided {
+		klog.V(2).Infof("===============Local SSDs are already RAIDed==============")
+		return nil
+	}
+	info, err := common.RunCommand("nvme", []string{"list"}...)
+	if err != nil {
+		klog.Errorf("nvme list error %v: %s", err, info)
+		return fmt.Errorf("errored while scanning available NVME disks info: %v; err:%v", info, err)
+	}
+	klog.V(2).Infof("==========NVME list %v========", string(info))
+	infoString := strings.TrimSpace(strings.ReplaceAll(string(info), "\n", " "))
+	infoString = strings.ReplaceAll(infoString, "\"", "")
+	infoSlice := strings.Split(strings.TrimSpace(infoString), " ")
+	diskList := []string{}
+	for _, diskInfo := range infoSlice {
+		diskName := strings.TrimSpace(diskInfo)
+		if strings.HasPrefix(diskName, "/dev/n") {
+			diskList = append(diskList, diskName)
+		}
+	}
+	nvmeDiskCount := len(diskList)
+	nvmeDiskList := strings.Join(diskList, " ")
+	klog.V(2).Infof("========= nvmeDiskCount %v; nvmeDislList: %v ================", nvmeDiskCount, nvmeDiskList)
+	args := []string{
+		"--create",
+		raidedLssdPrefix + raidedLocalSsdName,
+		"-l=" + raidMode,
+		// Force RAIDing as sometime it might fail for caution if there is just 1 LSSD present as 1 LSSD need not be RAIDed
+		"--force",
+		"-n",
+		strconv.Itoa(nvmeDiskCount),
+		nvmeDiskList,
+	}
+	info, err = common.RunCommand("mdadm", args...)
+	if err != nil {
+		klog.Errorf("Errored while RAIDing LSSDs %v: %s", err, info)
+		return fmt.Errorf("errored while RAIDing LSSDs info: %v; err:%v", info, err)
+	} else {
+		klog.V(2).Infof("========RAIDed Local SSDs in mode 0===========")
+	}
+	return nil
+}
+
+func isRaided() (bool, error) {
+	args := []string{
+		"--detail",
+		"--scan",
+	}
+	info, err := common.RunCommand("mdadm", args...)
+	if err != nil {
+		return false, fmt.Errorf("errored while scanning for raided LSSD %v: %s", err, info)
+	}
+	klog.V(2).Infof("=========== Got LSSDs %v===========", string(info))
+	if strings.Contains(string(info), raidedLssdPrefix+raidedLocalSsdName) {
+		return true, nil
+	}
+	return false, nil
+}
```
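For reference, each string in `args` becomes one argument to `mdadm`, so the `--create` call above amounts to `mdadm --create /dev/md/csi-driver-data-cache -l=0 --force -n <count> <devices>`. A sketch of the same invocation with the device paths appended as individual arguments, reusing the names from the diff (illustrative only, not the committed code):

```go
// Illustrative variant: append each /dev/nvme* path as its own argv entry
// instead of joining them into one space-separated string.
args := []string{
	"--create",
	raidedLssdPrefix + raidedLocalSsdName, // /dev/md/csi-driver-data-cache
	"-l=" + raidMode,                      // RAID 0
	"--force",
	"-n",
	strconv.Itoa(nvmeDiskCount),
}
args = append(args, diskList...) // one argument per NVMe device path
info, err = common.RunCommand("mdadm", args...)
```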

pkg/gce-pd-csi-driver/node.go

Lines changed: 2 additions & 2 deletions
```diff
@@ -325,7 +325,7 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
 		if err != nil {
 			klog.Errorf("filepath.EvalSymlinks(%q) failed when trying to create volume group: %v", devicePath, err)
 		}
-		devicePath, err = SetupCaching(devFsPath, req, nodeId)
+		devicePath, err = setupCaching(devFsPath, req, nodeId)
 		if err != nil {
 			return nil, status.Error(codes.Internal, fmt.Sprintf("Error setting up cache: %v", err.Error()))
 		}
@@ -492,7 +492,7 @@ func (ns *GCENodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUns
 	// // volumeID format is projects/songsunny-joonix/zones/us-central1-b/disks/pvc-ef877b3e-b116-411e-9553-42f7c74bbcd4
 	// volumeGroupName := "cache-" + pvcNameStringSlice[len(pvcNameStringSlice)-1]
 	nodeId := ns.MetadataService.GetName()
-	err = CleanupCache(volumeID, nodeId)
+	err = cleanupCache(volumeID, nodeId)
 	if err != nil {
 		klog.Errorf("Failed to cleanup cache: %v", err)
 	}
```
