
Commit 8f80ec3

Add Local SSD support (#3879)

## Motivation

When deploying networks on GCP, using local SSDs gives us good performance improvements.

## Proposal

Add support for deploying networks using local SSDs. Keep in mind that this will only be used when deploying GCP networks; it will never be used for local networks, for obvious reasons (a local network already runs on the local disk anyway). It made more sense for the YAML files that set up the local CSI driver to live here, to avoid some directory gymnastics on the `linera-infra` side.

## Test Plan

Deployed networks using this code.

## Release Plan

- Nothing to do / These changes follow the usual release cycle.
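For illustration, a minimal sketch of how a deployment would opt into this path. The environment variable name comes from the `helmfile.yaml` change below; the working directory and the `helmfile apply` invocation are assumptions about how the network is deployed, not part of this commit.

```bash
# Sketch only: enable the local-SSD path before deploying (assumed invocation).
export LINERA_HELMFILE_SET_USING_LOCAL_SSD=true

cd kubernetes/linera-validator
# The "prepare" hook added below then sets up the RAID0 array and the local
# CSI driver before the releases are applied.
helmfile apply
```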
1 parent 12b82bc commit 8f80ec3

File tree

12 files changed (+357, -2 lines)
kubernetes/linera-validator/helmfile.yaml

Lines changed: 24 additions & 1 deletion
@@ -3,6 +3,30 @@ environments:
     values:
       - writeToGrafanaCloud: {{ env "LINERA_WRITE_TO_GRAFANA_CLOUD" | default "false" }}
         validatorLabel: {{ env "LINERA_VALIDATOR_LABEL" | default (printf "local-%s" (env "USER")) }}
+        usingLocalSsd: {{ env "LINERA_HELMFILE_SET_USING_LOCAL_SSD" | default "false" }}
+
+helmDefaults:
+  wait: true
+  recreatePods: false
+
+{{- if .Values.usingLocalSsd }}
+hooks:
+  - events: ["prepare"]
+    showlogs: true
+    command: bash
+    args:
+      - -c
+      - |
+        echo "Ensuring RAID0 and Local-CSI driver are setup..."
+
+        kubectl create -f ./scylla-setup/gke-daemonset-raid-disks.yaml
+        kubectl -n default rollout status daemonset/gke-raid-disks
+
+        kubectl apply -f ./scylla-setup/local-csi-driver
+        kubectl -n local-csi-driver rollout status daemonset.apps/local-csi-driver
+
+        kubectl apply -f ./scylla-setup/local-ssd-sc.yaml
+{{- end }}
 
 ---
 
@@ -69,4 +93,3 @@ releases:
       set:
         - name: crds.enabled
          value: "true"
-
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gke-raid-disks
  namespace: default
  labels:
    k8s-app: gke-raid-disks
spec:
  selector:
    matchLabels:
      name: gke-raid-disks
  template:
    metadata:
      labels:
        name: gke-raid-disks
    spec:
      nodeSelector:
        cloud.google.com/gke-local-nvme-ssd: "true"
      hostPID: true
      containers:
        - name: startup-script
          image: registry.k8s.io/startup-script:v1
          securityContext:
            privileged: true
          env:
            - name: STARTUP_SCRIPT
              value: |
                set -o errexit
                set -o nounset
                set -o pipefail

                # Ensure we have the XFS tools
                if ! command -v mkfs.xfs >/dev/null; then
                  echo "mkfs.xfs not found! Installing xfsprogs..."
                  apt-get update
                  DEBIAN_FRONTEND=noninteractive \
                    apt-get install -y --no-install-recommends xfsprogs
                fi

                devices=()
                for ssd in /dev/disk/by-id/google-local-ssd-block*; do
                  if [ -e "${ssd}" ]; then
                    devices+=("${ssd}")
                  fi
                done
                if [ "${#devices[@]}" -eq 0 ]; then
                  echo "No Local NVMe SSD disks found."
                  exit 1
                fi

                seen_arrays=(/dev/md/*)
                device=${seen_arrays[0]}
                echo "Setting RAID array with Local SSDs on device ${device}"
                if [ ! -e "$device" ]; then
                  device="/dev/md/0"
                  echo "y" | mdadm --create "${device}" --level=0 --force --raid-devices=${#devices[@]} "${devices[@]}"
                fi

                if ! blkid "${device}" >/dev/null 2>&1 ; then
                  echo "Formatting '${device}'"
                  mkfs.xfs -f "${device}"
                fi

                mountpoint=/mnt/disks/raid
                mkdir -p "${mountpoint}"
                echo "Mounting '${device}' at '${mountpoint}'"
                mount -o discard,prjquota,noatime,nodiratime "${device}" "${mountpoint}"
                chmod a+w "${mountpoint}"
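Not part of the change, but a quick way to sanity-check that the startup script assembled and mounted the RAID0 array on every local-SSD node:

```bash
# Sketch: verify the gke-raid-disks DaemonSet ran on the local-SSD nodes.
kubectl -n default get pods -l name=gke-raid-disks        # one pod per matching node
kubectl -n default logs daemonset/gke-raid-disks | tail   # startup-script output

# On a node itself (via SSH or a privileged debug pod), the array and mount should show as:
#   mdadm --detail /dev/md/0
#   findmnt /mnt/disks/raid
```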
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: local-csi-driver
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
apiVersion: storage.k8s.io/v1
kind: CSIDriver
metadata:
  name: local.csi.scylladb.com
spec:
  attachRequired: false
  storageCapacity: true
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
apiVersion: v1
kind: ServiceAccount
metadata:
  name: local-csi-driver
  namespace: local-csi-driver
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: scylladb:csi-external-provisioner
rules:
  - apiGroups:
      - ""
    resources:
      - "persistentvolumes"
    verbs:
      - "get"
      - "list"
      - "watch"
      - "create"
      - "delete"
  - apiGroups:
      - ""
    resources:
      - "persistentvolumeclaims"
    verbs:
      - "get"
      - "list"
      - "watch"
      - "update"
  - apiGroups:
      - "storage.k8s.io"
    resources:
      - "storageclasses"
    verbs:
      - "get"
      - "list"
      - "watch"
  - apiGroups:
      - ""
    resources:
      - "events"
    verbs:
      - "list"
      - "watch"
      - "create"
      - "update"
      - "patch"
  - apiGroups:
      - "snapshot.storage.k8s.io"
    resources:
      - "volumesnapshots"
    verbs:
      - "get"
      - "list"
  - apiGroups:
      - "snapshot.storage.k8s.io"
    resources:
      - "volumesnapshotcontents"
    verbs:
      - "get"
      - "list"
  - apiGroups:
      - "storage.k8s.io"
    resources:
      - "csinodes"
    verbs:
      - "get"
      - "list"
      - "watch"
  - apiGroups:
      - ""
    resources:
      - "nodes"
    verbs:
      - "get"
      - "list"
      - "watch"
  - apiGroups:
      - "storage.k8s.io"
    resources:
      - "csistoragecapacities"
    verbs:
      - "get"
      - "list"
      - "watch"
      - "create"
      - "update"
      - "patch"
      - "delete"
  # The GET permissions below are needed for walking up the ownership chain
  # for CSIStorageCapacity. They are sufficient for deployment via
  # StatefulSet (only needs to get Pod) and Deployment (needs to get
  # Pod and then ReplicaSet to find the Deployment).
  - apiGroups:
      - ""
    resources:
      - "pods"
    verbs:
      - "get"
  - apiGroups:
      - "apps"
    resources:
      - "replicasets"
    verbs:
      - "get"
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: scylladb:csi-external-provisioner
subjects:
  - kind: ServiceAccount
    name: local-csi-driver
    namespace: local-csi-driver
roleRef:
  kind: ClusterRole
  name: scylladb:csi-external-provisioner
  apiGroup: rbac.authorization.k8s.io
Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
kind: DaemonSet
apiVersion: apps/v1
metadata:
  name: local-csi-driver
  namespace: local-csi-driver
  labels:
    app.kubernetes.io/name: local-csi-driver
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: local-csi-driver
  template:
    metadata:
      labels:
        app.kubernetes.io/name: local-csi-driver
    spec:
      nodeSelector:
        kubernetes.io/os: linux
      serviceAccountName: local-csi-driver
      tolerations:
        - operator: Exists
      containers:
        - name: local-csi-driver
          securityContext:
            privileged: true
          image: docker.io/scylladb/local-csi-driver:latest
          imagePullPolicy: IfNotPresent
          args:
            - --listen=/csi/csi.sock
            - --node-name=$(NODE_NAME)
            - --volumes-dir=/mnt/disks/raid
            - --v=2
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          volumeMounts:
            - name: kubelet-dir
              mountPath: /var/lib/kubelet
              mountPropagation: "Bidirectional"
            - name: plugin-dir
              mountPath: /csi
            - name: volumes-dir
              mountPath: /mnt/disks/raid
          ports:
            - name: healthz
              containerPort: 9809
              protocol: TCP
          livenessProbe:
            httpGet:
              path: /healthz
              port: healthz
            initialDelaySeconds: 10
            timeoutSeconds: 3
            periodSeconds: 2
            failureThreshold: 5
        - name: csi-driver-registrar
          image: registry.k8s.io/sig-storage/csi-node-driver-registrar@sha256:fdff3ee285341bc58033b6b2458a5d45fd90ec6922a8ba6ebdd49b0c41e2cd34
          imagePullPolicy: IfNotPresent
          args:
            - --csi-address=/csi/csi.sock
            - --kubelet-registration-path=/var/lib/kubelet/plugins/local.csi.scylladb.com/csi.sock
          volumeMounts:
            - name: plugin-dir
              mountPath: /csi
            - name: registration-dir
              mountPath: /registration
        - name: liveness-probe
          image: registry.k8s.io/sig-storage/livenessprobe@sha256:cacee2b5c36dd59d4c7e8469c05c9e4ef53ecb2df9025fa8c10cdaf61bce62f0
          imagePullPolicy: IfNotPresent
          args:
            - --csi-address=/csi/csi.sock
            - --health-port=9809
            - --v=2
          volumeMounts:
            - name: plugin-dir
              mountPath: /csi
        - name: csi-provisioner
          image: registry.k8s.io/sig-storage/csi-provisioner@sha256:ee3b525d5b89db99da3b8eb521d9cd90cb6e9ef0fbb651e98bb37be78d36b5b8
          imagePullPolicy: IfNotPresent
          args:
            - --csi-address=/csi/csi.sock
            - --v=2
            - --node-deployment
            - --feature-gates=Topology=true
            - --immediate-topology=false
            - --enable-capacity
            - --capacity-ownerref-level=0
            - --capacity-poll-interval=30s
            - --default-fstype=xfs
          env:
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          volumeMounts:
            - name: plugin-dir
              mountPath: /csi
      volumes:
        - name: kubelet-dir
          hostPath:
            path: /var/lib/kubelet
            type: Directory
        - name: plugin-dir
          hostPath:
            path: /var/lib/kubelet/plugins/local.csi.scylladb.com/
            type: DirectoryOrCreate
        - name: registration-dir
          hostPath:
            path: /var/lib/kubelet/plugins_registry/
            type: Directory
        - name: volumes-dir
          hostPath:
            path: /mnt/disks/raid
            type: Directory
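Again just a sketch rather than part of the diff: these commands confirm that the driver came up, registered with Kubernetes, and is publishing storage capacity.

```bash
# Sketch: confirm the ScyllaDB local CSI driver is running and registered.
kubectl -n local-csi-driver rollout status daemonset.apps/local-csi-driver
kubectl get csidriver local.csi.scylladb.com
# Capacity objects appear once the StorageClass below exists and nodes report free space.
kubectl get csistoragecapacities -A
```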
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: nvme-ssd-block
provisioner: local.csi.scylladb.com
volumeBindingMode: WaitForFirstConsumer
reclaimPolicy: Delete
allowVolumeExpansion: false
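For illustration, a workload consumes this class through an ordinary PVC; the claim name and size below are made up and not part of this commit. With `WaitForFirstConsumer`, the volume is only provisioned once a pod using the claim is scheduled onto a node with the RAID0 mount.

```bash
# Sketch: an example claim against the nvme-ssd-block class (name and size are illustrative).
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: example-local-ssd-claim
spec:
  storageClassName: nvme-ssd-block
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
EOF
```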

kubernetes/linera-validator/scylla.values.yaml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ sysctls:
   - "fs.aio-max-nr=4082080"
 datacenter: validator
 racks:
-  - name: rack-1
+  - name: rack
     members: 1
     scyllaConfig: "scylla-config"
     storage:
