Skip to content

Commit 7a0ca67

Browse files
authored
SCHED-1071: Local NVMe support (#874)
1 parent 763b4d1 commit 7a0ca67

File tree

13 files changed

+277
-12
lines changed

13 files changed

+277
-12
lines changed

soperator/installations/example/main.tf

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ locals {
5454
subset_index = subset
5555
preemptible = nodeset.preemptible
5656
reservation_policy = nodeset.reservation_policy
57+
local_nvme = {
58+
enabled = try(nodeset.local_nvme.enabled, false)
59+
mount_path = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")
60+
filesystem_type = try(nodeset.local_nvme.filesystem_type, "ext4")
61+
}
5762
}
5863
]])
5964
}
@@ -63,6 +68,7 @@ resource "terraform_data" "check_variables" {
6368
terraform_data.check_slurm_nodeset,
6469
terraform_data.check_slurm_nodeset_accounting,
6570
terraform_data.check_nfs,
71+
terraform_data.check_local_nvme,
6672
]
6773
}
6874

@@ -414,7 +420,6 @@ module "slurm" {
414420
storage_class_name = replace("${local.storage_class_prefix}-${lower(var.node_local_image_disk.spec.disk_type)}-${lower(var.node_local_image_disk.spec.filesystem_type)}", "_", "-")
415421
} : null
416422
}
417-
418423
nfs = {
419424
enabled = var.nfs.enabled
420425
path = var.nfs.enabled ? module.nfs-server[0].nfs_export_path : null
@@ -479,6 +484,11 @@ module "slurm" {
479484
gres_config = lookup(module.resources.gres_config_by_platform, nodeset.resource.platform, null)
480485
create_partition = nodeset.create_partition != null ? nodeset.create_partition : false
481486
ephemeral_nodes = nodeset.ephemeral_nodes
487+
local_nvme = {
488+
enabled = try(nodeset.local_nvme.enabled, false)
489+
mount_path = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")
490+
filesystem_type = try(nodeset.local_nvme.filesystem_type, "ext4")
491+
}
482492
}]
483493

484494
login_allocation_id = module.k8s.static_ip_allocation_id

soperator/installations/example/terraform.tfvars

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,14 @@ slurm_nodeset_workers = [
337337
# When true, nodes will use dynamic topology injection and power management.
338338
# By default, false.
339339
ephemeral_nodes = false
340+
# Optional local NVMe passthrough for this nodeset only.
341+
# Uses local instance disks, creates a RAID0 array and mounts it on the host via cloud-init.
342+
# mount_path: path used for both host RAID mount and jail submount.
343+
# local_nvme = {
344+
# enabled = true
345+
# mount_path = "/mnt/local-nvme"
346+
# filesystem_type = "ext4"
347+
# }
340348
},
341349
]
342350

soperator/installations/example/variables.tf

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -722,6 +722,11 @@ variable "slurm_nodeset_workers" {
722722
features = optional(list(string))
723723
create_partition = optional(bool)
724724
ephemeral_nodes = optional(bool, false)
725+
local_nvme = optional(object({
726+
enabled = optional(bool, false)
727+
mount_path = optional(string, "/mnt/local-nvme")
728+
filesystem_type = optional(string, "ext4")
729+
}), {})
725730
}))
726731
nullable = false
727732
default = [{
@@ -771,6 +776,24 @@ variable "slurm_nodeset_workers" {
771776
])
772777
error_message = "Worker nodeset autoscaling.min_size must be less than or equal to size."
773778
}
779+
780+
validation {
  # Each worker nodeset passes when local NVMe is disabled; when it is enabled,
  # its mount_path (falling back to "/mnt/local-nvme") must begin with "/".
  condition = alltrue([
    for nodeset in var.slurm_nodeset_workers : (
      try(nodeset.local_nvme.enabled, false)
      ? startswith(try(nodeset.local_nvme.mount_path, "/mnt/local-nvme"), "/")
      : true
    )
  ])
  error_message = "When worker local NVMe is enabled, mount_path must be an absolute path."
}
789+
790+
validation {
  # Checked for every worker nodeset, whether or not local NVMe is enabled:
  # the filesystem type (falling back to "ext4") must be one of the two
  # supported values.
  condition = alltrue([
    for nodeset in var.slurm_nodeset_workers : contains(
      ["ext4", "xfs"],
      try(nodeset.local_nvme.filesystem_type, "ext4"),
    )
  ])
  error_message = "When worker local NVMe filesystem_type is set, it must be `ext4` or `xfs`."
}
774797
}
775798

776799
variable "slurm_nodeset_login" {
@@ -923,6 +946,26 @@ resource "terraform_data" "check_slurm_nodeset" {
923946
}
924947
}
925948

949+
# Guard resource: fails its precondition when any worker nodeset enables local
# NVMe on a region/platform/preset combination that the resources module does
# not report as supported.
resource "terraform_data" "check_local_nvme" {
  lifecycle {
    precondition {
      # Nodesets with local NVMe disabled satisfy the predicate on their own,
      # and alltrue([]) is true, so no separate "is anything enabled" guard is
      # required. A combination missing from the support matrix makes the
      # lookup fail, which try() turns into "not supported".
      condition = alltrue([
        for worker in var.slurm_nodeset_workers :
        !try(worker.local_nvme.enabled, false) || (
          try(module.resources.local_nvme_supported_by_region_platform_preset[var.region][worker.resource.platform][worker.resource.preset], false)
        )
      ])
      error_message = "Local NVMe is enabled, but one or more worker nodesets use unsupported region/platform/preset."
    }
  }
}
968+
926969
# region Worker
927970

928971
variable "slurm_worker_sshd_config_map_ref_name" {

soperator/modules/available_resources/outputs.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ output "by_platform" {
2323
value = local.presets_by_platforms
2424
}
2525

26+
output "local_nvme_supported_by_region_platform_preset" {
27+
description = "Local NVMe support matrix by region/platform/preset."
28+
value = local.local_nvme_supported_by_region_platform_preset
29+
}
30+
2631
output "k8s_ephemeral_storage_coefficient" {
2732
value = local.reserve.ephemeral_storage.coefficient
2833
}

soperator/modules/available_resources/preset.tf

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,15 @@ locals {
309309
}
310310
}
311311

312-
presets_by_platforms = tomap({
312+
# Allow-list: "${region}/${platform}/${preset}"
313+
local_nvme_supported_true_region_platform_preset = toset([
314+
# gpu-b300-sxm
315+
"${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-128c-1600g}",
316+
"${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-160c-1792g}",
317+
"${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-192c-2768g}",
318+
])
319+
320+
presets_by_platforms_raw = tomap({
313321
(local.platforms.cpu-e2) = tomap({
314322
(local.presets.p-2c-8g) = local.presets_cpu.c-2vcpu-8gb
315323
(local.presets.p-4c-16g) = local.presets_cpu.c-4vcpu-16gb
@@ -362,4 +370,25 @@ locals {
362370
(local.presets.p-8g-192c-2768g) = local.presets_gpu.g-8gpu-192vcpu-2768gb
363371
})
364372
})
373+
374+
# Support matrix keyed region => platform => preset. An entry is true only when
# "<region>/<platform>/<preset>" is present in the allow-list set
# local.local_nvme_supported_true_region_platform_preset.
local_nvme_supported_by_region_platform_preset = tomap({
  # values() yields the region names directly, replacing the hand-written
  # projection over local.regions.
  for region in values(local.regions) : region => tomap({
    for platform, presets in local.presets_by_platforms_raw : platform => tomap({
      for preset, _ in presets : preset => contains(
        local.local_nvme_supported_true_region_platform_preset,
        "${region}/${platform}/${preset}",
      )
    })
  })
})
381+
382+
# Final platform => preset => resources map. Every preset from
# local.presets_by_platforms_raw is extended with two local NVMe attributes:
#   local_nvme_supported           - true if the preset is supported in any region
#   local_nvme_supported_by_region - per-region support flag
presets_by_platforms = tomap({
  for platform, presets in local.presets_by_platforms_raw : platform => tomap({
    for preset, resources in presets : preset => merge(resources, {
      # try() maps a missing matrix entry to "not supported".
      local_nvme_supported = anytrue([
        for region in values(local.regions) :
        try(local.local_nvme_supported_by_region_platform_preset[region][platform][preset], false)
      ])
      local_nvme_supported_by_region = tomap({
        for region in values(local.regions) :
        region => try(local.local_nvme_supported_by_region_platform_preset[region][platform][preset], false)
      })
    })
  })
})
365394
}

soperator/modules/k8s/k8s_ng_workers_v2.tf

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,15 @@ resource "nebius_mk8s_v1_node_group" "worker_v2" {
137137
block_size_bytes = provider::units::from_kib(var.node_group_workers_v2[count.index].boot_disk.block_size_kibibytes)
138138
}
139139

140+
local_disks = try(var.node_group_workers_v2[count.index].local_nvme.enabled, false) ? {
141+
config = {
142+
none = true
143+
}
144+
passthrough_group = {
145+
requested = true
146+
}
147+
} : null
148+
140149
filesystems = concat(
141150
[
142151
{
@@ -166,11 +175,17 @@ resource "nebius_mk8s_v1_node_group" "worker_v2" {
166175

167176
os = "ubuntu24.04"
168177

169-
cloud_init_user_data = local.node_group_gpu_present_v2.worker[count.index] ? (
170-
local.node_cloud_init.enabled ? local.node_cloud_init.cloud_init_data : null
171-
) : (
172-
local.node_ssh_access.enabled ? local.node_cloud_init.cloud_init_data_no_nvidia : null
173-
)
178+
cloud_init_user_data = (
179+
local.node_ssh_access.enabled ||
180+
(local.node_group_gpu_present_v2.worker[count.index] && length(var.nvidia_admin_conf_lines) > 0) ||
181+
try(var.node_group_workers_v2[count.index].local_nvme.enabled, false)
182+
) ? templatefile("${path.module}/templates/cloud_init.yaml.tftpl", {
183+
ssh_users = var.node_ssh_access_users
184+
nvidia_admin_conf_lines = local.node_group_gpu_present_v2.worker[count.index] ? var.nvidia_admin_conf_lines : []
185+
local_nvme_enabled = try(var.node_group_workers_v2[count.index].local_nvme.enabled, false)
186+
local_nvme_mount_path = try(var.node_group_workers_v2[count.index].local_nvme.mount_path, "/mnt/local-nvme")
187+
local_nvme_filesystem_type = try(var.node_group_workers_v2[count.index].local_nvme.filesystem_type, "ext4")
188+
}) : null
174189
}
175190

176191
lifecycle {

soperator/modules/k8s/locals.tf

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,18 @@ locals {
66
node_cloud_init = {
77
enabled = length(var.node_ssh_access_users) > 0 || length(var.nvidia_admin_conf_lines) > 0
88
cloud_init_data = templatefile("${path.module}/templates/cloud_init.yaml.tftpl", {
9-
ssh_users = var.node_ssh_access_users
10-
nvidia_admin_conf_lines = var.nvidia_admin_conf_lines
9+
ssh_users = var.node_ssh_access_users
10+
nvidia_admin_conf_lines = var.nvidia_admin_conf_lines
11+
local_nvme_enabled = false
12+
local_nvme_mount_path = "/mnt/local-nvme"
13+
local_nvme_filesystem_type = "ext4"
1114
})
1215
cloud_init_data_no_nvidia = templatefile("${path.module}/templates/cloud_init.yaml.tftpl", {
13-
ssh_users = var.node_ssh_access_users
14-
nvidia_admin_conf_lines = []
16+
ssh_users = var.node_ssh_access_users
17+
nvidia_admin_conf_lines = []
18+
local_nvme_enabled = false
19+
local_nvme_mount_path = "/mnt/local-nvme"
20+
local_nvme_filesystem_type = "ext4"
1521
})
1622
}
1723

soperator/modules/k8s/templates/cloud_init.yaml.tftpl

Lines changed: 116 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,19 @@ users:
1212
%{ endfor }
1313
%{ endif }
1414

15-
%{ if length(nvidia_admin_conf_lines) > 0 }
15+
%{ if local_nvme_enabled }
16+
package_update: true
17+
packages:
18+
- mdadm
19+
- nvme-cli
20+
%{ if local_nvme_filesystem_type == "xfs" }
21+
- xfsprogs
22+
%{ endif }
23+
%{ endif }
24+
25+
%{ if length(nvidia_admin_conf_lines) > 0 || local_nvme_enabled }
1626
write_files:
27+
%{ if length(nvidia_admin_conf_lines) > 0 }
1728
- path: /etc/modprobe.d/nvidia_admin.conf
1829
owner: root:root
1930
permissions: "0644"
@@ -91,7 +102,111 @@ write_files:
91102
fi
92103

93104
log "end"
105+
%{ endif }
106+
%{ if local_nvme_enabled }
107+
- path: /usr/local/sbin/prepare-disks.sh
108+
owner: root:root
109+
permissions: "0755"
110+
content: |
111+
#!/usr/bin/env bash
112+
set -euo pipefail
113+
114+
MOUNTPOINT="$${1:-/mnt/local-nvme}"
115+
FILESYSTEM_TYPE="${local_nvme_filesystem_type}"
116+
MD_DEV="/dev/md0"
117+
case "$${FILESYSTEM_TYPE}" in
118+
ext4)
119+
MOUNT_OPTS="noatime,nodiratime,lazytime,commit=60"
120+
MKFS_CMD=(mkfs.ext4 -F -m 0)
121+
;;
122+
xfs)
123+
MOUNT_OPTS="noatime,nodiratime,logbufs=8,inode64"
124+
MKFS_CMD=(mkfs.xfs -f)
125+
;;
126+
*)
127+
echo "Unsupported filesystem type: $${FILESYSTEM_TYPE}"
128+
exit 1
129+
;;
130+
esac
131+
132+
echo "Detecting NVMe disks..."
133+
nvme list
134+
135+
mapfile -t NVME_DISKS < <(
136+
nvme list | awk 'NR>2 && $1 ~ /^\/dev\/nvme[0-9]+n[0-9]+$/ { print $1 }' | sort -V | uniq
137+
)
138+
139+
ROOT_SOURCE="$(findmnt -n -o SOURCE / || true)"
140+
ROOT_PKNAME="$(lsblk -no PKNAME "$${ROOT_SOURCE}" 2>/dev/null || true)"
141+
if [[ -n "$${ROOT_PKNAME}" ]]; then
142+
ROOT_DISK="/dev/$${ROOT_PKNAME}"
143+
FILTERED=()
144+
for d in "$${NVME_DISKS[@]}"; do
145+
if [[ "$${d}" != "$${ROOT_DISK}" ]]; then
146+
FILTERED+=("$${d}")
147+
fi
148+
done
149+
NVME_DISKS=("$${FILTERED[@]}")
150+
fi
151+
152+
DISK_COUNT="$${#NVME_DISKS[@]}"
153+
if (( DISK_COUNT < 2 || DISK_COUNT > 8 )); then
154+
echo "Expected 2..8 NVMe disks for RAID0, found $${DISK_COUNT}: $${NVME_DISKS[*]}"
155+
exit 1
156+
fi
157+
158+
for d in "$${NVME_DISKS[@]}"; do
159+
[[ -b "$${d}" ]] || { echo "Block device not found: $${d}"; exit 1; }
160+
done
161+
162+
echo "Using $${DISK_COUNT} NVMe disk(s): $${NVME_DISKS[*]}"
163+
164+
for d in "$${NVME_DISKS[@]}"; do
165+
DISK_BASENAME="$(basename "$${d}")"
166+
SCHEDULER_PATH="/sys/block/$${DISK_BASENAME}/queue/scheduler"
167+
if [[ -w "$${SCHEDULER_PATH}" ]]; then
168+
echo none | tee "$${SCHEDULER_PATH}" >/dev/null
169+
else
170+
echo "Skipping scheduler update for $${d}: $${SCHEDULER_PATH} is not writable"
171+
fi
172+
done
173+
174+
# Release the target md device if it already exists; a failed stop is tolerated.
if [[ -e "$${MD_DEV}" ]]; then
  mdadm --stop "$${MD_DEV}" || true
fi

# Build the RAID0 array across all selected disks. --run suppresses mdadm's
# confirmation prompt, which it raises when a member appears to hold an old
# array or filesystem (for example after the stop above); nothing can answer
# that prompt when this script is started from cloud-init.
mdadm --create "$${MD_DEV}" \
  --run \
  --level=0 \
  --raid-devices="$${DISK_COUNT}" \
  "$${NVME_DISKS[@]}"
182+
183+
udevadm settle
184+
185+
"$${MKFS_CMD[@]}" "$${MD_DEV}"
186+
187+
mkdir -p "$${MOUNTPOINT}"
188+
mount -o "$${MOUNT_OPTS}" "$${MD_DEV}" "$${MOUNTPOINT}"
189+
190+
UUID="$(blkid -s UUID -o value "$${MD_DEV}")"
191+
grep -q "$${UUID}" /etc/fstab || echo "UUID=$${UUID} $${MOUNTPOINT} $${FILESYSTEM_TYPE} $${MOUNT_OPTS},nofail 0 2" >> /etc/fstab
192+
193+
if [[ -d /etc/mdadm ]]; then
194+
mdadm --detail --scan > /etc/mdadm/mdadm.conf
195+
fi
196+
197+
echo "Done."
198+
echo "RAID device: $${MD_DEV}"
199+
echo "Mounted at : $${MOUNTPOINT}"
200+
df -h "$${MOUNTPOINT}"
201+
%{ endif }
202+
%{ endif }
203+
204+
%{ if length(nvidia_admin_conf_lines) > 0 || local_nvme_enabled }
runcmd:
%{ if length(nvidia_admin_conf_lines) > 0 }
  - [ bash, -lc, "/usr/local/bin/nvidia-conf-check.sh >> /var/log/nvidia-conf-check.log 2>&1" ]
%{ endif }
%{ if local_nvme_enabled }
  # The mount path is handed to bash as a separate argument (available as $1,
  # with "prepare-disks" filling $0) instead of being spliced into the command
  # string, so spaces or shell metacharacters in it cannot alter the command.
  # jsonencode() renders it as a quoted scalar that is also valid YAML.
  - [ bash, -lc, "/usr/local/sbin/prepare-disks.sh \"$1\" > /var/log/prepare-disks.log 2>&1", prepare-disks, ${jsonencode(local_nvme_mount_path)} ]
%{ endif }
%{ endif }

soperator/modules/k8s/variables.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,11 @@ variable "node_group_workers_v2" {
155155
policy = optional(string)
156156
reservation_ids = optional(list(string))
157157
}))
158+
local_nvme = optional(object({
159+
enabled = optional(bool, false)
160+
mount_path = optional(string, "/mnt/local-nvme")
161+
filesystem_type = optional(string, "ext4")
162+
}), {})
158163
nodeset_index = number
159164
subset_index = number
160165
}))

soperator/modules/slurm/main.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ resource "helm_release" "soperator_fluxcd_cm" {
9797
cluster_name = var.cluster_name
9898
region = var.region
9999
public_o11y_enabled = var.public_o11y_enabled
100+
has_local_nvme = anytrue([for nodeset in var.worker_nodesets : try(nodeset.local_nvme.enabled, false)])
100101
metrics_collector = local.metrics_collector
101102
create_pvcs = var.create_pvcs
102103

0 commit comments

Comments
 (0)