diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf
index 22cee978d..abf34d87f 100644
--- a/soperator/installations/example/main.tf
+++ b/soperator/installations/example/main.tf
@@ -54,6 +54,11 @@ locals {
         subset_index       = subset
         preemptible        = nodeset.preemptible
         reservation_policy = nodeset.reservation_policy
+        local_nvme = {
+          enabled         = try(nodeset.local_nvme.enabled, false)
+          mount_path      = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")
+          filesystem_type = try(nodeset.local_nvme.filesystem_type, "ext4")
+        }
       }
   ]])
 }
@@ -63,6 +68,7 @@ resource "terraform_data" "check_variables" {
     terraform_data.check_slurm_nodeset,
     terraform_data.check_slurm_nodeset_accounting,
     terraform_data.check_nfs,
+    terraform_data.check_local_nvme,
   ]
 }
 
@@ -414,7 +420,6 @@ module "slurm" {
       storage_class_name = replace("${local.storage_class_prefix}-${lower(var.node_local_image_disk.spec.disk_type)}-${lower(var.node_local_image_disk.spec.filesystem_type)}", "_", "-")
     } : null
   }
-
   nfs = {
     enabled = var.nfs.enabled
     path    = var.nfs.enabled ? module.nfs-server[0].nfs_export_path : null
@@ -479,6 +484,11 @@ module "slurm" {
     gres_config      = lookup(module.resources.gres_config_by_platform, nodeset.resource.platform, null)
     create_partition = nodeset.create_partition != null ? nodeset.create_partition : false
     ephemeral_nodes  = nodeset.ephemeral_nodes
+    local_nvme = {
+      enabled         = try(nodeset.local_nvme.enabled, false)
+      mount_path      = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")
+      filesystem_type = try(nodeset.local_nvme.filesystem_type, "ext4")
+    }
   }]
 
   login_allocation_id = module.k8s.static_ip_allocation_id
diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars
index cf85444fa..597e565a8 100644
--- a/soperator/installations/example/terraform.tfvars
+++ b/soperator/installations/example/terraform.tfvars
@@ -337,6 +337,14 @@ slurm_nodeset_workers = [
     # When true, nodes will use dynamic topology injection and power management.
     # By default, false.
     ephemeral_nodes = false
+    # Optional local NVMe passthrough for this nodeset only.
+    # Uses local instance disks, creates a RAID0 array and mounts it on the host via cloud-init.
+    # mount_path: path used for both host RAID mount and jail submount.
+    # local_nvme = {
+    #   enabled = true
+    #   mount_path = "/mnt/local-nvme"
+    #   filesystem_type = "ext4"
+    # }
   },
 ]
 
diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf
index fd0904efe..b75be4787 100644
--- a/soperator/installations/example/variables.tf
+++ b/soperator/installations/example/variables.tf
@@ -722,6 +722,11 @@ variable "slurm_nodeset_workers" {
     features         = optional(list(string))
     create_partition = optional(bool)
     ephemeral_nodes  = optional(bool, false)
+    local_nvme = optional(object({
+      enabled         = optional(bool, false)
+      mount_path      = optional(string, "/mnt/local-nvme")
+      filesystem_type = optional(string, "ext4")
+    }), {})
   }))
   nullable = false
   default = [{
@@ -771,6 +776,24 @@ variable "slurm_nodeset_workers" {
     ])
     error_message = "Worker nodeset autoscaling.min_size must be less than or equal to size."
} + + validation { + condition = alltrue([ + for worker in var.slurm_nodeset_workers : + !try(worker.local_nvme.enabled, false) || ( + startswith(try(worker.local_nvme.mount_path, "/mnt/local-nvme"), "/") + ) + ]) + error_message = "When worker local NVMe is enabled, mount_path must be an absolute path." + } + + validation { + condition = alltrue([ + for worker in var.slurm_nodeset_workers : + contains(["ext4", "xfs"], try(worker.local_nvme.filesystem_type, "ext4")) + ]) + error_message = "When worker local NVMe filesystem_type is set, it must be `ext4` or `xfs`." + } } variable "slurm_nodeset_login" { @@ -923,6 +946,26 @@ resource "terraform_data" "check_slurm_nodeset" { } } +resource "terraform_data" "check_local_nvme" { + lifecycle { + precondition { + condition = ( + !anytrue([ + for worker in var.slurm_nodeset_workers : + try(worker.local_nvme.enabled, false) + ]) || + alltrue([ + for worker in var.slurm_nodeset_workers : + !try(worker.local_nvme.enabled, false) || ( + try(module.resources.local_nvme_supported_by_region_platform_preset[var.region][worker.resource.platform][worker.resource.preset], false) + ) + ]) + ) + error_message = "Local NVMe is enabled, but one or more worker nodesets use unsupported region/platform/preset." + } + } +} + # region Worker variable "slurm_worker_sshd_config_map_ref_name" { diff --git a/soperator/modules/available_resources/outputs.tf b/soperator/modules/available_resources/outputs.tf index f28bfc786..e79b774c9 100644 --- a/soperator/modules/available_resources/outputs.tf +++ b/soperator/modules/available_resources/outputs.tf @@ -23,6 +23,11 @@ output "by_platform" { value = local.presets_by_platforms } +output "local_nvme_supported_by_region_platform_preset" { + description = "Local NVMe support matrix by region/platform/preset." 
+  value       = local.local_nvme_supported_by_region_platform_preset
+}
+
 output "k8s_ephemeral_storage_coefficient" {
   value = local.reserve.ephemeral_storage.coefficient
 }
diff --git a/soperator/modules/available_resources/preset.tf b/soperator/modules/available_resources/preset.tf
index e07dfcee3..b25d2d4ea 100644
--- a/soperator/modules/available_resources/preset.tf
+++ b/soperator/modules/available_resources/preset.tf
@@ -309,7 +309,15 @@ locals {
     }
   }
 
-  presets_by_platforms = tomap({
+  # Allow-list: "${region}/${platform}/${preset}"
+  local_nvme_supported_true_region_platform_preset = toset([
+    # gpu-b300-sxm
+    "${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-128c-1600g}",
+    "${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-160c-1792g}",
+    "${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-192c-2768g}",
+  ])
+
+  presets_by_platforms_raw = tomap({
     (local.platforms.cpu-e2) = tomap({
       (local.presets.p-2c-8g)  = local.presets_cpu.c-2vcpu-8gb
       (local.presets.p-4c-16g) = local.presets_cpu.c-4vcpu-16gb
@@ -362,4 +370,25 @@ locals {
       (local.presets.p-8g-192c-2768g) = local.presets_gpu.g-8gpu-192vcpu-2768gb
     })
   })
+
+  local_nvme_supported_by_region_platform_preset = tomap({
+    for region in values(local.regions) : region => tomap({
+      for platform, presets in local.presets_by_platforms_raw : platform => tomap({
+        for preset, _ in presets : preset => contains(local.local_nvme_supported_true_region_platform_preset, "${region}/${platform}/${preset}")
+      })
+    })
+  })
+
+  presets_by_platforms = tomap({
+    for platform, presets in local.presets_by_platforms_raw : platform => tomap({
+      for preset, resources in presets : preset => merge(resources, {
+        local_nvme_supported = anytrue([
+          for region in values(local.regions) : try(local.local_nvme_supported_by_region_platform_preset[region][platform][preset], false)
+        ])
+        local_nvme_supported_by_region = tomap({
+          for region in values(local.regions) : region => try(local.local_nvme_supported_by_region_platform_preset[region][platform][preset], false)
+        })
+      })
+    })
+  })
 }
diff --git a/soperator/modules/k8s/k8s_ng_workers_v2.tf b/soperator/modules/k8s/k8s_ng_workers_v2.tf
index 976d134da..6232f707c 100644
--- a/soperator/modules/k8s/k8s_ng_workers_v2.tf
+++ b/soperator/modules/k8s/k8s_ng_workers_v2.tf
@@ -137,6 +137,15 @@ resource "nebius_mk8s_v1_node_group" "worker_v2" {
       block_size_bytes = provider::units::from_kib(var.node_group_workers_v2[count.index].boot_disk.block_size_kibibytes)
     }
 
+    local_disks = try(var.node_group_workers_v2[count.index].local_nvme.enabled, false) ? {
+      config = {
+        none = true
+      }
+      passthrough_group = {
+        requested = true
+      }
+    } : null
+
     filesystems = concat(
       [
         {
@@ -166,11 +175,17 @@ resource "nebius_mk8s_v1_node_group" "worker_v2" {
 
     os = "ubuntu24.04"
 
-    cloud_init_user_data = local.node_group_gpu_present_v2.worker[count.index] ? (
-      local.node_cloud_init.enabled ? local.node_cloud_init.cloud_init_data : null
-      ) : (
-      local.node_ssh_access.enabled ? local.node_cloud_init.cloud_init_data_no_nvidia : null
-    )
+    cloud_init_user_data = (
+      local.node_ssh_access.enabled ||
+      (local.node_group_gpu_present_v2.worker[count.index] && length(var.nvidia_admin_conf_lines) > 0) ||
+      try(var.node_group_workers_v2[count.index].local_nvme.enabled, false)
+      ) ? templatefile("${path.module}/templates/cloud_init.yaml.tftpl", {
+        ssh_users                  = var.node_ssh_access_users
+        nvidia_admin_conf_lines    = local.node_group_gpu_present_v2.worker[count.index] ? var.nvidia_admin_conf_lines : []
+        local_nvme_enabled         = try(var.node_group_workers_v2[count.index].local_nvme.enabled, false)
+        local_nvme_mount_path      = try(var.node_group_workers_v2[count.index].local_nvme.mount_path, "/mnt/local-nvme")
+        local_nvme_filesystem_type = try(var.node_group_workers_v2[count.index].local_nvme.filesystem_type, "ext4")
+    }) : null
   }
 
   lifecycle {
diff --git a/soperator/modules/k8s/locals.tf b/soperator/modules/k8s/locals.tf
index 3b64555b8..684bc00f9 100644
--- a/soperator/modules/k8s/locals.tf
+++ b/soperator/modules/k8s/locals.tf
@@ -6,12 +6,18 @@ locals {
   node_cloud_init = {
     enabled = length(var.node_ssh_access_users) > 0 || length(var.nvidia_admin_conf_lines) > 0
     cloud_init_data = templatefile("${path.module}/templates/cloud_init.yaml.tftpl", {
-      ssh_users               = var.node_ssh_access_users
-      nvidia_admin_conf_lines = var.nvidia_admin_conf_lines
+      ssh_users                  = var.node_ssh_access_users
+      nvidia_admin_conf_lines    = var.nvidia_admin_conf_lines
+      local_nvme_enabled         = false
+      local_nvme_mount_path      = "/mnt/local-nvme"
+      local_nvme_filesystem_type = "ext4"
     })
     cloud_init_data_no_nvidia = templatefile("${path.module}/templates/cloud_init.yaml.tftpl", {
-      ssh_users               = var.node_ssh_access_users
-      nvidia_admin_conf_lines = []
+      ssh_users                  = var.node_ssh_access_users
+      nvidia_admin_conf_lines    = []
+      local_nvme_enabled         = false
+      local_nvme_mount_path      = "/mnt/local-nvme"
+      local_nvme_filesystem_type = "ext4"
     })
   }
 
diff --git a/soperator/modules/k8s/templates/cloud_init.yaml.tftpl b/soperator/modules/k8s/templates/cloud_init.yaml.tftpl
index b8e8f8b1c..2eb732dd5 100644
--- a/soperator/modules/k8s/templates/cloud_init.yaml.tftpl
+++ b/soperator/modules/k8s/templates/cloud_init.yaml.tftpl
@@ -12,8 +12,19 @@ users:
 %{ endfor }
 %{ endif }
 
-%{ if length(nvidia_admin_conf_lines) > 0 }
+%{ if local_nvme_enabled }
+package_update: true
+packages:
+  - mdadm
+  - nvme-cli
+%{ if local_nvme_filesystem_type == "xfs" }
+  - xfsprogs
+%{ endif }
+%{ endif }
+
+%{ if length(nvidia_admin_conf_lines) > 0 || local_nvme_enabled }
 write_files:
+%{ if length(nvidia_admin_conf_lines) > 0 }
   - path: /etc/modprobe.d/nvidia_admin.conf
     owner: root:root
     permissions: "0644"
@@ -91,7 +102,121 @@ write_files:
           fi
 
           log "end"
 
+%{ endif }
+%{ if local_nvme_enabled }
+  - path: /usr/local/sbin/prepare-disks.sh
+    owner: root:root
+    permissions: "0755"
+    content: |
+      #!/usr/bin/env bash
+      # Assembles the local NVMe disks (excluding the root disk) into a RAID0 array,
+      # formats it and mounts it at the path given as the first argument.
+      set -euo pipefail
+
+      MOUNTPOINT="$${1:-/mnt/local-nvme}"
+      FILESYSTEM_TYPE="${local_nvme_filesystem_type}"
+      MD_DEV="/dev/md0"
+      case "$${FILESYSTEM_TYPE}" in
+        ext4)
+          MOUNT_OPTS="noatime,nodiratime,lazytime,commit=60"
+          MKFS_CMD=(mkfs.ext4 -F -m 0)
+          ;;
+        xfs)
+          MOUNT_OPTS="noatime,nodiratime,logbufs=8,inode64"
+          MKFS_CMD=(mkfs.xfs -f)
+          ;;
+        *)
+          echo "Unsupported filesystem type: $${FILESYSTEM_TYPE}"
+          exit 1
+          ;;
+      esac
+
+      # Idempotency guard: a re-run must never re-create the array and wipe data that is already mounted.
+      if mountpoint -q "$${MOUNTPOINT}"; then
+        echo "$${MOUNTPOINT} is already mounted, nothing to do."
+        exit 0
+      fi
+
+      echo "Detecting NVMe disks..."
+      nvme list
+
+      mapfile -t NVME_DISKS < <(
+        nvme list | awk 'NR>2 && $1 ~ /^\/dev\/nvme[0-9]+n[0-9]+$/ { print $1 }' | sort -V | uniq
+      )
+
+      ROOT_SOURCE="$(findmnt -n -o SOURCE / || true)"
+      ROOT_PKNAME="$(lsblk -no PKNAME "$${ROOT_SOURCE}" 2>/dev/null || true)"
+      if [[ -n "$${ROOT_PKNAME}" ]]; then
+        ROOT_DISK="/dev/$${ROOT_PKNAME}"
+        FILTERED=()
+        for d in "$${NVME_DISKS[@]}"; do
+          if [[ "$${d}" != "$${ROOT_DISK}" ]]; then
+            FILTERED+=("$${d}")
+          fi
+        done
+        NVME_DISKS=("$${FILTERED[@]}")
+      fi
+
+      DISK_COUNT="$${#NVME_DISKS[@]}"
+      if (( DISK_COUNT < 2 || DISK_COUNT > 8 )); then
+        echo "Expected 2..8 NVMe disks for RAID0, found $${DISK_COUNT}: $${NVME_DISKS[*]}"
+        exit 1
+      fi
+
+      for d in "$${NVME_DISKS[@]}"; do
+        [[ -b "$${d}" ]] || { echo "Block device not found: $${d}"; exit 1; }
+      done
+
+      echo "Using $${DISK_COUNT} NVMe disk(s): $${NVME_DISKS[*]}"
+
+      for d in "$${NVME_DISKS[@]}"; do
+        DISK_BASENAME="$(basename "$${d}")"
+        SCHEDULER_PATH="/sys/block/$${DISK_BASENAME}/queue/scheduler"
+        if [[ -w "$${SCHEDULER_PATH}" ]]; then
+          echo none | tee "$${SCHEDULER_PATH}" >/dev/null
+        else
+          echo "Skipping scheduler update for $${d}: $${SCHEDULER_PATH} is not writable"
+        fi
+      done
+
+      if [[ -e "$${MD_DEV}" ]]; then
+        mdadm --stop "$${MD_DEV}" || true
+      fi
+      # --run suppresses the confirmation prompt mdadm shows when members carry old signatures (no TTY under cloud-init).
+      mdadm --create "$${MD_DEV}" \
+        --run \
+        --level=0 \
+        --raid-devices="$${DISK_COUNT}" \
+        "$${NVME_DISKS[@]}"
+
+      udevadm settle
+
+      "$${MKFS_CMD[@]}" "$${MD_DEV}"
+
+      mkdir -p "$${MOUNTPOINT}"
+      mount -o "$${MOUNT_OPTS}" "$${MD_DEV}" "$${MOUNTPOINT}"
+
+      UUID="$(blkid -s UUID -o value "$${MD_DEV}")"
+      grep -q "$${UUID}" /etc/fstab || echo "UUID=$${UUID} $${MOUNTPOINT} $${FILESYSTEM_TYPE} $${MOUNT_OPTS},nofail 0 2" >> /etc/fstab
+
+      if [[ -d /etc/mdadm ]]; then
+        mdadm --detail --scan > /etc/mdadm/mdadm.conf
+      fi
+
+      echo "Done."
+      echo "RAID device: $${MD_DEV}"
+      echo "Mounted at : $${MOUNTPOINT}"
+      df -h "$${MOUNTPOINT}"
+%{ endif }
+%{ endif }
+
+%{ if length(nvidia_admin_conf_lines) > 0 || local_nvme_enabled }
 runcmd:
+%{ if length(nvidia_admin_conf_lines) > 0 }
   - [ bash, -lc, "/usr/local/bin/nvidia-conf-check.sh >> /var/log/nvidia-conf-check.log 2>&1" ]
 %{ endif }
+%{ if local_nvme_enabled }
+  - [ bash, -lc, "/usr/local/sbin/prepare-disks.sh '${local_nvme_mount_path}' > /var/log/prepare-disks.log 2>&1" ]
+%{ endif }
+%{ endif }
diff --git a/soperator/modules/k8s/variables.tf b/soperator/modules/k8s/variables.tf
index 7197b08b4..8f9f33016 100644
--- a/soperator/modules/k8s/variables.tf
+++ b/soperator/modules/k8s/variables.tf
@@ -155,6 +155,11 @@ variable "node_group_workers_v2" {
       policy          = optional(string)
       reservation_ids = optional(list(string))
     }))
+    local_nvme = optional(object({
+      enabled         = optional(bool, false)
+      mount_path      = optional(string, "/mnt/local-nvme")
+      filesystem_type = optional(string, "ext4")
+    }), {})
     nodeset_index = number
     subset_index  = number
   }))
diff --git a/soperator/modules/slurm/main.tf b/soperator/modules/slurm/main.tf
index fe2aa9931..4519e2332 100644
--- a/soperator/modules/slurm/main.tf
+++ b/soperator/modules/slurm/main.tf
@@ -97,6 +97,7 @@ resource "helm_release" "soperator_fluxcd_cm" {
cluster_name = var.cluster_name region = var.region public_o11y_enabled = var.public_o11y_enabled + has_local_nvme = anytrue([for nodeset in var.worker_nodesets : try(nodeset.local_nvme.enabled, false)]) metrics_collector = local.metrics_collector create_pvcs = var.create_pvcs diff --git a/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl index 303c7b3e4..43f6bad77 100644 --- a/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl +++ b/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl @@ -107,6 +107,15 @@ nodesets: storage: ${sub_mount.size_gibibytes}Gi %{~ endfor ~} + %{~ if try(nodeset.local_nvme.enabled, false) ~} + - name: local-nvme + mountPath: ${try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")} + volumeSource: + hostPath: + path: ${try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")} + type: Directory + %{~ endif ~} + %{~ if jail_submounts.image_storage.enabled ~} - name: image-storage mountPath: /mnt/image-storage @@ -230,6 +239,13 @@ nodesets: command: ["grep", ' /volume-mount ${sub_mount.filesystem_type} ', "/proc/mounts"] %{~ endfor ~} + %{~ if try(nodeset.local_nvme.enabled, false) ~} + - name: ensure-node-local-jail-submount-local-nvme-${try(nodeset.local_nvme.filesystem_type, "ext4")} + image: cr.eu-north1.nebius.cloud/soperator/busybox + volumeMounts: [{ name: 'local-nvme', mountPath: /volume-mount }] + command: ["grep", ' /volume-mount ${try(nodeset.local_nvme.filesystem_type, "ext4")} ', "/proc/mounts"] + %{~ endif ~} + %{~ if jail_submounts.image_storage.enabled ~} - name: ensure-node-local-image-storage-${jail_submounts.image_storage.spec.filesystem_type} image: cr.eu-north1.nebius.cloud/soperator/busybox diff --git a/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl 
b/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl index 9781db571..8e695050a 100644 --- a/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl +++ b/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl @@ -244,6 +244,13 @@ resources: useDefaultAppArmorProfile: ${apparmor_enabled} maintenance: ${slurm_cluster.maintenance} + %{~ if has_local_nvme ~} + slurmScripts: + builtIn: + nvme_raid_health.sh: + enabled: true + %{~ endif ~} + partitionConfiguration: configType: structured partitions: diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf index d54a876bb..98ecaedc0 100644 --- a/soperator/modules/slurm/variables.tf +++ b/soperator/modules/slurm/variables.tf @@ -753,6 +753,11 @@ variable "worker_nodesets" { gres_config = list(string) create_partition = bool ephemeral_nodes = optional(bool, false) + local_nvme = optional(object({ + enabled = optional(bool, false) + mount_path = optional(string, "/mnt/local-nvme") + filesystem_type = optional(string, "ext4") + }), {}) })) default = [] }