From fa7b9bdacd47128f769c84eeede5d03cb7b2b6c6 Mon Sep 17 00:00:00 2001 From: Ali Sattari Date: Fri, 6 Mar 2026 12:15:01 +0100 Subject: [PATCH 1/4] let there be nvme check region/platform/preset for validation small fix one mount_path to rule them all if nvme then enable hc script fixes and xfs based on info in #proj-local-disks --- soperator/installations/example/main.tf | 11 +- .../installations/example/terraform.tfvars | 7 ++ soperator/installations/example/variables.tf | 34 ++++++ .../modules/available_resources/outputs.tf | 5 + .../modules/available_resources/preset.tf | 31 +++++- soperator/modules/k8s/k8s_ng_workers_v2.tf | 24 ++++- soperator/modules/k8s/locals.tf | 4 + .../k8s/templates/cloud_init.yaml.tftpl | 101 +++++++++++++++++- soperator/modules/k8s/variables.tf | 4 + .../modules/slurm/flux_release_nodesets.tf | 10 +- soperator/modules/slurm/main.tf | 8 +- .../flux_release_nodesets.yaml.tftpl | 23 ++++ .../terraform_fluxcd_values.yaml.tftpl | 7 ++ soperator/modules/slurm/variables.tf | 5 + 14 files changed, 263 insertions(+), 11 deletions(-) diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf index 22cee978d..19f8d2a52 100644 --- a/soperator/installations/example/main.tf +++ b/soperator/installations/example/main.tf @@ -54,6 +54,10 @@ locals { subset_index = subset preemptible = nodeset.preemptible reservation_policy = nodeset.reservation_policy + local_nvme = { + enabled = try(nodeset.local_nvme.enabled, false) + mount_path = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme") + } } ]]) } @@ -63,6 +67,7 @@ resource "terraform_data" "check_variables" { terraform_data.check_slurm_nodeset, terraform_data.check_slurm_nodeset_accounting, terraform_data.check_nfs, + terraform_data.check_local_nvme, ] } @@ -414,7 +419,6 @@ module "slurm" { storage_class_name = replace("${local.storage_class_prefix}-${lower(var.node_local_image_disk.spec.disk_type)}-${lower(var.node_local_image_disk.spec.filesystem_type)}", "_", "-") } : null } - nfs = { enabled = var.nfs.enabled path = var.nfs.enabled ? module.nfs-server[0].nfs_export_path : null @@ -479,6 +483,11 @@ module "slurm" { gres_config = lookup(module.resources.gres_config_by_platform, nodeset.resource.platform, null) create_partition = nodeset.create_partition != null ? nodeset.create_partition : false ephemeral_nodes = nodeset.ephemeral_nodes + local_nvme = { + enabled = try(nodeset.local_nvme.enabled, false) + mount_path = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme") + filesystem_type = "xfs" + } }] login_allocation_id = module.k8s.static_ip_allocation_id diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index cf85444fa..6c2ca3b0a 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -337,6 +337,13 @@ slurm_nodeset_workers = [ # When true, nodes will use dynamic topology injection and power management. # By default, false. ephemeral_nodes = false + # Optional local NVMe passthrough for this nodeset only. + # Uses local instance disks, creates a RAID0 array and mounts it on the host via cloud-init. + # mount_path: path used for both host RAID mount and jail submount. + # local_nvme = { + # enabled = true + # mount_path = "/mnt/local-nvme" + # } }, ] diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf index fd0904efe..d47147a5d 100644 --- a/soperator/installations/example/variables.tf +++ b/soperator/installations/example/variables.tf @@ -722,6 +722,10 @@ variable "slurm_nodeset_workers" { features = optional(list(string)) create_partition = optional(bool) ephemeral_nodes = optional(bool, false) + local_nvme = optional(object({ + enabled = optional(bool, false) + mount_path = optional(string, "/mnt/local-nvme") + }), {}) })) nullable = false default = [{ @@ -771,6 +775,16 @@ variable "slurm_nodeset_workers" { ]) error_message = "Worker nodeset autoscaling.min_size must be less than or equal to size." } + + validation { + condition = alltrue([ + for worker in var.slurm_nodeset_workers : + !try(worker.local_nvme.enabled, false) || ( + startswith(try(worker.local_nvme.mount_path, "/mnt/local-nvme"), "/") + ) + ]) + error_message = "When worker local NVMe is enabled, mount_path must be an absolute path." + } } variable "slurm_nodeset_login" { @@ -923,6 +937,26 @@ resource "terraform_data" "check_slurm_nodeset" { } } +resource "terraform_data" "check_local_nvme" { + lifecycle { + precondition { + condition = ( + !anytrue([ + for worker in var.slurm_nodeset_workers : + try(worker.local_nvme.enabled, false) + ]) || + alltrue([ + for worker in var.slurm_nodeset_workers : + !try(worker.local_nvme.enabled, false) || ( + try(module.resources.local_nvme_supported_by_region_platform_preset[var.region][worker.resource.platform][worker.resource.preset], false) + ) + ]) + ) + error_message = "Local NVMe is enabled, but one or more worker nodesets use unsupported region/platform/preset." + } + } +} + # region Worker variable "slurm_worker_sshd_config_map_ref_name" { diff --git a/soperator/modules/available_resources/outputs.tf b/soperator/modules/available_resources/outputs.tf index f28bfc786..e79b774c9 100644 --- a/soperator/modules/available_resources/outputs.tf +++ b/soperator/modules/available_resources/outputs.tf @@ -23,6 +23,11 @@ output "by_platform" { value = local.presets_by_platforms } +output "local_nvme_supported_by_region_platform_preset" { + description = "Local NVMe support matrix by region/platform/preset." + value = local.local_nvme_supported_by_region_platform_preset +} + output "k8s_ephemeral_storage_coefficient" { value = local.reserve.ephemeral_storage.coefficient } diff --git a/soperator/modules/available_resources/preset.tf b/soperator/modules/available_resources/preset.tf index e07dfcee3..b25d2d4ea 100644 --- a/soperator/modules/available_resources/preset.tf +++ b/soperator/modules/available_resources/preset.tf @@ -309,7 +309,15 @@ locals { } } - presets_by_platforms = tomap({ + # Allow-list: "${region}/${platform}/${preset}" + local_nvme_supported_true_region_platform_preset = toset([ + # gpu-b300-sxm + "${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-128c-1600g}", + "${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-160c-1792g}", + "${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-192c-2768g}", + ]) + + presets_by_platforms_raw = tomap({ (local.platforms.cpu-e2) = tomap({ (local.presets.p-2c-8g) = local.presets_cpu.c-2vcpu-8gb (local.presets.p-4c-16g) = local.presets_cpu.c-4vcpu-16gb @@ -362,4 +370,25 @@ locals { (local.presets.p-8g-192c-2768g) = local.presets_gpu.g-8gpu-192vcpu-2768gb }) }) + + local_nvme_supported_by_region_platform_preset = tomap({ + for region in [for _, region in local.regions : region] : region => tomap({ + for platform, presets in local.presets_by_platforms_raw : platform => tomap({ + for preset, _ in presets : preset => contains(local.local_nvme_supported_true_region_platform_preset, "${region}/${platform}/${preset}") + }) + }) + }) + + presets_by_platforms = tomap({ + for platform, presets in local.presets_by_platforms_raw : platform => tomap({ + for preset, resources in presets : preset => merge(resources, { + local_nvme_supported = anytrue([ + for region in [for _, region in local.regions : region] : try(local.local_nvme_supported_by_region_platform_preset[region][platform][preset], false) + ]) + local_nvme_supported_by_region = tomap({ + for region in [for _, region in local.regions : region] : region => try(local.local_nvme_supported_by_region_platform_preset[region][platform][preset], false) + }) + }) + }) + }) } diff --git a/soperator/modules/k8s/k8s_ng_workers_v2.tf b/soperator/modules/k8s/k8s_ng_workers_v2.tf index 976d134da..c8af62880 100644 --- a/soperator/modules/k8s/k8s_ng_workers_v2.tf +++ b/soperator/modules/k8s/k8s_ng_workers_v2.tf @@ -137,6 +137,15 @@ resource "nebius_mk8s_v1_node_group" "worker_v2" { block_size_bytes = provider::units::from_kib(var.node_group_workers_v2[count.index].boot_disk.block_size_kibibytes) } + local_disks = try(var.node_group_workers_v2[count.index].local_nvme.enabled, false) ? { + config = { + none = true + } + passthrough_group = { + requested = true + } + } : null + filesystems = concat( [ { @@ -166,11 +175,16 @@ resource "nebius_mk8s_v1_node_group" "worker_v2" { os = "ubuntu24.04" - cloud_init_user_data = local.node_group_gpu_present_v2.worker[count.index] ? ( - local.node_cloud_init.enabled ? local.node_cloud_init.cloud_init_data : null - ) : ( - local.node_ssh_access.enabled ? local.node_cloud_init.cloud_init_data_no_nvidia : null - ) + cloud_init_user_data = ( + local.node_ssh_access.enabled || + (local.node_group_gpu_present_v2.worker[count.index] && length(var.nvidia_admin_conf_lines) > 0) || + try(var.node_group_workers_v2[count.index].local_nvme.enabled, false) + ) ? templatefile("${path.module}/templates/cloud_init.yaml.tftpl", { + ssh_users = var.node_ssh_access_users + nvidia_admin_conf_lines = local.node_group_gpu_present_v2.worker[count.index] ? var.nvidia_admin_conf_lines : [] + local_nvme_enabled = try(var.node_group_workers_v2[count.index].local_nvme.enabled, false) + local_nvme_mount_path = try(var.node_group_workers_v2[count.index].local_nvme.mount_path, "/mnt/local-nvme") + }) : null } lifecycle { diff --git a/soperator/modules/k8s/locals.tf b/soperator/modules/k8s/locals.tf index 3b64555b8..c0492b716 100644 --- a/soperator/modules/k8s/locals.tf +++ b/soperator/modules/k8s/locals.tf @@ -8,10 +8,14 @@ locals { cloud_init_data = templatefile("${path.module}/templates/cloud_init.yaml.tftpl", { ssh_users = var.node_ssh_access_users nvidia_admin_conf_lines = var.nvidia_admin_conf_lines + local_nvme_enabled = false + local_nvme_mount_path = "/mnt/local-nvme" }) cloud_init_data_no_nvidia = templatefile("${path.module}/templates/cloud_init.yaml.tftpl", { ssh_users = var.node_ssh_access_users nvidia_admin_conf_lines = [] + local_nvme_enabled = false + local_nvme_mount_path = "/mnt/local-nvme" }) } diff --git a/soperator/modules/k8s/templates/cloud_init.yaml.tftpl b/soperator/modules/k8s/templates/cloud_init.yaml.tftpl index b8e8f8b1c..14066c5a3 100644 --- a/soperator/modules/k8s/templates/cloud_init.yaml.tftpl +++ b/soperator/modules/k8s/templates/cloud_init.yaml.tftpl @@ -12,8 +12,17 @@ users: %{ endfor } %{ endif } -%{ if length(nvidia_admin_conf_lines) > 0 } +%{ if local_nvme_enabled } +package_update: true +packages: + - mdadm + - nvme-cli + - xfsprogs +%{ endif } + +%{ if length(nvidia_admin_conf_lines) > 0 || local_nvme_enabled } write_files: +%{ if length(nvidia_admin_conf_lines) > 0 } - path: /etc/modprobe.d/nvidia_admin.conf owner: root:root permissions: "0644" @@ -91,7 +100,97 @@ write_files: fi log "end" +%{ endif } +%{ if local_nvme_enabled } + - path: /usr/local/sbin/prepare-disks.sh + owner: root:root + permissions: "0755" + content: | + #!/usr/bin/env bash + set -euo pipefail + + MOUNTPOINT="$${1:-/mnt/local-nvme}" + MD_DEV="/dev/md0" + MOUNT_OPTS="noatime,nodiratime" + + echo "Detecting NVMe disks..." + nvme list + + mapfile -t NVME_DISKS < <( + nvme list | awk 'NR>2 && $1 ~ /^\/dev\/nvme[0-9]+n[0-9]+$/ { print $1 }' | sort -V | uniq + ) + + ROOT_SOURCE="$(findmnt -n -o SOURCE / || true)" + ROOT_PKNAME="$(lsblk -no PKNAME "$${ROOT_SOURCE}" 2>/dev/null || true)" + if [[ -n "$${ROOT_PKNAME}" ]]; then + ROOT_DISK="/dev/$${ROOT_PKNAME}" + FILTERED=() + for d in "$${NVME_DISKS[@]}"; do + if [[ "$${d}" != "$${ROOT_DISK}" ]]; then + FILTERED+=("$${d}") + fi + done + NVME_DISKS=("$${FILTERED[@]}") + fi + + DISK_COUNT="$${#NVME_DISKS[@]}" + if (( DISK_COUNT < 2 || DISK_COUNT > 8 )); then + echo "Expected 2..8 NVMe disks for RAID0, found $${DISK_COUNT}: $${NVME_DISKS[*]}" + exit 1 + fi + + for d in "$${NVME_DISKS[@]}"; do + [[ -b "$${d}" ]] || { echo "Block device not found: $${d}"; exit 1; } + done + + echo "Using $${DISK_COUNT} NVMe disk(s): $${NVME_DISKS[*]}" + + for d in "$${NVME_DISKS[@]}"; do + DISK_BASENAME="$(basename "$${d}")" + SCHEDULER_PATH="/sys/block/$${DISK_BASENAME}/queue/scheduler" + if [[ -w "$${SCHEDULER_PATH}" ]]; then + echo none | tee "$${SCHEDULER_PATH}" >/dev/null + else + echo "Skipping scheduler update for $${d}: $${SCHEDULER_PATH} is not writable" + fi + done + + if [[ -e "$${MD_DEV}" ]]; then + mdadm --stop "$${MD_DEV}" || true + fi + + mdadm --create "$${MD_DEV}" \ + --level=0 \ + --raid-devices="$${DISK_COUNT}" \ + "$${NVME_DISKS[@]}" + udevadm settle + + mkfs.xfs -f "$${MD_DEV}" + + mkdir -p "$${MOUNTPOINT}" + mount -o "$${MOUNT_OPTS}" "$${MD_DEV}" "$${MOUNTPOINT}" + + UUID="$(blkid -s UUID -o value "$${MD_DEV}")" + grep -q "$${UUID}" /etc/fstab || echo "UUID=$${UUID} $${MOUNTPOINT} xfs $${MOUNT_OPTS},nofail 0 2" >> /etc/fstab + + if [[ -d /etc/mdadm ]]; then + mdadm --detail --scan > /etc/mdadm/mdadm.conf + fi + + echo "Done." + echo "RAID device: $${MD_DEV}" + echo "Mounted at : $${MOUNTPOINT}" + df -h "$${MOUNTPOINT}" +%{ endif } +%{ endif } + +%{ if length(nvidia_admin_conf_lines) > 0 || local_nvme_enabled } runcmd: +%{ if length(nvidia_admin_conf_lines) > 0 } - [ bash, -lc, "/usr/local/bin/nvidia-conf-check.sh >> /var/log/nvidia-conf-check.log 2>&1" ] %{ endif } +%{ if local_nvme_enabled } + - [ bash, -lc, "/usr/local/sbin/prepare-disks.sh ${local_nvme_mount_path} > /var/log/prepare-disks.log 2>&1" ] +%{ endif } +%{ endif } diff --git a/soperator/modules/k8s/variables.tf b/soperator/modules/k8s/variables.tf index 7197b08b4..613ad6aec 100644 --- a/soperator/modules/k8s/variables.tf +++ b/soperator/modules/k8s/variables.tf @@ -155,6 +155,10 @@ variable "node_group_workers_v2" { policy = optional(string) reservation_ids = optional(list(string)) })) + local_nvme = optional(object({ + enabled = optional(bool, false) + mount_path = optional(string, "/mnt/local-nvme") + }), {}) nodeset_index = number subset_index = number })) diff --git a/soperator/modules/slurm/flux_release_nodesets.tf b/soperator/modules/slurm/flux_release_nodesets.tf index d2b0369d7..5ae5942eb 100644 --- a/soperator/modules/slurm/flux_release_nodesets.tf +++ b/soperator/modules/slurm/flux_release_nodesets.tf @@ -26,7 +26,15 @@ resource "local_file" "flux_release_rendered_nodesets" { mount_path = submount.mount_path }] - local = var.node_local_jail_submounts + local = [for submount in var.node_local_jail_submounts : { + name = submount.name + mount_path = submount.mount_path + source_type = "volume_claim_template" + host_path = null + filesystem_type = submount.filesystem_type + storage_class_name = submount.storage_class_name + size_gibibytes = submount.size_gibibytes + }] image_storage = var.node_local_image_storage } diff --git a/soperator/modules/slurm/main.tf b/soperator/modules/slurm/main.tf index fe2aa9931..ee434461c 100644 --- a/soperator/modules/slurm/main.tf +++ b/soperator/modules/slurm/main.tf @@ -97,6 +97,7 @@ resource "helm_release" "soperator_fluxcd_cm" { cluster_name = var.cluster_name region = var.region public_o11y_enabled = var.public_o11y_enabled + has_local_nvme = anytrue([for nodeset in var.worker_nodesets : try(nodeset.local_nvme.enabled, false)]) metrics_collector = local.metrics_collector create_pvcs = var.create_pvcs @@ -145,8 +146,11 @@ resource "helm_release" "soperator_fluxcd_cm" { node_local_image_storage = var.node_local_image_storage jail_submounts = [for submount in var.filestores.jail_submounts : { - name = submount.name - mount_path = submount.mount_path + name = submount.name + mount_path = submount.mount_path + source_type = "filestore" + host_path = null + filesystem_type = null }] controller_state_on_filestore = var.controller_state_on_filestore diff --git a/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl index 303c7b3e4..2dc37c34e 100644 --- a/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl +++ b/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl @@ -98,6 +98,12 @@ nodesets: %{~ for sub_mount in jail_submounts.local ~} - name: ${sub_mount.name} mountPath: ${sub_mount.mount_path} + %{~ if sub_mount.source_type == "host_path" ~} + volumeSource: + hostPath: + path: ${sub_mount.host_path} + type: Directory + %{~ else ~} volumeClaimTemplateSpec: accessModes: - ReadWriteOnce @@ -105,8 +111,18 @@ nodesets: resources: requests: storage: ${sub_mount.size_gibibytes}Gi + %{~ endif ~} %{~ endfor ~} + %{~ if try(nodeset.local_nvme.enabled, false) ~} + - name: local-nvme + mountPath: ${try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")} + volumeSource: + hostPath: + path: ${try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")} + type: Directory + %{~ endif ~} + %{~ if jail_submounts.image_storage.enabled ~} - name: image-storage mountPath: /mnt/image-storage @@ -230,6 +246,13 @@ nodesets: command: ["grep", ' /volume-mount ${sub_mount.filesystem_type} ', "/proc/mounts"] %{~ endfor ~} + %{~ if try(nodeset.local_nvme.enabled, false) ~} + - name: ensure-node-local-jail-submount-local-nvme-${try(nodeset.local_nvme.filesystem_type, "xfs")} + image: cr.eu-north1.nebius.cloud/soperator/busybox + volumeMounts: [{ name: 'local-nvme', mountPath: /volume-mount }] + command: ["grep", ' /volume-mount ${try(nodeset.local_nvme.filesystem_type, "xfs")} ', "/proc/mounts"] + %{~ endif ~} + %{~ if jail_submounts.image_storage.enabled ~} - name: ensure-node-local-image-storage-${jail_submounts.image_storage.spec.filesystem_type} image: cr.eu-north1.nebius.cloud/soperator/busybox diff --git a/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl index 9781db571..8e695050a 100644 --- a/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl +++ b/soperator/modules/slurm/templates/helm_values/terraform_fluxcd_values.yaml.tftpl @@ -244,6 +244,13 @@ resources: useDefaultAppArmorProfile: ${apparmor_enabled} maintenance: ${slurm_cluster.maintenance} + %{~ if has_local_nvme ~} + slurmScripts: + builtIn: + nvme_raid_health.sh: + enabled: true + %{~ endif ~} + partitionConfiguration: configType: structured partitions: diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf index d54a876bb..dda1edc6e 100644 --- a/soperator/modules/slurm/variables.tf +++ b/soperator/modules/slurm/variables.tf @@ -753,6 +753,11 @@ variable "worker_nodesets" { gres_config = list(string) create_partition = bool ephemeral_nodes = optional(bool, false) + local_nvme = optional(object({ + enabled = optional(bool, false) + mount_path = optional(string, "/mnt/local-nvme") + filesystem_type = optional(string, "xfs") + }), {}) })) default = [] } From 7b83840fa7afc9dffed81762756b1afe35d6f25f Mon Sep 17 00:00:00 2001 From: Ali Sattari Date: Mon, 16 Mar 2026 17:11:50 +0100 Subject: [PATCH 2/4] default to ext4 after some perf tests --- soperator/installations/example/main.tf | 7 +++--- .../installations/example/terraform.tfvars | 5 +++-- soperator/installations/example/variables.tf | 13 +++++++++-- soperator/modules/k8s/k8s_ng_workers_v2.tf | 9 ++++---- soperator/modules/k8s/locals.tf | 18 ++++++++------- .../k8s/templates/cloud_init.yaml.tftpl | 22 ++++++++++++++++--- soperator/modules/k8s/variables.tf | 5 +++-- .../flux_release_nodesets.yaml.tftpl | 4 ++-- soperator/modules/slurm/variables.tf | 2 +- 9 files changed, 58 insertions(+), 27 deletions(-) diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf index 19f8d2a52..abf34d87f 100644 --- a/soperator/installations/example/main.tf +++ b/soperator/installations/example/main.tf @@ -55,8 +55,9 @@ locals { preemptible = nodeset.preemptible reservation_policy = nodeset.reservation_policy local_nvme = { - enabled = try(nodeset.local_nvme.enabled, false) - mount_path = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme") + enabled = try(nodeset.local_nvme.enabled, false) + mount_path = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme") + filesystem_type = try(nodeset.local_nvme.filesystem_type, "ext4") } } ]]) @@ -486,7 +487,7 @@ module "slurm" { local_nvme = { enabled = try(nodeset.local_nvme.enabled, false) mount_path = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme") - filesystem_type = "xfs" + filesystem_type = try(nodeset.local_nvme.filesystem_type, "ext4") } }] diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars index 6c2ca3b0a..597e565a8 100644 --- a/soperator/installations/example/terraform.tfvars +++ b/soperator/installations/example/terraform.tfvars @@ -341,8 +341,9 @@ slurm_nodeset_workers = [ # Uses local instance disks, creates a RAID0 array and mounts it on the host via cloud-init. # mount_path: path used for both host RAID mount and jail submount. # local_nvme = { - # enabled = true - # mount_path = "/mnt/local-nvme" + # enabled = true + # mount_path = "/mnt/local-nvme" + # filesystem_type = "ext4" # } }, ] diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf index d47147a5d..b75be4787 100644 --- a/soperator/installations/example/variables.tf +++ b/soperator/installations/example/variables.tf @@ -723,8 +723,9 @@ variable "slurm_nodeset_workers" { create_partition = optional(bool) ephemeral_nodes = optional(bool, false) local_nvme = optional(object({ - enabled = optional(bool, false) - mount_path = optional(string, "/mnt/local-nvme") + enabled = optional(bool, false) + mount_path = optional(string, "/mnt/local-nvme") + filesystem_type = optional(string, "ext4") }), {}) })) nullable = false @@ -785,6 +786,14 @@ variable "slurm_nodeset_workers" { ]) error_message = "When worker local NVMe is enabled, mount_path must be an absolute path." } + + validation { + condition = alltrue([ + for worker in var.slurm_nodeset_workers : + contains(["ext4", "xfs"], try(worker.local_nvme.filesystem_type, "ext4")) + ]) + error_message = "When worker local NVMe filesystem_type is set, it must be `ext4` or `xfs`." + } } variable "slurm_nodeset_login" { diff --git a/soperator/modules/k8s/k8s_ng_workers_v2.tf b/soperator/modules/k8s/k8s_ng_workers_v2.tf index c8af62880..6232f707c 100644 --- a/soperator/modules/k8s/k8s_ng_workers_v2.tf +++ b/soperator/modules/k8s/k8s_ng_workers_v2.tf @@ -180,10 +180,11 @@ resource "nebius_mk8s_v1_node_group" "worker_v2" { (local.node_group_gpu_present_v2.worker[count.index] && length(var.nvidia_admin_conf_lines) > 0) || try(var.node_group_workers_v2[count.index].local_nvme.enabled, false) ) ? templatefile("${path.module}/templates/cloud_init.yaml.tftpl", { - ssh_users = var.node_ssh_access_users - nvidia_admin_conf_lines = local.node_group_gpu_present_v2.worker[count.index] ? var.nvidia_admin_conf_lines : [] - local_nvme_enabled = try(var.node_group_workers_v2[count.index].local_nvme.enabled, false) - local_nvme_mount_path = try(var.node_group_workers_v2[count.index].local_nvme.mount_path, "/mnt/local-nvme") + ssh_users = var.node_ssh_access_users + nvidia_admin_conf_lines = local.node_group_gpu_present_v2.worker[count.index] ? var.nvidia_admin_conf_lines : [] + local_nvme_enabled = try(var.node_group_workers_v2[count.index].local_nvme.enabled, false) + local_nvme_mount_path = try(var.node_group_workers_v2[count.index].local_nvme.mount_path, "/mnt/local-nvme") + local_nvme_filesystem_type = try(var.node_group_workers_v2[count.index].local_nvme.filesystem_type, "ext4") }) : null } diff --git a/soperator/modules/k8s/locals.tf b/soperator/modules/k8s/locals.tf index c0492b716..684bc00f9 100644 --- a/soperator/modules/k8s/locals.tf +++ b/soperator/modules/k8s/locals.tf @@ -6,16 +6,18 @@ locals { node_cloud_init = { enabled = length(var.node_ssh_access_users) > 0 || length(var.nvidia_admin_conf_lines) > 0 cloud_init_data = templatefile("${path.module}/templates/cloud_init.yaml.tftpl", { - ssh_users = var.node_ssh_access_users - nvidia_admin_conf_lines = var.nvidia_admin_conf_lines - local_nvme_enabled = false - local_nvme_mount_path = "/mnt/local-nvme" + ssh_users = var.node_ssh_access_users + nvidia_admin_conf_lines = var.nvidia_admin_conf_lines + local_nvme_enabled = false + local_nvme_mount_path = "/mnt/local-nvme" + local_nvme_filesystem_type = "ext4" }) cloud_init_data_no_nvidia = templatefile("${path.module}/templates/cloud_init.yaml.tftpl", { - ssh_users = var.node_ssh_access_users - nvidia_admin_conf_lines = [] - local_nvme_enabled = false - local_nvme_mount_path = "/mnt/local-nvme" + ssh_users = var.node_ssh_access_users + nvidia_admin_conf_lines = [] + local_nvme_enabled = false + local_nvme_mount_path = "/mnt/local-nvme" + local_nvme_filesystem_type = "ext4" }) } diff --git a/soperator/modules/k8s/templates/cloud_init.yaml.tftpl b/soperator/modules/k8s/templates/cloud_init.yaml.tftpl index 14066c5a3..2eb732dd5 100644 --- a/soperator/modules/k8s/templates/cloud_init.yaml.tftpl +++ b/soperator/modules/k8s/templates/cloud_init.yaml.tftpl @@ -17,8 +17,10 @@ package_update: true packages: - mdadm - nvme-cli +%{ if local_nvme_filesystem_type == "xfs" } - xfsprogs %{ endif } +%{ endif } %{ if length(nvidia_admin_conf_lines) > 0 || local_nvme_enabled } write_files: @@ -110,8 +112,22 @@ write_files: set -euo pipefail MOUNTPOINT="$${1:-/mnt/local-nvme}" + FILESYSTEM_TYPE="${local_nvme_filesystem_type}" MD_DEV="/dev/md0" - MOUNT_OPTS="noatime,nodiratime" + case "$${FILESYSTEM_TYPE}" in + ext4) + MOUNT_OPTS="noatime,nodiratime,lazytime,commit=60" + MKFS_CMD=(mkfs.ext4 -F -m 0) + ;; + xfs) + MOUNT_OPTS="noatime,nodiratime,logbufs=8,inode64" + MKFS_CMD=(mkfs.xfs -f) + ;; + *) + echo "Unsupported filesystem type: $${FILESYSTEM_TYPE}" + exit 1 + ;; + esac echo "Detecting NVMe disks..." nvme list @@ -166,13 +182,13 @@ write_files: udevadm settle - mkfs.xfs -f "$${MD_DEV}" + "$${MKFS_CMD[@]}" "$${MD_DEV}" mkdir -p "$${MOUNTPOINT}" mount -o "$${MOUNT_OPTS}" "$${MD_DEV}" "$${MOUNTPOINT}" UUID="$(blkid -s UUID -o value "$${MD_DEV}")" - grep -q "$${UUID}" /etc/fstab || echo "UUID=$${UUID} $${MOUNTPOINT} xfs $${MOUNT_OPTS},nofail 0 2" >> /etc/fstab + grep -q "$${UUID}" /etc/fstab || echo "UUID=$${UUID} $${MOUNTPOINT} $${FILESYSTEM_TYPE} $${MOUNT_OPTS},nofail 0 2" >> /etc/fstab if [[ -d /etc/mdadm ]]; then mdadm --detail --scan > /etc/mdadm/mdadm.conf diff --git a/soperator/modules/k8s/variables.tf b/soperator/modules/k8s/variables.tf index 613ad6aec..8f9f33016 100644 --- a/soperator/modules/k8s/variables.tf +++ b/soperator/modules/k8s/variables.tf @@ -156,8 +156,9 @@ variable "node_group_workers_v2" { reservation_ids = optional(list(string)) })) local_nvme = optional(object({ - enabled = optional(bool, false) - mount_path = optional(string, "/mnt/local-nvme") + enabled = optional(bool, false) + mount_path = optional(string, "/mnt/local-nvme") + filesystem_type = optional(string, "ext4") }), {}) nodeset_index = number subset_index = number diff --git a/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl index 2dc37c34e..02a473ab7 100644 --- a/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl +++ b/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl @@ -247,10 +247,10 @@ nodesets: %{~ endfor ~} %{~ if try(nodeset.local_nvme.enabled, false) ~} - - name: ensure-node-local-jail-submount-local-nvme-${try(nodeset.local_nvme.filesystem_type, "xfs")} + - name: ensure-node-local-jail-submount-local-nvme-${try(nodeset.local_nvme.filesystem_type, "ext4")} image: cr.eu-north1.nebius.cloud/soperator/busybox volumeMounts: [{ name: 'local-nvme', mountPath: /volume-mount }] - command: ["grep", ' /volume-mount ${try(nodeset.local_nvme.filesystem_type, "xfs")} ', "/proc/mounts"] + command: ["grep", ' /volume-mount ${try(nodeset.local_nvme.filesystem_type, "ext4")} ', "/proc/mounts"] %{~ endif ~} %{~ if jail_submounts.image_storage.enabled ~} diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf index dda1edc6e..98ecaedc0 100644 --- a/soperator/modules/slurm/variables.tf +++ b/soperator/modules/slurm/variables.tf @@ -756,7 +756,7 @@ variable "worker_nodesets" { local_nvme = optional(object({ enabled = optional(bool, false) mount_path = optional(string, "/mnt/local-nvme") - filesystem_type = optional(string, "xfs") + filesystem_type = optional(string, "ext4") }), {}) })) default = [] From c303ca4646d7d1bc673c47952a658f0c064b1795 Mon Sep 17 00:00:00 2001 From: Ali Sattari Date: Mon, 16 Mar 2026 17:40:22 +0100 Subject: [PATCH 3/4] remove deadcode --- soperator/modules/slurm/flux_release_nodesets.tf | 10 +--------- soperator/modules/slurm/main.tf | 3 --- .../helm_values/flux_release_nodesets.yaml.tftpl | 7 ------- 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/soperator/modules/slurm/flux_release_nodesets.tf b/soperator/modules/slurm/flux_release_nodesets.tf index 5ae5942eb..d2b0369d7 100644 --- a/soperator/modules/slurm/flux_release_nodesets.tf +++ b/soperator/modules/slurm/flux_release_nodesets.tf @@ -26,15 +26,7 @@ resource "local_file" "flux_release_rendered_nodesets" { mount_path = submount.mount_path }] - local = [for submount in var.node_local_jail_submounts : { - name = submount.name - mount_path = submount.mount_path - source_type = "volume_claim_template" - host_path = null - filesystem_type = submount.filesystem_type - storage_class_name = submount.storage_class_name - size_gibibytes = submount.size_gibibytes - }] + local = var.node_local_jail_submounts image_storage = var.node_local_image_storage } diff --git a/soperator/modules/slurm/main.tf b/soperator/modules/slurm/main.tf index ee434461c..4c45efb14 100644 --- a/soperator/modules/slurm/main.tf +++ b/soperator/modules/slurm/main.tf @@ -148,9 +148,6 @@ resource "helm_release" "soperator_fluxcd_cm" { jail_submounts = [for submount in var.filestores.jail_submounts : { name = submount.name mount_path = submount.mount_path - source_type = "filestore" - host_path = null - filesystem_type = null }] controller_state_on_filestore = var.controller_state_on_filestore diff --git a/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl index 02a473ab7..43f6bad77 100644 --- a/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl +++ b/soperator/modules/slurm/templates/helm_values/flux_release_nodesets.yaml.tftpl @@ -98,12 +98,6 @@ nodesets: %{~ for sub_mount in jail_submounts.local ~} - name: ${sub_mount.name} mountPath: ${sub_mount.mount_path} - %{~ if sub_mount.source_type == "host_path" ~} - volumeSource: - hostPath: - path: ${sub_mount.host_path} - type: Directory - %{~ else ~} volumeClaimTemplateSpec: accessModes: - ReadWriteOnce @@ -111,7 +105,6 @@ nodesets: resources: requests: storage: ${sub_mount.size_gibibytes}Gi - %{~ endif ~} %{~ endfor ~} %{~ if try(nodeset.local_nvme.enabled, false) ~} From 50cb3be2a88101e8ef0b0701aa44b39a2b26dbf6 Mon Sep 17 00:00:00 2001 From: Ali Sattari Date: Mon, 16 Mar 2026 17:48:16 +0100 Subject: [PATCH 4/4] fmt --- soperator/modules/slurm/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/soperator/modules/slurm/main.tf b/soperator/modules/slurm/main.tf index 4c45efb14..4519e2332 100644 --- a/soperator/modules/slurm/main.tf +++ b/soperator/modules/slurm/main.tf @@ -146,8 +146,8 @@ resource "helm_release" "soperator_fluxcd_cm" { node_local_image_storage = var.node_local_image_storage jail_submounts = [for submount in var.filestores.jail_submounts : { - name = submount.name - mount_path = submount.mount_path + name = submount.name + mount_path = submount.mount_path }] controller_state_on_filestore = var.controller_state_on_filestore