Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion soperator/installations/example/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ locals {
subset_index = subset
preemptible = nodeset.preemptible
reservation_policy = nodeset.reservation_policy
local_nvme = {
enabled = try(nodeset.local_nvme.enabled, false)
mount_path = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")
filesystem_type = try(nodeset.local_nvme.filesystem_type, "ext4")
}
}
]])
}
Expand All @@ -63,6 +68,7 @@ resource "terraform_data" "check_variables" {
terraform_data.check_slurm_nodeset,
terraform_data.check_slurm_nodeset_accounting,
terraform_data.check_nfs,
terraform_data.check_local_nvme,
]
}

Expand Down Expand Up @@ -414,7 +420,6 @@ module "slurm" {
storage_class_name = replace("${local.storage_class_prefix}-${lower(var.node_local_image_disk.spec.disk_type)}-${lower(var.node_local_image_disk.spec.filesystem_type)}", "_", "-")
} : null
}

nfs = {
enabled = var.nfs.enabled
path = var.nfs.enabled ? module.nfs-server[0].nfs_export_path : null
Expand Down Expand Up @@ -479,6 +484,11 @@ module "slurm" {
gres_config = lookup(module.resources.gres_config_by_platform, nodeset.resource.platform, null)
create_partition = nodeset.create_partition != null ? nodeset.create_partition : false
ephemeral_nodes = nodeset.ephemeral_nodes
local_nvme = {
enabled = try(nodeset.local_nvme.enabled, false)
mount_path = try(nodeset.local_nvme.mount_path, "/mnt/local-nvme")
filesystem_type = try(nodeset.local_nvme.filesystem_type, "ext4")
}
}]

login_allocation_id = module.k8s.static_ip_allocation_id
Expand Down
8 changes: 8 additions & 0 deletions soperator/installations/example/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,14 @@ slurm_nodeset_workers = [
# When true, nodes will use dynamic topology injection and power management.
# By default, false.
ephemeral_nodes = false
# Optional local NVMe passthrough for this nodeset only.
# Uses local instance disks, creates a RAID0 array and mounts it on the host via cloud-init.
# mount_path: path used for both host RAID mount and jail submount.
# local_nvme = {
# enabled = true
# mount_path = "/mnt/local-nvme"
# filesystem_type = "ext4"
# }
},
]

Expand Down
43 changes: 43 additions & 0 deletions soperator/installations/example/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,11 @@ variable "slurm_nodeset_workers" {
features = optional(list(string))
create_partition = optional(bool)
ephemeral_nodes = optional(bool, false)
local_nvme = optional(object({
enabled = optional(bool, false)
mount_path = optional(string, "/mnt/local-nvme")
filesystem_type = optional(string, "ext4")
}), {})
}))
nullable = false
default = [{
Expand Down Expand Up @@ -771,6 +776,24 @@ variable "slurm_nodeset_workers" {
])
error_message = "Worker nodeset autoscaling.min_size must be less than or equal to size."
}

# Guard: a nodeset that enables local NVMe must use an absolute mount path.
# The try() fallbacks mirror the defaults declared on the local_nvme object
# above, so nodesets that omit the sub-object pass vacuously.
validation {
condition = alltrue([
for worker in var.slurm_nodeset_workers :
!try(worker.local_nvme.enabled, false) || (
startswith(try(worker.local_nvme.mount_path, "/mnt/local-nvme"), "/")
)
])
error_message = "When worker local NVMe is enabled, mount_path must be an absolute path."
}

# Guard: filesystem_type is restricted to the two types the host-side
# prepare-disks script knows how to mkfs/mount (ext4 and xfs). Checked for
# every nodeset, enabled or not, since the default "ext4" always passes.
validation {
condition = alltrue([
for worker in var.slurm_nodeset_workers :
contains(["ext4", "xfs"], try(worker.local_nvme.filesystem_type, "ext4"))
])
error_message = "When worker local NVMe filesystem_type is set, it must be `ext4` or `xfs`."
}
}

variable "slurm_nodeset_login" {
Expand Down Expand Up @@ -923,6 +946,26 @@ resource "terraform_data" "check_slurm_nodeset" {
}
}

# Fails early (at plan/apply time) when any worker nodeset enables local NVMe
# on a region/platform/preset combination that is absent from the support
# matrix exposed by the available_resources module.
resource "terraform_data" "check_local_nvme" {
  lifecycle {
    precondition {
      # For every worker nodeset: either local NVMe is disabled, or the
      # region/platform/preset triple is marked supported in the matrix.
      # (The previous extra `!anytrue(...enabled...) ||` guard was redundant:
      # when no nodeset enables NVMe this alltrue() is vacuously true.)
      # try() treats unknown platform/preset keys as "unsupported" instead of
      # raising an evaluation error.
      condition = alltrue([
        for worker in var.slurm_nodeset_workers :
        !try(worker.local_nvme.enabled, false) || try(
          module.resources.local_nvme_supported_by_region_platform_preset[var.region][worker.resource.platform][worker.resource.preset],
          false
        )
      ])
      error_message = "Local NVMe is enabled, but one or more worker nodesets use unsupported region/platform/preset."
    }
  }
}

# region Worker

variable "slurm_worker_sshd_config_map_ref_name" {
Expand Down
5 changes: 5 additions & 0 deletions soperator/modules/available_resources/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ output "by_platform" {
value = local.presets_by_platforms
}

# Nested map region -> platform -> preset -> bool, derived in preset.tf from
# the flat allow-list. Consumed by the installation's check_local_nvme
# precondition to validate worker nodeset configuration.
output "local_nvme_supported_by_region_platform_preset" {
description = "Local NVMe support matrix by region/platform/preset."
value = local.local_nvme_supported_by_region_platform_preset
}

output "k8s_ephemeral_storage_coefficient" {
value = local.reserve.ephemeral_storage.coefficient
}
Expand Down
31 changes: 30 additions & 1 deletion soperator/modules/available_resources/preset.tf
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,15 @@ locals {
}
}

presets_by_platforms = tomap({
# Allow-list of "${region}/${platform}/${preset}" triples on which local NVMe
# passthrough is supported. Everything not listed here resolves to
# "unsupported" in the derived support matrix below; extend this set to
# enable new combinations.
local_nvme_supported_true_region_platform_preset = toset([
# gpu-b300-sxm — currently only the uk-south1 presets are allow-listed.
"${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-128c-1600g}",
"${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-160c-1792g}",
"${local.regions.uk-south1}/${local.platforms.gpu-b300-sxm}/${local.presets.p-8g-192c-2768g}",
])

presets_by_platforms_raw = tomap({
(local.platforms.cpu-e2) = tomap({
(local.presets.p-2c-8g) = local.presets_cpu.c-2vcpu-8gb
(local.presets.p-4c-16g) = local.presets_cpu.c-4vcpu-16gb
Expand Down Expand Up @@ -362,4 +370,25 @@ locals {
(local.presets.p-8g-192c-2768g) = local.presets_gpu.g-8gpu-192vcpu-2768gb
})
})

# Expands the flat allow-list above into a full region -> platform -> preset
# -> bool matrix so consumers can test support with plain index lookups.
# `values(local.regions)` replaces the equivalent hand-rolled
# `[for _, region in local.regions : region]` comprehension (both iterate the
# map's values in key order).
local_nvme_supported_by_region_platform_preset = tomap({
  for region in values(local.regions) : region => tomap({
    for platform, presets in local.presets_by_platforms_raw : platform => tomap({
      # A triple is supported iff its "region/platform/preset" key is allow-listed.
      for preset, _ in presets : preset => contains(local.local_nvme_supported_true_region_platform_preset, "${region}/${platform}/${preset}")
    })
  })
})

# Public platform/preset catalogue: the raw resource data enriched with the
# local NVMe support flags derived above.
# `values(local.regions)` replaces the equivalent hand-rolled
# `[for _, region in local.regions : region]` comprehension.
presets_by_platforms = tomap({
  for platform, presets in local.presets_by_platforms_raw : platform => tomap({
    for preset, resources in presets : preset => merge(resources, {
      # True when at least one region supports local NVMe for this
      # platform/preset pair.
      local_nvme_supported = anytrue([
        for region in values(local.regions) : try(local.local_nvme_supported_by_region_platform_preset[region][platform][preset], false)
      ])
      # Per-region breakdown of the same flag; try() guards against a region
      # key missing from the derived matrix.
      local_nvme_supported_by_region = tomap({
        for region in values(local.regions) : region => try(local.local_nvme_supported_by_region_platform_preset[region][platform][preset], false)
      })
    })
  })
})
}
25 changes: 20 additions & 5 deletions soperator/modules/k8s/k8s_ng_workers_v2.tf
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,15 @@ resource "nebius_mk8s_v1_node_group" "worker_v2" {
block_size_bytes = provider::units::from_kib(var.node_group_workers_v2[count.index].boot_disk.block_size_kibibytes)
}

# Request NVMe passthrough for this node group only when the nodeset enables
# local NVMe; otherwise omit the attribute entirely (null).
# NOTE(review): `config = { none = true }` presumably means "no per-disk
# config, expose the raw devices" — confirm against the
# nebius_mk8s_v1_node_group provider schema.
local_disks = try(var.node_group_workers_v2[count.index].local_nvme.enabled, false) ? {
config = {
none = true
}
passthrough_group = {
requested = true
}
} : null

filesystems = concat(
[
{
Expand Down Expand Up @@ -166,11 +175,17 @@ resource "nebius_mk8s_v1_node_group" "worker_v2" {

os = "ubuntu24.04"

cloud_init_user_data = local.node_group_gpu_present_v2.worker[count.index] ? (
local.node_cloud_init.enabled ? local.node_cloud_init.cloud_init_data : null
) : (
local.node_ssh_access.enabled ? local.node_cloud_init.cloud_init_data_no_nvidia : null
)
# Render cloud-init user data only when at least one feature needs it:
# SSH access users, NVIDIA modprobe config (GPU node groups only), or local
# NVMe RAID preparation for this nodeset.
# NVIDIA conf lines are blanked for non-GPU node groups so the template skips
# that section. The local_nvme try() fallbacks mirror the variable defaults
# ("/mnt/local-nvme", "ext4") — NOTE(review): these literals are duplicated
# here, in locals.tf, and in the variable declaration; keep them in sync.
cloud_init_user_data = (
local.node_ssh_access.enabled ||
(local.node_group_gpu_present_v2.worker[count.index] && length(var.nvidia_admin_conf_lines) > 0) ||
try(var.node_group_workers_v2[count.index].local_nvme.enabled, false)
) ? templatefile("${path.module}/templates/cloud_init.yaml.tftpl", {
ssh_users = var.node_ssh_access_users
nvidia_admin_conf_lines = local.node_group_gpu_present_v2.worker[count.index] ? var.nvidia_admin_conf_lines : []
local_nvme_enabled = try(var.node_group_workers_v2[count.index].local_nvme.enabled, false)
local_nvme_mount_path = try(var.node_group_workers_v2[count.index].local_nvme.mount_path, "/mnt/local-nvme")
local_nvme_filesystem_type = try(var.node_group_workers_v2[count.index].local_nvme.filesystem_type, "ext4")
}) : null
}

lifecycle {
Expand Down
14 changes: 10 additions & 4 deletions soperator/modules/k8s/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,18 @@ locals {
# Shared cloud-init renders for node groups that do NOT configure local NVMe
# themselves (the workers_v2 group renders the template inline instead).
# Both variants therefore hard-disable local NVMe; the mount_path and
# filesystem_type values are inert placeholders required by the template.
# NOTE(review): "/mnt/local-nvme"/"ext4" defaults are duplicated here, in
# k8s_ng_workers_v2.tf, and in the variable declarations — keep in sync.
node_cloud_init = {
# Cloud-init is needed at all only when SSH users or NVIDIA conf lines exist.
enabled = length(var.node_ssh_access_users) > 0 || length(var.nvidia_admin_conf_lines) > 0
# Variant for GPU node groups: includes the NVIDIA modprobe config section.
cloud_init_data = templatefile("${path.module}/templates/cloud_init.yaml.tftpl", {
ssh_users = var.node_ssh_access_users
nvidia_admin_conf_lines = var.nvidia_admin_conf_lines
local_nvme_enabled = false
local_nvme_mount_path = "/mnt/local-nvme"
local_nvme_filesystem_type = "ext4"
})
# Variant for non-GPU node groups: NVIDIA section suppressed via empty list.
cloud_init_data_no_nvidia = templatefile("${path.module}/templates/cloud_init.yaml.tftpl", {
ssh_users = var.node_ssh_access_users
nvidia_admin_conf_lines = []
local_nvme_enabled = false
local_nvme_mount_path = "/mnt/local-nvme"
local_nvme_filesystem_type = "ext4"
})
}

Expand Down
117 changes: 116 additions & 1 deletion soperator/modules/k8s/templates/cloud_init.yaml.tftpl
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,19 @@ users:
%{ endfor }
%{ endif }

%{ if length(nvidia_admin_conf_lines) > 0 }
%{ if local_nvme_enabled }
package_update: true
packages:
- mdadm
- nvme-cli
%{ if local_nvme_filesystem_type == "xfs" }
- xfsprogs
%{ endif }
%{ endif }

%{ if length(nvidia_admin_conf_lines) > 0 || local_nvme_enabled }
write_files:
%{ if length(nvidia_admin_conf_lines) > 0 }
- path: /etc/modprobe.d/nvidia_admin.conf
owner: root:root
permissions: "0644"
Expand Down Expand Up @@ -91,7 +102,111 @@ write_files:
fi

log "end"
%{ endif }
%{ if local_nvme_enabled }
# Host-side script that builds a RAID0 array from the passthrough NVMe disks
# and mounts it; invoked once from runcmd with the mount path as argv[1].
- path: /usr/local/sbin/prepare-disks.sh
owner: root:root
permissions: "0755"
content: |
#!/usr/bin/env bash
# prepare-disks.sh [mountpoint]
# Assembles all non-root NVMe disks into a RAID0 array at MD_DEV, formats it
# with the Terraform-supplied filesystem type, mounts it, and persists the
# mount in /etc/fstab. Destructive: existing data on the disks is lost.
set -euo pipefail

MOUNTPOINT="$${1:-/mnt/local-nvme}"
# Filesystem type is baked in at template render time by Terraform.
FILESYSTEM_TYPE="${local_nvme_filesystem_type}"
MD_DEV="/dev/md0"
# Pick mkfs command and mount options per filesystem type; any other value
# is rejected (terraform-side validation should already prevent this).
case "$${FILESYSTEM_TYPE}" in
ext4)
MOUNT_OPTS="noatime,nodiratime,lazytime,commit=60"
MKFS_CMD=(mkfs.ext4 -F -m 0)
;;
xfs)
MOUNT_OPTS="noatime,nodiratime,logbufs=8,inode64"
MKFS_CMD=(mkfs.xfs -f)
;;
*)
echo "Unsupported filesystem type: $${FILESYSTEM_TYPE}"
exit 1
;;
esac

echo "Detecting NVMe disks..."
nvme list

# Collect whole-disk NVMe devices (nvmeXnY, not partitions) from nvme-cli
# output. NOTE(review): parsing assumes a two-line header in `nvme list`
# output — confirm this holds across nvme-cli versions.
mapfile -t NVME_DISKS < <(
nvme list | awk 'NR>2 && $1 ~ /^\/dev\/nvme[0-9]+n[0-9]+$/ { print $1 }' | sort -V | uniq
)

# Exclude the disk backing the root filesystem: resolve / to its source
# device, then to its parent disk (PKNAME handles partitioned roots).
ROOT_SOURCE="$(findmnt -n -o SOURCE / || true)"
ROOT_PKNAME="$(lsblk -no PKNAME "$${ROOT_SOURCE}" 2>/dev/null || true)"
if [[ -n "$${ROOT_PKNAME}" ]]; then
ROOT_DISK="/dev/$${ROOT_PKNAME}"
FILTERED=()
for d in "$${NVME_DISKS[@]}"; do
if [[ "$${d}" != "$${ROOT_DISK}" ]]; then
FILTERED+=("$${d}")
fi
done
NVME_DISKS=("$${FILTERED[@]}")
fi

# RAID0 needs at least 2 members; 8 is the expected upper bound for the
# supported presets.
DISK_COUNT="$${#NVME_DISKS[@]}"
if (( DISK_COUNT < 2 || DISK_COUNT > 8 )); then
echo "Expected 2..8 NVMe disks for RAID0, found $${DISK_COUNT}: $${NVME_DISKS[*]}"
exit 1
fi

# Sanity check: every candidate must still exist as a block device.
for d in "$${NVME_DISKS[@]}"; do
[[ -b "$${d}" ]] || { echo "Block device not found: $${d}"; exit 1; }
done

echo "Using $${DISK_COUNT} NVMe disk(s): $${NVME_DISKS[*]}"

# Switch each member disk to the "none" I/O scheduler; skipped (with a log
# line) when the sysfs attribute is not writable.
for d in "$${NVME_DISKS[@]}"; do
DISK_BASENAME="$(basename "$${d}")"
SCHEDULER_PATH="/sys/block/$${DISK_BASENAME}/queue/scheduler"
if [[ -w "$${SCHEDULER_PATH}" ]]; then
echo none | tee "$${SCHEDULER_PATH}" >/dev/null
else
echo "Skipping scheduler update for $${d}: $${SCHEDULER_PATH} is not writable"
fi
done

# Tear down any pre-existing array at MD_DEV before re-creating it.
if [[ -e "$${MD_DEV}" ]]; then
mdadm --stop "$${MD_DEV}" || true
fi

mdadm --create "$${MD_DEV}" \
--level=0 \
--raid-devices="$${DISK_COUNT}" \
"$${NVME_DISKS[@]}"

# Wait for udev to create the md device nodes before formatting.
udevadm settle

"$${MKFS_CMD[@]}" "$${MD_DEV}"

mkdir -p "$${MOUNTPOINT}"
mount -o "$${MOUNT_OPTS}" "$${MD_DEV}" "$${MOUNTPOINT}"

# Persist the mount by UUID (nofail so a missing array does not block boot);
# idempotent: skipped when the UUID is already present in fstab.
# NOTE(review): after a reboot the array may assemble under a different md
# name (e.g. md127); mounting by UUID is what keeps this entry valid.
UUID="$(blkid -s UUID -o value "$${MD_DEV}")"
grep -q "$${UUID}" /etc/fstab || echo "UUID=$${UUID} $${MOUNTPOINT} $${FILESYSTEM_TYPE} $${MOUNT_OPTS},nofail 0 2" >> /etc/fstab

# Record array metadata so mdadm can reassemble it on subsequent boots.
if [[ -d /etc/mdadm ]]; then
mdadm --detail --scan > /etc/mdadm/mdadm.conf
fi

echo "Done."
echo "RAID device: $${MD_DEV}"
echo "Mounted at : $${MOUNTPOINT}"
df -h "$${MOUNTPOINT}"
%{ endif }
%{ endif }

%{ if length(nvidia_admin_conf_lines) > 0 || local_nvme_enabled }
runcmd:
%{ if length(nvidia_admin_conf_lines) > 0 }
- [ bash, -lc, "/usr/local/bin/nvidia-conf-check.sh >> /var/log/nvidia-conf-check.log 2>&1" ]
%{ endif }
%{ if local_nvme_enabled }
- [ bash, -lc, "/usr/local/sbin/prepare-disks.sh ${local_nvme_mount_path} > /var/log/prepare-disks.log 2>&1" ]
%{ endif }
%{ endif }
5 changes: 5 additions & 0 deletions soperator/modules/k8s/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,11 @@ variable "node_group_workers_v2" {
policy = optional(string)
reservation_ids = optional(list(string))
}))
local_nvme = optional(object({
enabled = optional(bool, false)
mount_path = optional(string, "/mnt/local-nvme")
filesystem_type = optional(string, "ext4")
}), {})
nodeset_index = number
subset_index = number
}))
Expand Down
1 change: 1 addition & 0 deletions soperator/modules/slurm/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ resource "helm_release" "soperator_fluxcd_cm" {
cluster_name = var.cluster_name
region = var.region
public_o11y_enabled = var.public_o11y_enabled
has_local_nvme = anytrue([for nodeset in var.worker_nodesets : try(nodeset.local_nvme.enabled, false)])
metrics_collector = local.metrics_collector
create_pvcs = var.create_pvcs

Expand Down
Loading
Loading