k8s-training: optional GPU reservation policy; quieter service-account lookup

* environment.sh: discard stderr of the `service-account get-by-name` lookup
  and select `.metadata.id // empty`, so jq prints nothing (instead of the
  string "null") when no id is present and the following `-z` test falls
  through to `service-account create`.
* main.tf: set `reservation_policy` on the "gpu" node group only when
  `var.gpu_nodes_reservation_policy` is non-null.
* terraform.tfvars: add a commented-out example of the new variable.
* variables.tf: declare `gpu_nodes_reservation_policy` (default null) and
  validate that `policy` is AUTO, STRICT or FORBID. The condition is written
  as a conditional expression rather than `== null || contains(...)` so the
  null default never reaches the `.policy` attribute access.

NOTE(review): indentation and blank context lines were rebuilt from a
whitespace-collapsed copy of this patch; confirm them against the tree, or
apply with `git apply --ignore-whitespace`.

diff --git a/k8s-training/environment.sh b/k8s-training/environment.sh
index 83053058b..c9ab395c6 100644
--- a/k8s-training/environment.sh
+++ b/k8s-training/environment.sh
@@ -52,9 +52,8 @@ NEBIUS_SA_NAME="k8s-training-sa"
 NEBIUS_SA_ID=$(nebius iam service-account get-by-name \
   --parent-id "${NEBIUS_PROJECT_ID}" \
   --name "${NEBIUS_SA_NAME}" \
-  --format json \
-  | jq -r '.metadata.id')
-
+  --format json 2>/dev/null \
+  | jq -r '.metadata.id // empty')
 
 if [ -z "$NEBIUS_SA_ID" ]; then
   NEBIUS_SA_ID=$(nebius iam service-account create \
diff --git a/k8s-training/main.tf b/k8s-training/main.tf
index 995489828..96c643be8 100644
--- a/k8s-training/main.tf
+++ b/k8s-training/main.tf
@@ -166,6 +166,10 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
     ] : null
     gpu_cluster  = var.enable_gpu_cluster ? nebius_compute_v1_gpu_cluster.fabric_2[0] : null
     gpu_settings = var.gpu_nodes_driverfull_image ? { drivers_preset = local.device_preset } : null
+    reservation_policy = var.gpu_nodes_reservation_policy != null ? {
+      policy          = var.gpu_nodes_reservation_policy.policy
+      reservation_ids = var.gpu_nodes_reservation_policy.reservation_ids
+    } : null
     preemptible = var.gpu_nodes_preemptible ? {
       on_preemption = "STOP"
       priority      = 3
diff --git a/k8s-training/terraform.tfvars b/k8s-training/terraform.tfvars
index 479f17d34..4526c6a3b 100644
--- a/k8s-training/terraform.tfvars
+++ b/k8s-training/terraform.tfvars
@@ -39,6 +39,12 @@ gpu_nodes_preset = "8gpu-128vcpu-1600gb" # GPU nodes preset: 8gpu-128vcpu-1600
 # Infiniband fabrics: https://docs.nebius.com/compute/clusters/gpu#fabrics
 infiniband_fabric = "" # Infiniband fabric name
 
+# gpu_nodes_reservation_policy = {
+#   policy          = "STRICT"
+#   reservation_ids = ["capacityblockgroup-e00xxxxx"]
+# }
+
+
 gpu_nodes_driverfull_image = true
 enable_k8s_node_group_sa   = true
 enable_egress_gateway      = false
diff --git a/k8s-training/variables.tf b/k8s-training/variables.tf
index 0cd2a1fa4..531385dff 100644
--- a/k8s-training/variables.tf
+++ b/k8s-training/variables.tf
@@ -387,6 +387,21 @@ variable "gpu_nodes_preemptible" {
   default     = false
 }
 
+variable "gpu_nodes_reservation_policy" {
+  description = "Reservation policy for GPU node groups. Policy can be AUTO, STRICT, or FORBID."
+  type = object({
+    policy          = string
+    reservation_ids = list(string)
+  })
+  default = null
+
+  validation {
+    # Conditional instead of `||`: Terraform releases that evaluate both operands of `||` fail on `.policy` when the value is null.
+    condition     = var.gpu_nodes_reservation_policy == null ? true : contains(["AUTO", "STRICT", "FORBID"], var.gpu_nodes_reservation_policy.policy)
+    error_message = "Policy must be one of AUTO, STRICT, or FORBID."
+  }
+}
+
 variable "custom_driver" {
   description = "Use customized driver for the GPU Operator, e.g. to run Cuda 13 on H200"
   type        = bool