Changes from all commits (40)
214a42f
Add autoscaling support
roman-iurkov Feb 17, 2026
ecc4ce7
Get rid of magic calculations
roman-iurkov Feb 19, 2026
8a63a4c
Include modules changes
roman-iurkov Feb 19, 2026
4e2a4bb
Added Nemotron Nano V2 VL NIM
Feb 20, 2026
33a2eb3
fixed type
Feb 20, 2026
6f34665
moved nemo vl to cosmos lb
Feb 20, 2026
593342c
Merge pull request #812 from nebius/merge-to-main-from/SCHED-987/0
Uburro Feb 23, 2026
d003d3a
Merge pull request #814 from nebius/merge-to-main-from/release-3.0
Uburro Feb 24, 2026
3c970c3
Merge pull request #816 from nebius/merge-to-main-from/fix-cleanup-ac…
theyoprst Feb 24, 2026
79fb9e4
Merge pull request #809 from nebius/nim/nano2-vl
timothy-le7 Feb 25, 2026
ecb7d64
Merge pull request #822 from nebius/merge-to-main-from/release-3.0.1/0
Uburro Feb 25, 2026
edf5d3e
SCHED-411: update configmaps mode
itechdima Feb 25, 2026
59cd15f
Merge branch 'main' into merge-to-main-from/release-2.0.2/0
Uburro Feb 25, 2026
5a0be94
Merge pull request #821 from nebius/merge-to-main-from/release-2.0.2/0
Uburro Feb 25, 2026
ec807ae
Merge pull request #823 from nebius/SCHED-411/use-native-pyxis
itechdima Feb 26, 2026
f77f062
Merge pull request #826 from nebius/merge-to-main-from/SCHED-1023/e2e…
theyoprst Feb 26, 2026
0f88ff1
Merge pull request #828 from nebius/merge-to-main-from/SCHED-1023/ret…
theyoprst Feb 27, 2026
1bf33cc
Remove comments
roman-iurkov Feb 27, 2026
7a7991e
Merge pull request #800 from nebius/ri/add-autoscaling-support
roman-iurkov Feb 27, 2026
4de6d13
Merge pull request #831 from nebius/merge-to-main-from/kubectl-retry
theyoprst Mar 2, 2026
f345e9b
Merge pull request #833 from nebius/merge-to-main-from/SCHED-993/reve…
theyoprst Mar 2, 2026
e2edd51
Merge pull request #836 from nebius/merge-to-main-from/SCHED-993/merg…
theyoprst Mar 2, 2026
d286393
Merge pull request #838 from nebius/merge-to-main-from/merge-to-soper…
theyoprst Mar 3, 2026
5c42f36
Merge pull request #841 from nebius/merge-to-main-from/SCHED-993/fix-…
theyoprst Mar 3, 2026
c46759c
(skip) Merge pull request #843 from nebius/merge-to-main-from/merge-t…
theyoprst Mar 3, 2026
f5c719b
Switch to preemptible nodes and remove loki (#808)
roman-iurkov Mar 3, 2026
5ee7a62
Merge pull request #846 from nebius/merge-to-main-from/merge-to-soper…
theyoprst Mar 3, 2026
e71ca87
Use a bigger preset for controller nodes by default
rdjjke Mar 3, 2026
b7740ec
ARCHVTEAMS-1267 remove stale NPD variables from k8s-training
Mar 3, 2026
9adba49
Merge pull request #848 from nebius/SCHED-1067/0
rdjjke Mar 4, 2026
cc9c14e
Don't fail disk_cleanup when there are no disks to delete
rdjjke Mar 4, 2026
4890c8d
Merge pull request #847 from nebius/ARCHVTEAMS-1267/remove-npd-stale-…
aaronbfagan Mar 4, 2026
5f845d7
Merge pull request #849 from nebius/fix-disk-cleanup/0
rdjjke Mar 4, 2026
809f4e9
let there be nvme
ali-sattari Mar 6, 2026
7ee7e82
check region/platform/preset for validation
ali-sattari Mar 6, 2026
76a57c6
small fix
ali-sattari Mar 6, 2026
441dcf0
one mount_path to rule them all
ali-sattari Mar 6, 2026
7149138
if nvme then enable hc script
ali-sattari Mar 12, 2026
177b428
fixes and xfs
ali-sattari Mar 13, 2026
4d5ac8d
based on info in #proj-local-disks
ali-sattari Mar 13, 2026
k8s-training/helm.tf: 14 changes (6 additions, 8 deletions)
@@ -39,12 +39,10 @@ module "device-plugin" {
 }
 
 module "o11y" {
-  source          = "../modules/o11y"
-  parent_id       = var.parent_id
-  tenant_id       = var.tenant_id
-  cluster_id      = nebius_mk8s_v1_cluster.k8s-cluster.id
-  cpu_nodes_count = var.cpu_nodes_count
-  gpu_nodes_count = var.gpu_nodes_count_per_group * var.gpu_node_groups
+  source     = "../modules/o11y"
+  parent_id  = var.parent_id
+  tenant_id  = var.tenant_id
+  cluster_id = nebius_mk8s_v1_cluster.k8s-cluster.id
   k8s_node_group_sa_id      = var.enable_k8s_node_group_sa ? nebius_iam_v1_service_account.k8s_node_group_sa[0].id : null
   k8s_node_group_sa_enabled = var.enable_k8s_node_group_sa

@@ -57,8 +55,8 @@ module "o11y" {
     enabled = var.enable_grafana
   }
   loki = {
-    enabled            = var.enable_loki
-    replication_factor = var.loki_custom_replication_factor
+    enabled            = var.loki.enabled
+    replication_factor = var.loki.replication_factor
     region             = var.region
   }
   prometheus = {
k8s-training/locals.tf: 37 changes (0 additions, 37 deletions)
@@ -74,43 +74,6 @@ locals {
     "gpu-b300-sxm" = ["all-disabled", "all-enabled", "all-balanced", "all-1g.23gb", "all-1g.23gb.me", "all-1g.45gb", "all-2g.45gb", "all-3g.90gb", "all-4g.90gb", "all-7g.180gb"]
 
   }
-
-  # Mapping from platform and preset to hardware profile for nebius-gpu-health-checker
-  platform_preset_to_hardware_profile = {
-    # H100 configurations
-    "gpu-h100-sxm-1gpu-16vcpu-200gb"   = "1xH100"
-    "gpu-h100-sxm-8gpu-128vcpu-1600gb" = "8xH100"
-
-    # H200 configurations
-    "gpu-h200-sxm-1gpu-16vcpu-200gb"   = "1xH200"
-    "gpu-h200-sxm-8gpu-128vcpu-1600gb" = "8xH200"
-
-    # B200 configurations
-    "gpu-b200-sxm-1gpu-20vcpu-224gb"     = "1xB200"
-    "gpu-b200-sxm-8gpu-160vcpu-1792gb"   = "8xB200"
-    "gpu-b200-sxm-a-8gpu-160vcpu-1792gb" = "8xB200"
-
-    #B300 configuration
-    "gpu-b300-sxm-8gpu-192vcpu-2768gb" = "8xB300"
-    "gpu-b300-sxm-1gpu-24vcpu-346gb"   = "1xB300"
-
-    # L40 configurations
-    "gpu-l40s-d-1gpu-16vcpu-96gb"    = "1xL40S"
-    "gpu-l40s-d-1gpu-32vcpu-192gb"   = "1xL40S"
-    "gpu-l40s-d-1gpu-48vcpu-288gb"   = "1xL40S"
-    "gpu-l40s-d-2gpu-64vcpu-384gb"   = "2xL40S"
-    "gpu-l40s-d-2gpu-64vcpu-384gb"   = "2xL40S"
-    "gpu-l40s-d-2gpu-96vcpu-576gb"   = "2xL40S"
-    "gpu-l40s-d-4gpu-128vcpu-768gb"  = "4xL40S"
-    "gpu-l40s-d-4gpu-192vcpu-1152gb" = "4xL40S"
-    "gpu-l40s-a-1gpu-8vcpu-32gb"     = "1XL40S"
-    "gpu-l40s-a-1gpu-24vcpu-96gb"    = "1X40S"
-    "gpu-l40s-a-1gpu-32vcpu-128gb"   = "1X40S"
-    "gpu-l40s-a-1gpu-40vcpu-160gb"   = "1X40S"
-  }
-
-  # Create the key for hardware profile lookup
-  hardware_profile_key = "${local.gpu_nodes_platform}-${local.gpu_nodes_preset}"
 }
 
 resource "random_string" "random" {
k8s-training/main.tf: 31 changes (23 additions, 8 deletions)
@@ -48,9 +48,17 @@ resource "nebius_iam_v1_group_membership" "k8s_node_group_sa-admin" {
 # CPU NODE GROUP
 ################
 resource "nebius_mk8s_v1_node_group" "cpu-only" {
-  fixed_node_count = var.cpu_nodes_count
-  parent_id        = nebius_mk8s_v1_cluster.k8s-cluster.id
-  name             = "${var.cluster_name}-ng-cpu"
+
+  autoscaling = var.cpu_nodes_autoscaling.enabled ? {
+    min_node_count = var.cpu_nodes_autoscaling.min_size == null ? var.cpu_nodes_autoscaling.max_size : var.cpu_nodes_autoscaling.min_size
+    max_node_count = var.cpu_nodes_autoscaling.max_size
+  } : null
+
+  fixed_node_count = var.cpu_nodes_autoscaling.enabled ? null : var.cpu_nodes_fixed_count
+
+
+  parent_id = nebius_mk8s_v1_cluster.k8s-cluster.id
+  name      = "${var.cluster_name}-ng-cpu"
   labels = {
     "library-solution" : "k8s-training",
   }
@@ -101,13 +109,20 @@
   }
 }
 #################
-# GPU nODE GROUPS
+# GPU NODE GROUPS
 #################
 resource "nebius_mk8s_v1_node_group" "gpu" {
-  count            = var.gpu_node_groups
-  fixed_node_count = var.gpu_nodes_count_per_group
-  parent_id        = nebius_mk8s_v1_cluster.k8s-cluster.id
-  name             = "${var.cluster_name}-ng-gpu-${count.index}"
+  count = var.gpu_node_groups
+
+  autoscaling = var.gpu_nodes_autoscaling.enabled ? {
+    min_node_count = var.gpu_nodes_autoscaling.min_size == null ? var.gpu_nodes_autoscaling.max_size : var.gpu_nodes_autoscaling.min_size
+    max_node_count = var.gpu_nodes_autoscaling.max_size
+  } : null
+
+  fixed_node_count = var.gpu_nodes_autoscaling.enabled ? null : var.gpu_nodes_fixed_count_per_group
+
+  parent_id = nebius_mk8s_v1_cluster.k8s-cluster.id
+  name      = "${var.cluster_name}-ng-gpu-${count.index}"
   labels = {
     "library-solution" : "k8s-training",
   }
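To make the conditional above concrete: when autoscaling is enabled with `min_size = null`, the node group pins its minimum to `max_size`, so it never scales down; when autoscaling is disabled, only `fixed_node_count` is set. A minimal sketch, assuming the variables introduced in this PR:

```hcl
# Hypothetical terraform.tfvars (sketch, not part of this PR):
gpu_nodes_autoscaling = {
  enabled  = true
  min_size = null # resolves to min_node_count = max_node_count = 8 (no scale-down)
  max_size = 8
}
# With enabled = true, fixed_node_count is forced to null, so the two sizing modes never conflict.
```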
k8s-training/terraform.tfvars: 30 changes (25 additions, 5 deletions)
@@ -9,9 +9,27 @@ ssh_public_key = {
 }
 
 # K8s nodes
-cpu_nodes_count           = 2 # Number of CPU nodes
-gpu_nodes_count_per_group = 2 # Number of GPU nodes per group
-gpu_node_groups           = 1 # In case you need more then 100 nodes in cluster you have to put multiple node groups
+cpu_nodes_fixed_count = 2 # Used only when cpu_nodes_autoscaling.enabled = false
+cpu_nodes_autoscaling = {
+  enabled = false
+  # min_size options:
+  # - null: min = max, no scale-down (default, recommended; saves ~10 min on initial provisioning).
+  #   It can be changed to a number later if needed.
+  # - N: can scale down to N nodes
+  min_size = null
+  max_size = 4
+}
+gpu_nodes_fixed_count_per_group = 1 # Number of GPU nodes per group; used only when gpu_nodes_autoscaling.enabled = false
+gpu_nodes_autoscaling = {
+  enabled = false
+  # min_size options:
+  # - null: min = max, no scale-down (default, recommended; saves ~10 min on initial provisioning).
+  #   It can be changed to a number later if needed.
+  # - N: can scale down to N nodes
+  min_size = null
+  max_size = 1
+}
+gpu_node_groups = 1 # If you need more than 100 nodes in the cluster, you have to use multiple node groups
 # CPU platform and presets: https://docs.nebius.com/compute/virtual-machines/types#cpu-configurations
 cpu_nodes_platform = "cpu-d3"     # CPU nodes platform
 cpu_nodes_preset   = "4vcpu-16gb" # CPU nodes preset
@@ -42,8 +60,10 @@ enable_grafana = true # Enable or disable Grafana® solution by Nebius
 
 # Local Observability installation
 enable_prometheus = false # Enable or disable Prometheus and Grafana deployment with true or false
-enable_loki       = false # Enable or disable Loki deployment with true or false
-
+loki = {
+  enabled            = true # Enable or disable Loki deployment with true or false
+  replication_factor = 2    # Number of Loki replicas for each log chunk (higher = better availability, more storage/network cost)
+}
 # Storage
 enable_filestore   = false # Enable or disable Filestore integration with true or false
 existing_filestore = ""    # If enable_filestore = true, this variable adds an existing filestore. Requires a string, for example existing_filestore = "computefilesystem-e00r7z9vfxmg1bk99s"
k8s-training/tests/k8s-training-kuberay.tftest.hcl: 3 changes (1 addition, 2 deletions)
@@ -1,8 +1,7 @@
 ###GLOBAL VARIABLES OVERWRITE BLOCK###
 variables {
   gpu_nodes_platform = "gpu-h100-sxm"
-  enable_loki        = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket
   etcd_cluster_size  = 1
+  gpu_nodes_preemptible = true
 }
 ######
 run "k8s_training_kuberay_apply" {
k8s-training/tests/main.tftest.hcl: 2 changes (1 addition, 1 deletion)
@@ -1,7 +1,7 @@
 ###GLOBAL VARIABLES OVERWRITE BLOCK###
 variables {
   gpu_nodes_platform = "gpu-h100-sxm"
-  enable_loki        = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket
+  gpu_nodes_preemptible = true
 }
 ######
 run "k8s_training_apply" {
k8s-training/variables.tf: 37 changes (26 additions, 11 deletions)
@@ -98,7 +98,7 @@ variable "ssh_public_key" {
 }
 
 # K8s CPU node group
-variable "cpu_nodes_count" {
+variable "cpu_nodes_fixed_count" {
   description = "Number of nodes in the CPU-only node group."
   type        = number
   default     = 3
@@ -129,12 +129,30 @@ variable "cpu_disk_size" {
 }
 
 # K8s GPU node group
-variable "gpu_nodes_count_per_group" {
+variable "gpu_nodes_fixed_count_per_group" {
   description = "Number of nodes in the GPU node group."
   type        = number
   default     = 2
 }
 
+variable "gpu_nodes_autoscaling" {
+  type = object({
+    enabled  = optional(bool, false)
+    min_size = optional(number)
+    max_size = optional(number)
+  })
+  default = {}
+}
+
+variable "cpu_nodes_autoscaling" {
+  type = object({
+    enabled  = optional(bool, false)
+    min_size = optional(number)
+    max_size = optional(number)
+  })
+  default = {}
+}
+
 variable "gpu_node_groups" {
   description = "Number of GPU node groups."
   type        = number
@@ -238,10 +256,12 @@ variable "enable_grafana" {
   default     = true
 }
 
-variable "enable_loki" {
-  description = "Enable Loki for logs aggregation."
-  type        = bool
-  default     = true
+variable "loki" {
+  type = object({
+    enabled            = optional(bool, false)
+    region             = optional(string)
+    replication_factor = optional(number)
+  })
 }
 
 variable "enable_prometheus" {
@@ -367,11 +387,6 @@ variable "gpu_nodes_preemptible" {
   default = false
 }
 
-variable "gpu_health_cheker" {
-  description = "Use preemptible VMs for GPU nodes"
-  type        = bool
-  default     = true
-}
 variable "custom_driver" {
   description = "Use customized driver for the GPU Operator, e.g. to run Cuda 13 on H200"
   type        = bool
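One thing the new object variables do not guard against is `enabled = true` with `max_size` left `null`, which would feed a null `max_node_count` into the node groups above. A hedged sketch of a `validation` block that could catch this at plan time (not in this PR; standard Terraform variable validation syntax):

```hcl
variable "gpu_nodes_autoscaling" {
  type = object({
    enabled  = optional(bool, false)
    min_size = optional(number)
    max_size = optional(number)
  })
  default = {}

  # Reject enabled autoscaling without an upper bound.
  validation {
    condition     = !var.gpu_nodes_autoscaling.enabled || var.gpu_nodes_autoscaling.max_size != null
    error_message = "gpu_nodes_autoscaling.max_size must be set when autoscaling is enabled."
  }
}
```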
modules/nims/Readme.md: 7 changes (7 additions, 0 deletions)
@@ -38,6 +38,7 @@ The module exposes NIMs using a shared LoadBalancer per group. The demo UI expects:
 - **Cosmos-Reason2-8B** → `8001`
 - **Cosmos-Reason2-2B** → `8002`
 - **Cosmos-Embed1** → `8003`
+- **Nemotron Nano 12B v2 VL (Nano2 VL)** → `8004` *(if enabled in this module)*
 
 ### BioNeMo (Separate LB)
 - Deployed via `bionemo.tf` (ports and services defined there).
@@ -107,6 +108,9 @@ The module exposes NIMs using a shared LoadBalancer per group. The demo UI expects:
 - `bionemo.tf`
   Deploys BioNeMo NIMs on a separate LoadBalancer.
 
+- `nemotron-nano-12b-v2-vl.tf`
+  Deploys Nemotron Nano 12B v2 VL (Nano2 VL), a vision-language model for document intelligence, video understanding, and multimodal reasoning.
+
 ---
 
 ## How the Demo App Uses This Module
@@ -170,5 +174,8 @@ module "nims" {
   cosmos_reason2_8b = true
   cosmos_reason2_2b = true
   cosmos_embed1     = true
+
+  # Vision-language (Nano2 VL)
+  nemotron_nano_12b_v2_vl = true
 }
 ```
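Once the module is applied, the new NIM is reachable through the shared Cosmos LoadBalancer on port `8004`. A hedged sketch of checking it from Terraform itself (the `hashicorp/http` data source and the `/v1/health/ready` path are assumptions, as is the `cosmos_lb_ip` variable; adjust to however you surface the LB address):

```hcl
# Sketch: probe the Nano2 VL NIM readiness endpoint through the shared LB.
data "http" "nano2_vl_ready" {
  url = "http://${var.cosmos_lb_ip}:8004/v1/health/ready" # var.cosmos_lb_ip is hypothetical
}
```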
modules/nims/cosmos-proxy.tf: 21 changes (21 additions, 0 deletions)
@@ -66,6 +66,17 @@ resource "kubernetes_config_map_v1" "cosmos_tcp_proxy" {
         proxy_timeout 600s;
         proxy_connect_timeout 10s;
       }
+
+      # Port 8004 -> nemotron-nano-12b-v2-vl (Nano2 VL)
+      upstream nemotron_nano_12b_v2_vl {
+        server nemotron-nano-12b-v2-vl-svc.${var.namespace}.svc.cluster.local:8000;
+      }
+      server {
+        listen 8004;
+        proxy_pass nemotron_nano_12b_v2_vl;
+        proxy_timeout 600s;
+        proxy_connect_timeout 10s;
+      }
     }
   EOF
 }
@@ -122,6 +133,9 @@ resource "kubernetes_deployment_v1" "cosmos_tcp_proxy" {
           port {
             container_port = 8003
           }
+          port {
+            container_port = 8004
+          }
 
           volume_mount {
             name = "nginx-config"
@@ -183,5 +197,12 @@ resource "kubernetes_service_v1" "cosmos_lb" {
       target_port = 8003
       protocol    = "TCP"
     }
+
+    port {
+      name        = "nemotron-nano-12b-v2-vl"
+      port        = 8004
+      target_port = 8004
+      protocol    = "TCP"
+    }
   }
 }
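The port mapping above is only useful once the LB address is surfaced somewhere. A hypothetical output (not part of this PR) that would expose the Nano2 VL endpoint, assuming the standard `kubernetes` provider status attributes:

```hcl
# Sketch: expose the shared Cosmos LB address for Nano2 VL (port 8004).
output "nemotron_nano_12b_v2_vl_endpoint" {
  value = try(
    "http://${kubernetes_service_v1.cosmos_lb.status[0].load_balancer[0].ingress[0].ip}:8004",
    null # the LB ingress may not be populated yet on first apply
  )
}
```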