Changes from all commits (40)
214a42f
Add autoscaling support
roman-iurkov Feb 17, 2026
ecc4ce7
Get rid of magic calculations
roman-iurkov Feb 19, 2026
8a63a4c
Include modules changes
roman-iurkov Feb 19, 2026
4e2a4bb
Added Nemotron Nano V2 VL NIM
Feb 20, 2026
33a2eb3
fixed type
Feb 20, 2026
6f34665
moved nemo vl to cosmos lb
Feb 20, 2026
593342c
Merge pull request #812 from nebius/merge-to-main-from/SCHED-987/0
Uburro Feb 23, 2026
d003d3a
Merge pull request #814 from nebius/merge-to-main-from/release-3.0
Uburro Feb 24, 2026
3c970c3
Merge pull request #816 from nebius/merge-to-main-from/fix-cleanup-ac…
theyoprst Feb 24, 2026
79fb9e4
Merge pull request #809 from nebius/nim/nano2-vl
timothy-le7 Feb 25, 2026
ecb7d64
Merge pull request #822 from nebius/merge-to-main-from/release-3.0.1/0
Uburro Feb 25, 2026
edf5d3e
SCHED-411: update configmaps mode
itechdima Feb 25, 2026
59cd15f
Merge branch 'main' into merge-to-main-from/release-2.0.2/0
Uburro Feb 25, 2026
5a0be94
Merge pull request #821 from nebius/merge-to-main-from/release-2.0.2/0
Uburro Feb 25, 2026
ec807ae
Merge pull request #823 from nebius/SCHED-411/use-native-pyxis
itechdima Feb 26, 2026
f77f062
Merge pull request #826 from nebius/merge-to-main-from/SCHED-1023/e2e…
theyoprst Feb 26, 2026
0f88ff1
Merge pull request #828 from nebius/merge-to-main-from/SCHED-1023/ret…
theyoprst Feb 27, 2026
1bf33cc
Remove comments
roman-iurkov Feb 27, 2026
7a7991e
Merge pull request #800 from nebius/ri/add-autoscaling-support
roman-iurkov Feb 27, 2026
4de6d13
Merge pull request #831 from nebius/merge-to-main-from/kubectl-retry
theyoprst Mar 2, 2026
f345e9b
Merge pull request #833 from nebius/merge-to-main-from/SCHED-993/reve…
theyoprst Mar 2, 2026
e2edd51
Merge pull request #836 from nebius/merge-to-main-from/SCHED-993/merg…
theyoprst Mar 2, 2026
d286393
Merge pull request #838 from nebius/merge-to-main-from/merge-to-soper…
theyoprst Mar 3, 2026
5c42f36
Merge pull request #841 from nebius/merge-to-main-from/SCHED-993/fix-…
theyoprst Mar 3, 2026
c46759c
(skip) Merge pull request #843 from nebius/merge-to-main-from/merge-t…
theyoprst Mar 3, 2026
f5c719b
Switch to preemptible nodes and remove loki (#808)
roman-iurkov Mar 3, 2026
5ee7a62
Merge pull request #846 from nebius/merge-to-main-from/merge-to-soper…
theyoprst Mar 3, 2026
e71ca87
Use a bigger preset for controller nodes by default
rdjjke Mar 3, 2026
b7740ec
ARCHVTEAMS-1267 remove stale NPD variables from k8s-training
Mar 3, 2026
9adba49
Merge pull request #848 from nebius/SCHED-1067/0
rdjjke Mar 4, 2026
cc9c14e
Don't fail disk_cleanup when there are no disks to delete
rdjjke Mar 4, 2026
4890c8d
Merge pull request #847 from nebius/ARCHVTEAMS-1267/remove-npd-stale-…
aaronbfagan Mar 4, 2026
5f845d7
Merge pull request #849 from nebius/fix-disk-cleanup/0
rdjjke Mar 4, 2026
809f4e9
let there be nvme
ali-sattari Mar 6, 2026
7ee7e82
check region/platform/preset for validation
ali-sattari Mar 6, 2026
76a57c6
small fix
ali-sattari Mar 6, 2026
441dcf0
one mount_path to rule them all
ali-sattari Mar 6, 2026
7149138
if nvme then enable hc script
ali-sattari Mar 12, 2026
177b428
fixes and xfs
ali-sattari Mar 13, 2026
4d5ac8d
based on info in #proj-local-disks
ali-sattari Mar 13, 2026
k8s-training/helm.tf: 14 changes (6 additions, 8 deletions)
@@ -39,12 +39,10 @@ module "device-plugin" {
 }
 
 module "o11y" {
-  source          = "../modules/o11y"
-  parent_id       = var.parent_id
-  tenant_id       = var.tenant_id
-  cluster_id      = nebius_mk8s_v1_cluster.k8s-cluster.id
-  cpu_nodes_count = var.cpu_nodes_count
-  gpu_nodes_count = var.gpu_nodes_count_per_group * var.gpu_node_groups
+  source     = "../modules/o11y"
+  parent_id  = var.parent_id
+  tenant_id  = var.tenant_id
+  cluster_id = nebius_mk8s_v1_cluster.k8s-cluster.id
   k8s_node_group_sa_id      = var.enable_k8s_node_group_sa ? nebius_iam_v1_service_account.k8s_node_group_sa[0].id : null
   k8s_node_group_sa_enabled = var.enable_k8s_node_group_sa

@@ -57,8 +55,8 @@ module "o11y" {
     enabled = var.enable_grafana
   }
   loki = {
-    enabled            = var.enable_loki
-    replication_factor = var.loki_custom_replication_factor
+    enabled            = var.loki.enabled
+    replication_factor = var.loki.replication_factor
     region             = var.region
   }
   prometheus = {
k8s-training/locals.tf: 37 changes (0 additions, 37 deletions)
@@ -74,43 +74,6 @@ locals {
     "gpu-b300-sxm" = ["all-disabled", "all-enabled", "all-balanced", "all-1g.23gb", "all-1g.23gb.me", "all-1g.45gb", "all-2g.45gb", "all-3g.90gb", "all-4g.90gb", "all-7g.180gb"]
 
   }
-
-  # Mapping from platform and preset to hardware profile for nebius-gpu-health-checker
-  platform_preset_to_hardware_profile = {
-    # H100 configurations
-    "gpu-h100-sxm-1gpu-16vcpu-200gb"   = "1xH100"
-    "gpu-h100-sxm-8gpu-128vcpu-1600gb" = "8xH100"
-
-    # H200 configurations
-    "gpu-h200-sxm-1gpu-16vcpu-200gb"   = "1xH200"
-    "gpu-h200-sxm-8gpu-128vcpu-1600gb" = "8xH200"
-
-    # B200 configurations
-    "gpu-b200-sxm-1gpu-20vcpu-224gb"     = "1xB200"
-    "gpu-b200-sxm-8gpu-160vcpu-1792gb"   = "8xB200"
-    "gpu-b200-sxm-a-8gpu-160vcpu-1792gb" = "8xB200"
-
-    #B300 configuration
-    "gpu-b300-sxm-8gpu-192vcpu-2768gb" = "8xB300"
-    "gpu-b300-sxm-1gpu-24vcpu-346gb"   = "1xB300"
-
-    # L40 configurations
-    "gpu-l40s-d-1gpu-16vcpu-96gb"    = "1xL40S"
-    "gpu-l40s-d-1gpu-32vcpu-192gb"   = "1xL40S"
-    "gpu-l40s-d-1gpu-48vcpu-288gb"   = "1xL40S"
-    "gpu-l40s-d-2gpu-64vcpu-384gb"   = "2xL40S"
-    "gpu-l40s-d-2gpu-64vcpu-384gb"   = "2xL40S"
-    "gpu-l40s-d-2gpu-96vcpu-576gb"   = "2xL40S"
-    "gpu-l40s-d-4gpu-128vcpu-768gb"  = "4xL40S"
-    "gpu-l40s-d-4gpu-192vcpu-1152gb" = "4xL40S"
-    "gpu-l40s-a-1gpu-8vcpu-32gb"     = "1XL40S"
-    "gpu-l40s-a-1gpu-24vcpu-96gb"    = "1X40S"
-    "gpu-l40s-a-1gpu-32vcpu-128gb"   = "1X40S"
-    "gpu-l40s-a-1gpu-40vcpu-160gb"   = "1X40S"
-  }
-
-  # Create the key for hardware profile lookup
-  hardware_profile_key = "${local.gpu_nodes_platform}-${local.gpu_nodes_preset}"
 }
 
 resource "random_string" "random" {
k8s-training/main.tf: 31 changes (23 additions, 8 deletions)
@@ -48,9 +48,17 @@ resource "nebius_iam_v1_group_membership" "k8s_node_group_sa-admin" {
 # CPU NODE GROUP
 ################
 resource "nebius_mk8s_v1_node_group" "cpu-only" {
-  fixed_node_count = var.cpu_nodes_count
-  parent_id        = nebius_mk8s_v1_cluster.k8s-cluster.id
-  name             = "${var.cluster_name}-ng-cpu"
+
+  autoscaling = var.cpu_nodes_autoscaling.enabled ? {
+    min_node_count = var.cpu_nodes_autoscaling.min_size == null ? var.cpu_nodes_autoscaling.max_size : var.cpu_nodes_autoscaling.min_size
+    max_node_count = var.cpu_nodes_autoscaling.max_size
+  } : null
+
+  fixed_node_count = var.cpu_nodes_autoscaling.enabled ? null : var.cpu_nodes_fixed_count
+
+
+  parent_id = nebius_mk8s_v1_cluster.k8s-cluster.id
+  name      = "${var.cluster_name}-ng-cpu"
   labels = {
     "library-solution" : "k8s-training",
   }
@@ -101,13 +109,20 @@
   }
 }
 #################
-# GPU nODE GROUPS
+# GPU NODE GROUPS
 #################
 resource "nebius_mk8s_v1_node_group" "gpu" {
-  count            = var.gpu_node_groups
-  fixed_node_count = var.gpu_nodes_count_per_group
-  parent_id        = nebius_mk8s_v1_cluster.k8s-cluster.id
-  name             = "${var.cluster_name}-ng-gpu-${count.index}"
+  count = var.gpu_node_groups
+
+  autoscaling = var.gpu_nodes_autoscaling.enabled ? {
+    min_node_count = var.gpu_nodes_autoscaling.min_size == null ? var.gpu_nodes_autoscaling.max_size : var.gpu_nodes_autoscaling.min_size
+    max_node_count = var.gpu_nodes_autoscaling.max_size
+  } : null
+
+  fixed_node_count = var.gpu_nodes_autoscaling.enabled ? null : var.gpu_nodes_fixed_count_per_group
+
+  parent_id = nebius_mk8s_v1_cluster.k8s-cluster.id
+  name      = "${var.cluster_name}-ng-gpu-${count.index}"
   labels = {
     "library-solution" : "k8s-training",
   }
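To make the conditional above concrete: when autoscaling is enabled with `min_size = null`, the node group pins its minimum to `max_size`, so it never scales down; when autoscaling is disabled, only `fixed_node_count` is set. A minimal sketch, assuming the variables introduced in this PR:

```hcl
# Hypothetical terraform.tfvars (sketch, not part of this PR):
gpu_nodes_autoscaling = {
  enabled  = true
  min_size = null # resolves to min_node_count = max_node_count = 8 (no scale-down)
  max_size = 8
}
# With enabled = true, fixed_node_count is forced to null, so the two sizing modes never conflict.
```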
k8s-training/terraform.tfvars: 30 changes (25 additions, 5 deletions)
@@ -9,9 +9,27 @@ ssh_public_key = {
 }
 
 # K8s nodes
-cpu_nodes_count           = 2 # Number of CPU nodes
-gpu_nodes_count_per_group = 2 # Number of GPU nodes per group
-gpu_node_groups           = 1 # In case you need more then 100 nodes in cluster you have to put multiple node groups
+cpu_nodes_fixed_count = 2 # Used only when cpu_nodes_autoscaling.enabled = false
+cpu_nodes_autoscaling = {
+  enabled = false
+  # min_size options:
+  # - null: min = max, no scale-down (default, recommended; saves ~10 min on initial provisioning).
+  #   It can be changed to a number later if needed.
+  # - N: can scale down to N nodes
+  min_size = null
+  max_size = 4
+}
+gpu_nodes_fixed_count_per_group = 1 # Number of GPU nodes per group; used only when gpu_nodes_autoscaling.enabled = false
+gpu_nodes_autoscaling = {
+  enabled = false
+  # min_size options:
+  # - null: min = max, no scale-down (default, recommended; saves ~10 min on initial provisioning).
+  #   It can be changed to a number later if needed.
+  # - N: can scale down to N nodes
+  min_size = null
+  max_size = 1
+}
+gpu_node_groups = 1 # If you need more than 100 nodes in the cluster, you have to use multiple node groups
 # CPU platform and presets: https://docs.nebius.com/compute/virtual-machines/types#cpu-configurations
 cpu_nodes_platform = "cpu-d3"     # CPU nodes platform
 cpu_nodes_preset   = "4vcpu-16gb" # CPU nodes preset
@@ -42,8 +60,10 @@ enable_grafana = true # Enable or disable Grafana® solution by Nebius
 
 # Local Observability installation
 enable_prometheus = false # Enable or disable Prometheus and Grafana deployment with true or false
-enable_loki       = false # Enable or disable Loki deployment with true or false
-
+loki = {
+  enabled            = true # Enable or disable Loki deployment with true or false
+  replication_factor = 2    # Number of Loki replicas for each log chunk (higher = better availability, more storage/network cost)
+}
 # Storage
 enable_filestore   = false # Enable or disable Filestore integration with true or false
 existing_filestore = ""    # If enable_filestore = true, this variable adds an existing filestore. Requires a string, for example existing_filestore = "computefilesystem-e00r7z9vfxmg1bk99s"
k8s-training/tests/k8s-training-kuberay.tftest.hcl: 3 changes (1 addition, 2 deletions)
@@ -1,8 +1,7 @@
 ###GLOBAL VARIABLES OVERWRITE BLOCK###
 variables {
   gpu_nodes_platform = "gpu-h100-sxm"
-  enable_loki        = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket
   etcd_cluster_size  = 1
+  gpu_nodes_preemptible = true
 }
 ######
 run "k8s_training_kuberay_apply" {
k8s-training/tests/main.tftest.hcl: 2 changes (1 addition, 1 deletion)
@@ -1,7 +1,7 @@
 ###GLOBAL VARIABLES OVERWRITE BLOCK###
 variables {
   gpu_nodes_platform = "gpu-h100-sxm"
-  enable_loki        = false # TODO: Disabling Loki since not possible to delete non-empty storage bucket
+  gpu_nodes_preemptible = true
 }
 ######
 run "k8s_training_apply" {
k8s-training/variables.tf: 37 changes (26 additions, 11 deletions)
@@ -98,7 +98,7 @@ variable "ssh_public_key" {
 }
 
 # K8s CPU node group
-variable "cpu_nodes_count" {
+variable "cpu_nodes_fixed_count" {
   description = "Number of nodes in the CPU-only node group."
   type        = number
   default     = 3
@@ -129,12 +129,30 @@ variable "cpu_disk_size" {
 }
 
 # K8s GPU node group
-variable "gpu_nodes_count_per_group" {
+variable "gpu_nodes_fixed_count_per_group" {
   description = "Number of nodes in the GPU node group."
   type        = number
   default     = 2
 }
 
+variable "gpu_nodes_autoscaling" {
+  type = object({
+    enabled  = optional(bool, false)
+    min_size = optional(number)
+    max_size = optional(number)
+  })
+  default = {}
+}
+
+variable "cpu_nodes_autoscaling" {
+  type = object({
+    enabled  = optional(bool, false)
+    min_size = optional(number)
+    max_size = optional(number)
+  })
+  default = {}
+}
+
 variable "gpu_node_groups" {
   description = "Number of GPU node groups."
   type        = number
@@ -238,10 +256,12 @@ variable "enable_grafana" {
   default     = true
 }
 
-variable "enable_loki" {
-  description = "Enable Loki for logs aggregation."
-  type        = bool
-  default     = true
+variable "loki" {
+  type = object({
+    enabled            = optional(bool, false)
+    region             = optional(string)
+    replication_factor = optional(number)
+  })
 }
 
 variable "enable_prometheus" {
@@ -367,11 +387,6 @@ variable "gpu_nodes_preemptible" {
   default = false
 }
 
-variable "gpu_health_cheker" {
-  description = "Use preemptible VMs for GPU nodes"
-  type        = bool
-  default     = true
-}
 variable "custom_driver" {
   description = "Use customized driver for the GPU Operator, e.g. to run Cuda 13 on H200"
   type        = bool
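One thing the new object variables do not guard against is `enabled = true` with `max_size` left `null`, which would feed a null `max_node_count` into the node groups above. A hedged sketch of a `validation` block that could catch this at plan time (not in this PR; standard Terraform variable validation syntax):

```hcl
variable "gpu_nodes_autoscaling" {
  type = object({
    enabled  = optional(bool, false)
    min_size = optional(number)
    max_size = optional(number)
  })
  default = {}

  # Reject enabled autoscaling without an upper bound.
  validation {
    condition     = !var.gpu_nodes_autoscaling.enabled || var.gpu_nodes_autoscaling.max_size != null
    error_message = "gpu_nodes_autoscaling.max_size must be set when autoscaling is enabled."
  }
}
```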
modules/nims/Readme.md: 7 changes (7 additions, 0 deletions)
@@ -38,6 +38,7 @@ The module exposes NIMs using a shared LoadBalancer per group. The demo UI expects:
 - **Cosmos-Reason2-8B** → `8001`
 - **Cosmos-Reason2-2B** → `8002`
 - **Cosmos-Embed1** → `8003`
+- **Nemotron Nano 12B v2 VL (Nano2 VL)** → `8004` *(if enabled in this module)*
 
 ### BioNeMo (Separate LB)
 - Deployed via `bionemo.tf` (ports and services defined there).
@@ -107,6 +108,9 @@ The module exposes NIMs using a shared LoadBalancer per group. The demo UI expects:
 - `bionemo.tf`
   Deploys BioNeMo NIMs on a separate LoadBalancer.
 
+- `nemotron-nano-12b-v2-vl.tf`
+  Deploys Nemotron Nano 12B v2 VL (Nano2 VL), a vision-language model for document intelligence, video understanding, and multimodal reasoning.
+
 ---
 
 ## How the Demo App Uses This Module
@@ -170,5 +174,8 @@ module "nims" {
   cosmos_reason2_8b = true
   cosmos_reason2_2b = true
   cosmos_embed1     = true
+
+  # Vision-language (Nano2 VL)
+  nemotron_nano_12b_v2_vl = true
 }
 ```
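Once the module is applied, the new NIM is reachable through the shared Cosmos LoadBalancer on port `8004`. A hedged sketch of checking it from Terraform itself (the `hashicorp/http` data source and the `/v1/health/ready` path are assumptions, as is the `cosmos_lb_ip` variable; adjust to however you surface the LB address):

```hcl
# Sketch: probe the Nano2 VL NIM readiness endpoint through the shared LB.
data "http" "nano2_vl_ready" {
  url = "http://${var.cosmos_lb_ip}:8004/v1/health/ready" # var.cosmos_lb_ip is hypothetical
}
```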
modules/nims/cosmos-proxy.tf: 21 changes (21 additions, 0 deletions)
@@ -66,6 +66,17 @@ resource "kubernetes_config_map_v1" "cosmos_tcp_proxy" {
         proxy_timeout 600s;
         proxy_connect_timeout 10s;
       }
+
+      # Port 8004 -> nemotron-nano-12b-v2-vl (Nano2 VL)
+      upstream nemotron_nano_12b_v2_vl {
+        server nemotron-nano-12b-v2-vl-svc.${var.namespace}.svc.cluster.local:8000;
+      }
+      server {
+        listen 8004;
+        proxy_pass nemotron_nano_12b_v2_vl;
+        proxy_timeout 600s;
+        proxy_connect_timeout 10s;
+      }
     }
   EOF
 }
@@ -122,6 +133,9 @@ resource "kubernetes_deployment_v1" "cosmos_tcp_proxy" {
           port {
             container_port = 8003
           }
+          port {
+            container_port = 8004
+          }
 
           volume_mount {
             name = "nginx-config"
@@ -183,5 +197,12 @@ resource "kubernetes_service_v1" "cosmos_lb" {
       target_port = 8003
       protocol    = "TCP"
     }
+
+    port {
+      name        = "nemotron-nano-12b-v2-vl"
+      port        = 8004
+      target_port = 8004
+      protocol    = "TCP"
+    }
   }
 }
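The port mapping above is only useful once the LB address is surfaced somewhere. A hypothetical output (not part of this PR) that would expose the Nano2 VL endpoint, assuming the standard `kubernetes` provider status attributes:

```hcl
# Sketch: expose the shared Cosmos LB address for Nano2 VL (port 8004).
output "nemotron_nano_12b_v2_vl_endpoint" {
  value = try(
    "http://${kubernetes_service_v1.cosmos_lb.status[0].load_balancer[0].ingress[0].ip}:8004",
    null # the LB ingress may not be populated yet on first apply
  )
}
```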