198 changes: 197 additions & 1 deletion k8s-training/README.md
@@ -123,6 +123,59 @@ filestore_disk_size = 100 * (1024 * 1024 * 1024) #Set the Filestore disk size i
filestore_block_size = 4096 # Set the Filestore block size in bytes
```

### Karpenter (Automatic Node Provisioning)

```hcl
# Karpenter
enable_karpenter = true # Enable Karpenter for automatic node scaling
karpenter_create_nodepools = true # Create default CPU and GPU NodePools
```

When Karpenter is enabled, it automatically provisions nodes based on pending pod requirements. This is ideal for:
- **Dynamic workloads**: Inference services, batch jobs, dev/test environments
- **Cost optimization**: Scale down to zero when idle, scale up on demand
- **Mixed workload types**: Different instance types for different workloads

#### Understanding Static Nodes vs Karpenter

**Static node groups** (`cpu_nodes_count`, `gpu_nodes_count_per_group`) are Terraform-managed and **always running** regardless of workload. **Karpenter** provisions **additional** nodes dynamically when pods are pending.

| Configuration | Behavior |
|--------------|----------|
| `gpu_nodes_count_per_group = 2` + Karpenter | 2 GPU nodes always running + Karpenter adds more if needed |
| `gpu_nodes_count_per_group = 0` + Karpenter | No static GPU nodes, Karpenter provisions on-demand (scale-to-zero) |
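
The hybrid row of the table above can be sketched as a `terraform.tfvars` fragment (variable names are taken from this repo's configuration; the counts are illustrative — adjust them to the baseline you need):

```hcl
# Hybrid setup: a static GPU baseline plus Karpenter for burst capacity.
# Two GPU nodes are always running; Karpenter adds more only when pods are pending.
gpu_nodes_count_per_group = 2
gpu_node_groups           = 1

enable_karpenter           = true
karpenter_create_nodepools = true
```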

#### Recommended Configuration for Karpenter

To let Karpenter fully manage GPU scaling (including scale-to-zero for cost savings):

```hcl
# Keep small CPU node group for system workloads (Karpenter controller, monitoring, etc.)
cpu_nodes_count = 2

# Let Karpenter manage all GPU nodes dynamically
gpu_nodes_count_per_group = 0
gpu_node_groups = 0

# Enable Karpenter
enable_karpenter = true
karpenter_create_nodepools = true
```

#### How Karpenter Works

1. You deploy a workload requesting resources (e.g., `nvidia.com/gpu: 1`)
2. Pod stays **Pending** because no suitable node exists
3. Karpenter detects the pending pod within seconds
4. Karpenter provisions an appropriate node automatically
5. Pod gets scheduled on the new node
6. When the workload is deleted, Karpenter removes the idle node after the consolidation period (~1 min for CPU, ~5 min for GPU)

**Important notes:**
- Keep at least 2 CPU nodes (`cpu_nodes_count = 2`) for system workloads (Karpenter controller, monitoring)
- For **InfiniBand GPU workloads** (distributed training), use static GPU node groups instead of Karpenter
- Karpenter-provisioned GPU nodes are standalone (no InfiniBand connectivity)
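
For InfiniBand training workloads that must land on static GPU node groups, one option is to repel pods from Karpenter-provisioned nodes using the standard `karpenter.sh/nodepool` label, which Karpenter sets on every node it creates. This is a sketch, not the repo's prescribed method; label your static node groups explicitly if you need stricter placement:

```yaml
# Pod-spec fragment: schedule only onto nodes NOT provisioned by Karpenter.
# Karpenter labels its nodes with karpenter.sh/nodepool; static
# Terraform-managed node groups do not carry this label.
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: karpenter.sh/nodepool
              operator: DoesNotExist
```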

You can use Filestore to add external storage to K8s clusters. This allows you to create Read-Write-Many HostPath PVCs in a K8s cluster. Use the following path for Filestore: `/mnt/filestore`.

For more information on how to access storage in K8s, see [Accessing storage](#accessing-storage).
@@ -232,4 +285,147 @@ spec:
## Good to know:
- Read-Write-Many mode PVs will work
- MSP has started testing this solution to enable early integration with mk8s.

## Karpenter Usage Examples

When Karpenter is enabled, nodes are provisioned automatically based on pod resource requests. Below are example workloads for CPU and GPU.

### Example: CPU Workload with Karpenter

This example deploys a simple nginx Deployment for which Karpenter will provision CPU nodes:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: cpu-workload-example
spec:
replicas: 3
selector:
matchLabels:
app: cpu-workload
template:
metadata:
labels:
app: cpu-workload
spec:
containers:
- name: nginx
image: nginx:latest
resources:
requests:
cpu: "2"
memory: "4Gi"
limits:
cpu: "4"
memory: "8Gi"
```

Apply with: `kubectl apply -f cpu-workload.yaml`

Karpenter will automatically:
1. Detect pending pods that cannot be scheduled
2. Provision appropriate CPU nodes based on resource requirements
3. Schedule the pods on the new nodes

### Example: GPU Workload with Karpenter

This example deploys a GPU workload for which Karpenter will provision GPU nodes:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: gpu-inference-example
spec:
replicas: 1
selector:
matchLabels:
app: gpu-inference
template:
metadata:
labels:
app: gpu-inference
spec:
containers:
- name: cuda-vectoradd
image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04
resources:
requests:
nvidia.com/gpu: "1"
limits:
nvidia.com/gpu: "1"
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
```

Apply with: `kubectl apply -f gpu-workload.yaml`

Karpenter will automatically:
1. Detect the GPU resource request
2. Provision a GPU node with CUDA drivers (using the `gpu` NebiusNodeClass)
3. Schedule the pod on the new GPU node

### Scaling to Zero

When workloads are removed or scaled down, Karpenter automatically consolidates and removes unused nodes:

```bash
# Scale down the deployment
kubectl scale deployment gpu-inference-example --replicas=0

# Karpenter will remove the idle GPU node after the consolidation period (default: 5 minutes for GPU)
```

### Custom NodePools

For advanced use cases, you can create custom NodePools. Example for a specific GPU type:

```yaml
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
name: h100-inference
spec:
template:
metadata:
labels:
workload-type: inference
spec:
requirements:
- key: karpenter.k8s.nebius/instance-gpu-count
operator: In
values: ["1", "8"]
- key: node.kubernetes.io/instance-type
operator: In
values: ["gpu-h100-sxm-1gpu-16vcpu-200gb", "gpu-h100-sxm-8gpu-128vcpu-1600gb"]
nodeClassRef:
group: karpenter.k8s.nebius
kind: NebiusNodeClass
name: gpu
limits:
nvidia.com/gpu: "16"
disruption:
consolidationPolicy: WhenEmptyOrUnderutilized
consolidateAfter: 10m
```
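
A workload can then be steered onto this pool via the `workload-type: inference` label defined in the NodePool template above. The fragment below is a sketch (the container name and image are placeholders):

```yaml
# Pod-template fragment: select the custom pool's label so Karpenter
# provisions capacity from the h100-inference NodePool.
spec:
  nodeSelector:
    workload-type: inference
  tolerations:
    - key: "nvidia.com/gpu"
      operator: "Exists"
      effect: "NoSchedule"
  containers:
    - name: inference-server
      image: nginx:latest # placeholder image
      resources:
        limits:
          nvidia.com/gpu: "1"
```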

### Monitoring Karpenter

View Karpenter logs:
```bash
kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter -f
```

View provisioned nodes:
```bash
kubectl get nodes -l karpenter.sh/registered=true
```

View NodePools and their status:
```bash
kubectl get nodepools
kubectl describe nodepool cpu-nodepool
```
2 changes: 1 addition & 1 deletion k8s-training/environment.sh
@@ -70,7 +70,7 @@ fi
# Ensure service account is member of editors group
NEBIUS_GROUP_EDITORS_ID=$(nebius iam group get-by-name \
--parent-id "${NEBIUS_TENANT_ID}" \
-  --name 'editors' \
+  --name 'admins' \
--format json \
| jq -r '.metadata.id')
IS_MEMBER=$(nebius iam group-membership list-members \
1 change: 1 addition & 0 deletions k8s-training/helm.tf
@@ -1,4 +1,5 @@
module "network-operator" {
count = var.enable_karpenter ? 0 : 1
depends_on = [
nebius_mk8s_v1_node_group.cpu-only,
nebius_mk8s_v1_node_group.gpu,
62 changes: 62 additions & 0 deletions k8s-training/karpenter.tf
@@ -0,0 +1,62 @@
module "karpenter" {
count = var.enable_karpenter ? 1 : 0
source = "../modules/karpenter"

parent_id = var.parent_id
tenant_id = var.tenant_id
cluster_id = nebius_mk8s_v1_cluster.k8s-cluster.id
cluster_name = var.cluster_name
subnet_id = var.subnet_id

k8s_version = var.k8s_version
karpenter_version = var.karpenter_version
create_default_nodepools = var.karpenter_create_nodepools

# Image families for NodeClasses (override if default naming doesn't work)
cpu_nodeclass_image_family = var.karpenter_cpu_image_family
gpu_nodeclass_image_family = var.karpenter_gpu_image_family

depends_on = [
nebius_mk8s_v1_node_group.cpu-only,
]
}

# Karpenter requires special service account for the system cpu nodegroup
#
resource "nebius_iam_v1_service_account" "karpenter_system_sa" {
count = var.enable_karpenter ? 1 : 0
name = "${var.cluster_name}-karpenter-manager"
parent_id = var.parent_id
description = "Service account for the karpenter system nodegroup"
}

# We create a group because access permits can only be granted to groups
resource "nebius_iam_v1_group" "karpenter_manager" {
count = var.enable_karpenter ? 1 : 0
name = "${var.cluster_name}-karpenter-manager"
parent_id = var.tenant_id
}

# Grant project admin access to the project for the karpenter-manager group
resource "nebius_iam_v1_access_permit" "karpenter_manager_project_admin" {
count = var.enable_karpenter ? 1 : 0
parent_id = nebius_iam_v1_group.karpenter_manager[count.index].id
resource_id = var.parent_id
role = "admin"
}

# Grant project viewer access to the tenant for the karpenter-manager group
resource "nebius_iam_v1_access_permit" "karpenter_manager_tenant_viewer" {
count = var.enable_karpenter ? 1 : 0
parent_id = nebius_iam_v1_group.karpenter_manager[count.index].id
resource_id = var.tenant_id
role = "viewer"
}


# Add service account to the group
resource "nebius_iam_v1_group_membership" "karpenter_manager_membership" {
count = var.enable_karpenter ? 1 : 0
parent_id = nebius_iam_v1_group.karpenter_manager[count.index].id
member_id = nebius_iam_v1_service_account.karpenter_system_sa[count.index].id
}
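
The module above references several Karpenter input variables that are not shown in this diff. A minimal sketch of the matching declarations — names taken from the diff, types and defaults assumed — would look like:

```hcl
# Assumed declarations for the Karpenter inputs referenced above
# (defaults are illustrative; see variables.tf in the repo for the real ones).
variable "enable_karpenter" {
  description = "Enable Karpenter for automatic node provisioning"
  type        = bool
  default     = false
}

variable "karpenter_create_nodepools" {
  description = "Create the default CPU and GPU NodePools"
  type        = bool
  default     = true
}

variable "karpenter_version" {
  description = "Version of the Karpenter Helm chart to install"
  type        = string
  default     = null
}

variable "karpenter_cpu_image_family" {
  description = "Image family for the CPU NebiusNodeClass"
  type        = string
  default     = null
}

variable "karpenter_gpu_image_family" {
  description = "Image family for the GPU NebiusNodeClass"
  type        = string
  default     = null
}
```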
31 changes: 31 additions & 0 deletions k8s-training/kube.conf
@@ -0,0 +1,31 @@
apiVersion: v1
clusters:
- cluster:
certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUM2akNDQWRLZ0F3SUJBZ0lCQURBTkJna3Foa2lHOXcwQkFRc0ZBREFWTVJNd0VRWURWUVFERXdwcmRXSmwKY201bGRHVnpNQjRYRFRJMk1ETXhOekV4TURFMU4xb1hEVE0yTURNeE5ERXhNRFkxTjFvd0ZURVRNQkVHQTFVRQpBeE1LYTNWaVpYSnVaWFJsY3pDQ0FTSXdEUVlKS29aSWh2Y05BUUVCQlFBRGdnRVBBRENDQVFvQ2dnRUJBS0NxCjhOMzdYT20rVGx1OGp1a0RZWmhuODdjcTNBQ0dOQW9CWFlSVjlKUjdyR2lTR3R2QWQ4Nnc4SmNFWFZOSFErV3AKdCsyNmNCTDhVV25iclFyL05INytuK0NOT3d5S1lMcm1tbXRuS0Y1T05NZ3ZxSzVHbmpMREptNll0c1Zrd0xNUgo2VjB5ZWpCbWw0TjBodkwrVXBTV2dra3c4b3pCRlFRSUR0OGxSRDRsZEJPbkllZ3RzL3IySHViY2FnK2tJeVRFCjF5aVJsSTdzYnJzaEtRT05MdmFOb0VQNGlESmxKdm16Y1UrS0hiR1kxdHBqWFZ0bmlLbXg4TGNlY2tJT01uUWoKTFJMNFJmd3R2RmthdTlzcnV0OHowMUVrK3dzQ3FnQTNySUZnL1ZzQitkQ1pPR2h3cHlEWUdJSVJRRFV4TS93MQpjVjBOaEptUWw0eHpXeVFsdU1jQ0F3RUFBYU5GTUVNd0RnWURWUjBQQVFIL0JBUURBZ0trTUJJR0ExVWRFd0VCCi93UUlNQVlCQWY4Q0FRQXdIUVlEVlIwT0JCWUVGRHorbUVKblUxWlVLd0d3eDJydEN1eDJlTnBrTUEwR0NTcUcKU0liM0RRRUJDd1VBQTRJQkFRQlA5di9wQm50M3F6T2wvZURCZ3hLT2lRN3FSRTJBZ2dFa0RXQkM1QnZrcnY0dgp2T3RiaFh0VmxjNlc5S1YvTUptTXgwdVlCNHYrUFhpUFhwQTc2TVJlK3E2aGd1WnJNdFpLcHlxTEhnSXZybzh4ClV2UWw5Z3U1dXRxdnQyTk1FVWM4RERlQTJOQmM4dW9GTFIyMlM2M2QwL3ZhayszV1JOQ01Wczc1eE5YNTBXREsKRC9uOFJxYnEvZ21zbmJjYUFleElXV2svOUhCNlllQ2NBME9ZK0hPWDliVGxERmdUL1F2NGppdUdSdXR5cDd4bApoeW5aTVJWcEFsK2tUMTlCT29yd2laelVWYm1UbGtPd1plZXVZVnd1bmNKQmFYcGpCT0pIMllWY1ZORXJsdS9lCnh1STZwcEhhUnFrY2t6SklteUQ4Q1R6K2NjbUljUGcvZTl2WTF4T3EKLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo=
server: https://sv.mk8scluster-e00pq5tsd20y6rebt1.mk8s.prod.man.nbhost.net:443
name: npc-mk8s-k8s-karpenter-test
contexts:
- context:
cluster: npc-mk8s-k8s-karpenter-test
user: npc-mk8s-k8s-karpenter-test
name: npc-mk8s-k8s-karpenter-test
current-context: npc-mk8s-k8s-karpenter-test
kind: Config
preferences: {}
users:
- name: npc-mk8s-k8s-karpenter-test
user:
exec:
apiVersion: client.authentication.k8s.io/v1beta1
args:
- mk8s
- v1
- cluster
- get-token
- --profile
- man-prod
- --format
- json
command: /home/elemir/.config/newbius/bin/npc
env: null
provideClusterInfo: false
7 changes: 5 additions & 2 deletions k8s-training/main.tf
@@ -8,6 +8,7 @@ resource "nebius_mk8s_v1_cluster" "k8s-cluster" {
etcd_cluster_size = var.etcd_cluster_size
subnet_id = var.subnet_id
version = var.k8s_version
karpenter = var.enable_karpenter ? {} : null
}
}

@@ -44,11 +45,11 @@ resource "nebius_iam_v1_group_membership" "k8s_node_group_sa-admin" {
parent_id = data.nebius_iam_v1_group.editors[0].id
member_id = nebius_iam_v1_service_account.k8s_node_group_sa[count.index].id
}

################
# CPU NODE GROUP
################
resource "nebius_mk8s_v1_node_group" "cpu-only" {

autoscaling = var.cpu_nodes_autoscaling.enabled ? {
min_node_count = var.cpu_nodes_autoscaling.min_size == null ? var.cpu_nodes_autoscaling.max_size : var.cpu_nodes_autoscaling.min_size
max_node_count = var.cpu_nodes_autoscaling.max_size
@@ -69,7 +70,9 @@ resource "nebius_mk8s_v1_node_group" "cpu-only" {
type = var.cpu_disk_type
}

-  service_account_id = var.enable_k8s_node_group_sa ? nebius_iam_v1_service_account.k8s_node_group_sa[0].id : null
+  service_account_id = var.enable_karpenter ? nebius_iam_v1_service_account.karpenter_system_sa[0].id : (
+    var.enable_k8s_node_group_sa ? nebius_iam_v1_service_account.k8s_node_group_sa[0].id : null
+  )

network_interfaces = [
{
9 changes: 9 additions & 0 deletions k8s-training/provider.tf
@@ -32,6 +32,15 @@ provider "helm" {
cluster_ca_certificate = nebius_mk8s_v1_cluster.k8s-cluster.status.control_plane.auth.cluster_ca_certificate
token = var.iam_token
}

# Registry for Karpenter Helm chart (Nebius Container Registry)
registries = [
{
url = "oci://cr.eu-north1.nebius.cloud/e00w67thrrz5nhprjm"
username = "iam"
password = var.iam_token
}
]
}

provider "kubernetes" {
9 changes: 7 additions & 2 deletions k8s-training/terraform.tfvars
@@ -1,5 +1,5 @@
# Mk8s cluster name. By default it is "k8s-training"
-cluster_name = "k8s-training"
+cluster_name = "k8s-karpenter-test"

# SSH config
ssh_user_name = "ubuntu" # Username you want to use to connect to the nodes
@@ -29,7 +29,7 @@ gpu_nodes_autoscaling = {
min_size = null
max_size = 1
}
-gpu_node_groups = 1 # In case you need more then 100 nodes in cluster you have to put multiple node groups
+gpu_node_groups = 0 # In case you need more then 100 nodes in cluster you have to put multiple node groups
# CPU platform and presets: https://docs.nebius.com/compute/virtual-machines/types#cpu-configurations
cpu_nodes_platform = "cpu-d3" # CPU nodes platform
cpu_nodes_preset = "4vcpu-16gb" # CPU nodes preset
@@ -99,3 +99,8 @@ kuberay_max_gpu_replicas = 8
# Enable to deploy KubeRay Operator with RayService CR
enable_kuberay_service = false

# Karpenter - Automatic Node Provisioning
# When enabled, Karpenter will dynamically provision nodes based on workload demands.
# Note: For InfiniBand GPU workloads, use static node groups (set gpu_nodes_count_per_group > 0)
enable_karpenter = true # Enable Karpenter for automatic node scaling
karpenter_create_nodepools = true # Create default CPU and GPU NodePools