Skip to content

Commit 834264e

Browse files
committed
feat(api): BREAKING CHANGE: Remove numProcPerNode API from the Torch MLPolicy
Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
1 parent 30480ac commit 834264e

18 files changed

+2382
-53
lines changed

api/openapi-spec/swagger.json

Lines changed: 48 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/kubeflow-trainer/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml

Lines changed: 505 additions & 0 deletions
Large diffs are not rendered by default.

charts/kubeflow-trainer/crds/trainer.kubeflow.org_trainingruntimes.yaml

Lines changed: 505 additions & 0 deletions
Large diffs are not rendered by default.

charts/kubeflow-trainer/crds/trainer.kubeflow.org_trainjobs.yaml

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4843,14 +4843,10 @@ spec:
                 format: int32
                 type: integer
               numProcPerNode:
-                anyOf:
-                - type: integer
-                - type: string
-                description: |-
-                  numProcPerNode is the number of processes/workers/slots on every training node.
-                  For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
-                  For the MPI runtime only int value can be set.
-                x-kubernetes-int-or-string: true
+                description: numProcPerNode is the number of processes/workers/slots
+                  on every training node.
+                format: int32
+                type: integer
               resourcesPerNode:
                 description: resourcesPerNode defines the compute resources for
                   each training node.

docs/proposals/2170-kubeflow-trainer-v2/README.md

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -558,9 +558,9 @@ type Trainer struct {
 	ResourcesPerNode *corev1.ResourceRequirements `json:"resourcesPerNode,omitempty"`
 
 	// Number of processes/workers/slots on every training node.
-	// For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
 	// For the MPI runtime only int value can be set.
-	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
+	// For the Torch runtime the value defaults to `auto` and can be overridden with an int.
+	NumProcPerNode *int32 `json:"numProcPerNode,omitempty"`
 }
 ```

@@ -1095,8 +1095,7 @@ metadata:
 spec:
   mlPolicy:
     numNodes: 2
-    torch:
-      numProcPerNode: 5
+    torch: {}
   podGroupPolicy:
     coscheduling:
       scheduleTimeoutSeconds: 100
@@ -1166,12 +1165,6 @@ we won't support them in `TorchMLPolicySource`. We can introduce them in the future
 
 ```golang
 type TorchMLPolicySource struct {
-	// Number of processes per node.
-	// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
-	// Supported values: `auto`, `cpu`, `gpu`, or int value.
-	// Defaults to `auto`.
-	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
-
 	// Elastic policy for the PyTorch training.
 	ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"`
 }
@@ -1251,8 +1244,7 @@ metadata:
 spec:
   mlPolicy:
     numNodes: 2
-    torch:
-      numProcPerNode: 5
+    torch: {}
   template:
     spec:
       replicatedJobs:
@@ -1378,9 +1370,7 @@ kind: ClusterTrainingRuntime
 metadata:
   name: torch-distributed-single-worker
 spec:
-  mlPolicy:
-    torch:
-      numProcPerNode: 5
+  mlPolicy: {}
   template:
     spec:
       replicatedJobs:
@@ -1840,6 +1830,7 @@ spec:
 - 2024-07-16 Creation date
 - 2025-03-15 Updated the initializer APIs
 - 2025-10-09 Added PodTemplateOverrides to TrainJob V2 API
+- 2026-02-23 Removed `numProcPerNode` from `TorchMLPolicySource`; `TrainJob.Trainer.numProcPerNode` now accepts only int values (torch defaults to `auto`)

18441835
## Alternatives
18451836

0 commit comments

Comments
 (0)