Skip to content

Commit 834264e

Browse files
committed
feat(api): BREAKING CHANGE: Remove numProcPerNode API from the Torch MLPolicy
Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
1 parent 30480ac commit 834264e

18 files changed

+2382
-53
lines changed

api/openapi-spec/swagger.json

Lines changed: 48 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/kubeflow-trainer/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml

Lines changed: 505 additions & 0 deletions
Large diffs are not rendered by default.

charts/kubeflow-trainer/crds/trainer.kubeflow.org_trainingruntimes.yaml

Lines changed: 505 additions & 0 deletions
Large diffs are not rendered by default.

charts/kubeflow-trainer/crds/trainer.kubeflow.org_trainjobs.yaml

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4843,14 +4843,10 @@ spec:
                 format: int32
                 type: integer
               numProcPerNode:
-                anyOf:
-                - type: integer
-                - type: string
-                description: |-
-                  numProcPerNode is the number of processes/workers/slots on every training node.
-                  For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
-                  For the MPI runtime only int value can be set.
-                x-kubernetes-int-or-string: true
+                description: numProcPerNode is the number of processes/workers/slots
+                  on every training node.
+                format: int32
+                type: integer
               resourcesPerNode:
                 description: resourcesPerNode defines the compute resources for
                   each training node.

docs/proposals/2170-kubeflow-trainer-v2/README.md

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -558,9 +558,9 @@ type Trainer struct {
 	ResourcesPerNode *corev1.ResourceRequirements `json:"resourcesPerNode,omitempty"`
 
 	// Number of processes/workers/slots on every training node.
-	// For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
 	// For the MPI runtime only int value can be set.
-	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
+	// For the Torch runtime the value defaults to `auto` and can be overridden with an int.
+	NumProcPerNode *int32 `json:"numProcPerNode,omitempty"`
 }
 ```

@@ -1095,8 +1095,7 @@ metadata:
 spec:
   mlPolicy:
     numNodes: 2
-    torch:
-      numProcPerNode: 5
+    torch: {}
   podGroupPolicy:
     coscheduling:
       scheduleTimeoutSeconds: 100
@@ -1166,12 +1165,6 @@ we won't support them in `TorchMLPolicySource`. We can introduce them in the future
 
 ```golang
 type TorchMLPolicySource struct {
-	// Number of processes per node.
-	// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
-	// Supported values: `auto`, `cpu`, `gpu`, or int value.
-	// Defaults to `auto`.
-	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
-
 	// Elastic policy for the PyTorch training.
 	ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"`
 }
@@ -1251,8 +1244,7 @@ metadata:
 spec:
   mlPolicy:
     numNodes: 2
-    torch:
-      numProcPerNode: 5
+    torch: {}
   template:
     spec:
       replicatedJobs:
@@ -1378,9 +1370,7 @@ kind: ClusterTrainingRuntime
 metadata:
   name: torch-distributed-single-worker
 spec:
-  mlPolicy:
-    torch:
-      numProcPerNode: 5
+  mlPolicy: {}
   template:
     spec:
       replicatedJobs:
@@ -1840,6 +1830,7 @@ spec:
 - 2024-07-16 Creation date
 - 2025-03-15 Updated the initializer APIs
 - 2025-10-09 Added PodTemplateOverrides to TrainJob V2 API
+- 2026-02-23 Removed `numProcPerNode` from `TorchMLPolicySource`; `TrainJob.Trainer.numProcPerNode` now accepts only int values (torch defaults to `auto`)

18441835
## Alternatives
18451836

0 commit comments

Comments
 (0)