We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 5e344ad, commit 59364f4 (Copy full SHA for 59364f4)
manifests/base/runtimes/deepspeed_distributed.yaml
@@ -8,9 +8,7 @@ spec:
8
mlPolicy:
9
numNodes: 1
10
mpi:
11
- # TODO (andreyvelich): Change num proc to 1 and remove container resources after we
12
- # allow to override it via TrainJob APIs.
13
- numProcPerNode: 4
+ numProcPerNode: 1
14
mpiImplementation: OpenMPI
15
sshAuthMountPath: /home/mpiuser/.ssh
16
runLauncherAsNode: true
0 commit comments