1 file changed: +10 -6 lines changed
src/nemo_run/core/execution: 1 file changed, +10 -6 lines changed
Original file line number | Diff line number | Diff line change
1010import requests
1111from invoke .context import Context
1212
13- from nemo_run .core .execution .base import (
14- Executor ,
15- ExecutorMacros ,
16- )
13+ from nemo_run .core .execution .base import Executor , ExecutorMacros
1714from nemo_run .core .packaging .base import Packager
1815from nemo_run .core .packaging .git import GitArchivePackager
1916
@@ -54,7 +51,8 @@ class DGXCloudExecutor(Executor):
5451 project_name : str
5552 container_image : str
5653 nodes : int = 1
57- gpus_per_node : int = 8
54+ gpus_per_node : int = 0
55+ nprocs_per_node : int = 1
5856 pvcs : list [dict [str , Any ]] = field (default_factory = list )
5957 distributed_framework : str = "PyTorch"
6058 custom_spec : dict [str , Any ] = field (default_factory = dict )
@@ -160,7 +158,13 @@ def nnodes(self) -> int:
160158 return self .nodes
161159
162160 def nproc_per_node (self ) -> int :
163- return self .gpus_per_node
161+ # Default to the number of GPUs specified per node
162+ # If user doesn't want GPUs, can run multiple processes with CPU only
163+ if self .gpus_per_node :
164+ return self .gpus_per_node
165+ elif self .nprocs_per_node :
166+ return self .nprocs_per_node
167+ return 1
164168
165169 def status (self , job_id : str ) -> Optional [DGXCloudState ]:
166170 url = f"{ self .base_url } /workloads/distributed/{ job_id } "
You can’t perform that action at this time.
0 commit comments