2 files changed, +13 -3 lines changed
from torch.nn.parallel import DistributedDataParallel as DDP

+def verify_min_gpu_count(min_gpus: int = 2) -> bool:
+    """ verification that we have at least 2 gpus to run dist examples """
+    has_gpu = torch.accelerator.is_available()
+    gpu_count = torch.accelerator.device_count()
+    return has_gpu and gpu_count >= min_gpus
+

class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
@@ -88,4 +94,8 @@ def main():
    dist.destroy_process_group()

if __name__ == "__main__":
+    _min_gpu_count = 2
+    if not verify_min_gpu_count(min_gpus=_min_gpu_count):
+        print(f"Unable to locate sufficient {_min_gpu_count} gpus to run this example. Exiting.")
+        sys.exit()
    main()
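The new guard assumes `sys` is already imported in the example and relies on the `torch.accelerator` device-agnostic API, which only exists in recent PyTorch releases. Below is a minimal standalone sketch of the same check, with a hypothetical fallback to `torch.cuda` for older builds; the message and exit code are illustrative, not part of the PR.

import sys

import torch


def verify_min_gpu_count(min_gpus: int = 2) -> bool:
    """Return True when at least `min_gpus` accelerators are visible to this process."""
    if hasattr(torch, "accelerator"):
        # Device-agnostic path on newer PyTorch builds
        return torch.accelerator.is_available() and torch.accelerator.device_count() >= min_gpus
    # Older builds: fall back to the CUDA-specific query
    return torch.cuda.is_available() and torch.cuda.device_count() >= min_gpus


if __name__ == "__main__":
    if not verify_min_gpu_count(min_gpus=2):
        print("Fewer than 2 GPUs visible; this distributed example needs one device per rank.")
        sys.exit(1)
    print("GPU check passed.")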
The second changed file is the launcher script, run_example.sh:
# /bin/bash
# bash run_example.sh {file_to_run.py} {num_gpus}
# where file_to_run = example to run. Default = 'example.py'
-# num_gpus = num local gpus to use (must be at least 2). Default = 4
+# num_gpus = num local gpus to use (must be at least 2). Default = 2

# samples to run include:
# example.py

-echo "Launching ${1:-example.py} with ${2:-4} gpus"
-torchrun --nnodes=1 --nproc_per_node=${2:-4} --rdzv_id=101 --rdzv_endpoint="localhost:5972" ${1:-example.py}
+echo "Launching ${1:-example.py} with ${2:-2} gpus"
+torchrun --nnodes=1 --nproc_per_node=${2:-2} --rdzv_id=101 --rdzv_endpoint="localhost:5972" ${1:-example.py}
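The launcher drives torchrun with a rendezvous on localhost:5972, so `bash run_example.sh example.py 2` now starts two local processes by default. Each worker receives its coordinates through environment variables (RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT), which the Python side reads when it initializes the process group. A rough sketch of that consumer side follows, assuming a CUDA/NCCL setup; the function name and backend choice are illustrative, not taken from the example.

import os

import torch
import torch.distributed as dist


def setup() -> int:
    # torchrun exports LOCAL_RANK for each of the --nproc_per_node workers it spawns
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)  # pin this worker to one GPU
    # init_method="env://" pulls rank, world size and master address from the environment
    dist.init_process_group(backend="nccl", init_method="env://")
    return local_rank


if __name__ == "__main__":
    local_rank = setup()
    print(f"rank {dist.get_rank()} of {dist.get_world_size()} on cuda:{local_rank}")
    dist.destroy_process_group()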