
Commit 7ed1c7e

Authored by Abo7atm, alialhawas, and vfdev-5

Update docs with torchrun instead of torch.distributed.launch #2415 (#2420)

* Make Checkpoint.load_objects to accept str and load internally (#2303)
* modify error message
* Add test for Checkpoint.load_objects
* fix test messages to match function error message
* Update docs with torchrun -- launcher module (#2415)
* Update docs with torchrun -- utils module (#2415)
* change codeblock to bash
* Update docs with torchrun -- check_idist_parallel (#2415)
* Update docs with torchrun -- check_idist_parallel (#2415)
* Update launcher.py
* Update check_idist_parallel.py
* Updates according to the review

Co-authored-by: alialhawas <[email protected]>
Co-authored-by: vfdev <[email protected]>

1 parent 0666b40 commit 7ed1c7e

File tree (4 files changed, +29 -26 lines):

  ignite/distributed/launcher.py
  ignite/distributed/utils.py
  tests/ignite/distributed/check_idist_parallel.py
  tests/ignite/distributed/test_launcher.py

ignite/distributed/launcher.py

Lines changed: 16 additions & 9 deletions
@@ -23,7 +23,7 @@ class Parallel:
         provided ``backend`` (useful for standalone scripts).
 
     2) Only initialize a processing group given the ``backend``
-       (useful with tools like `torch.distributed.launch`_, `horovodrun`_, etc).
+       (useful with tools like `torchrun`_, `horovodrun`_, etc).
 
     Args:
         backend: backend to use: `nccl`, `gloo`, `xla-tpu`, `horovod`. If None, no distributed
@@ -50,14 +50,14 @@ class Parallel:
         spawn_kwargs: kwargs to ``idist.spawn`` function.
 
     Examples:
-        1) Single node or Multi-node, Multi-GPU training launched with `torch.distributed.launch`_ or `horovodrun`_
+        1) Single node or Multi-node, Multi-GPU training launched with `torchrun` or `horovodrun`_
         tools
 
         Single node option with 4 GPUs
 
         .. code-block:: bash
 
-            python -m torch.distributed.launch --nproc_per_node=4 --use_env main.py
+            torchrun --nproc_per_node=4 main.py
             # or if installed horovod
             horovodrun -np=4 python main.py
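
A side note on why `--use_env` disappears from the new commands: `torchrun` always exports the worker's rank through environment variables instead of a `--local_rank` argument. A minimal sketch, not part of this commit, of what each worker launched by `torchrun --nproc_per_node=4 main.py` can read:

```python
# sketch, not from the repository: inspect the variables torchrun exports per worker
import os


def describe_worker() -> str:
    rank = int(os.environ.get("RANK", "0"))              # global rank across all nodes
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))  # rank on the current node
    world_size = int(os.environ.get("WORLD_SIZE", "1"))  # total number of workers
    return f"worker {rank}/{world_size} (local rank {local_rank})"


if __name__ == "__main__":
    print(describe_worker())
```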
@@ -66,15 +66,15 @@ class Parallel:
         .. code-block:: bash
 
             ## node 0
-            python -m torch.distributed.launch --nnodes=2 --node_rank=0 --master_addr=master \
-                --master_port=3344 --nproc_per_node=8 --use_env main.py
+            torchrun --nnodes=2 --node_rank=0 --master_addr=master --master_port=3344 \
+                --nproc_per_node=8 main.py
 
             # or if installed horovod
             horovodrun -np 16 -H hostname1:8,hostname2:8 python main.py
 
             ## node 1
-            python -m torch.distributed.launch --nnodes=2 --node_rank=1 --master_addr=master \
-                --master_port=3344 --nproc_per_node=8 --use_env main.py
+            torchrun --nnodes=2 --node_rank=1 --master_addr=master --master_port=3344 \
+                --nproc_per_node=8 main.py
 
 
         User code is the same for both options:
@@ -92,6 +92,8 @@ def training(local_rank, config, **kwargs):
 
             backend = "nccl"  # or "horovod" if package is installed
 
+            config = {"key": "value"}
+
             with idist.Parallel(backend=backend) as parallel:
                 parallel.run(training, config, a=1, b=2)
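
The hunk above shows only fragments of the docstring example. A self-contained sketch of such a `main.py`, with an illustrative training body (the model and data handling are placeholders, not taken from the commit):

```python
# main.py -- illustrative sketch; launch with: torchrun --nproc_per_node=4 main.py
import ignite.distributed as idist


def training(local_rank, config, **kwargs):
    # Each worker receives its local rank; idist exposes the global view.
    print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend())
    device = idist.device()  # e.g. cuda:0 .. cuda:3 on a 4-GPU node
    # ... build model, data and trainer here ...


if __name__ == "__main__":
    backend = "nccl"  # or "horovod" if the package is installed
    config = {"key": "value"}

    with idist.Parallel(backend=backend) as parallel:
        parallel.run(training, config, a=1, b=2)
```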
@@ -152,6 +154,8 @@ def training(local_rank, config, **kwargs):
             print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend())
             # ...
 
+        config = {"key": "value"}
+
         with idist.Parallel(backend="xla-tpu", nproc_per_node=8) as parallel:
             parallel.run(training, config, a=1, b=2)
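
Passing `nproc_per_node` makes `Parallel` spawn the worker processes itself, so this variant is started with plain `python main.py` and no launcher. A hedged sketch of the same pattern, with `gloo` substituted for `xla-tpu` so it runs without TPU hardware:

```python
# spawn_example.py -- illustrative; "gloo" stands in for the "xla-tpu" backend of the docstring
import ignite.distributed as idist


def training(local_rank, config, **kwargs):
    print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend())


if __name__ == "__main__":
    config = {"key": "value"}
    # Parallel spawns 2 workers itself; launch with: python spawn_example.py
    with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
        parallel.run(training, config, a=1, b=2)
```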
@@ -188,12 +192,13 @@ def training(local_rank, config, **kwargs):
                 "master_port": 15000
             }
 
+        config = {"key": "value"}
+
         with idist.Parallel(backend="nccl", **dist_config) as parallel:
             parallel.run(training, config, a=1, b=2)
 
 
-
-    .. _torch.distributed.launch: https://pytorch.org/docs/stable/distributed.html#launch-utility
+    .. _torchrun: https://pytorch.org/docs/stable/elastic/run.html#launcher-api
     .. _horovodrun: https://horovod.readthedocs.io/en/latest/api.html#module-horovod.run
     .. _dist.init_process_group: https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group
     .. versionchanged:: 0.4.2
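
Only the tail of `dist_config` is visible in this hunk. A sketch of the full dictionary it plausibly belongs to, using the multi-node spawn options `idist.Parallel` accepts; the exact keys and values here are assumptions, not read from the diff:

```python
# assumed shape of the multi-node spawn configuration; values are illustrative
dist_config = {
    "nproc_per_node": 8,     # workers spawned on this node
    "nnodes": 2,             # total number of nodes
    "node_rank": 0,          # 0 on the master node, 1 on the other node
    "master_addr": "master",
    "master_port": 15000,
}
```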
@@ -294,6 +299,8 @@ def training(local_rank, config, **kwargs):
             print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend())
             # ...
 
+        config = {"key": "value"}
+
         with idist.Parallel(backend=backend) as parallel:
             parallel.run(training, config, a=1, b=2)

ignite/distributed/utils.py

Lines changed: 2 additions & 3 deletions
@@ -480,12 +480,11 @@ def initialize(backend: str, **kwargs: Any) -> None:
         - | "horovod" : comm(=None), more info: `hvd_init`_.
 
     Examples:
-        Launch single node multi-GPU training with ``torch.distributed.launch`` utility.
+        Launch single node multi-GPU training with ``torchrun`` utility.
 
         .. code-block:: python
 
-            # >>> python -m torch.distributed.launch --nproc_per_node=4 main.py
-
+            # >>> torchrun --nproc_per_node=4 main.py
             # main.py
 
             import ignite.distributed as idist
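
For reference, the `idist.initialize` pattern this docstring describes, written out as a runnable sketch; launch it with `torchrun --nproc_per_node=4 main.py`. The training body is illustrative rather than the exact code from the file:

```python
# main.py -- hedged sketch of the initialize()/finalize() usage described above
import torch.distributed as dist

import ignite.distributed as idist


def train_fn(local_rank):
    # initialize() has already created the default process group at this point
    assert dist.is_available() and dist.is_initialized()
    print(f"rank {idist.get_rank()}/{idist.get_world_size()} on {idist.device()}")


if __name__ == "__main__":
    idist.initialize("gloo")  # use "nccl" on a multi-GPU machine
    train_fn(idist.get_local_rank())
    idist.finalize()
```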

tests/ignite/distributed/check_idist_parallel.py

Lines changed: 9 additions & 8 deletions
@@ -38,19 +38,20 @@ def training(local_rank, config, **kwargs):
        python tests/ignite/distributed/check_idist_parallel.py
    ```
 
-    - Launch 4 procs using gloo backend with `torch.distributed.launch`
+    - Launch 4 procs using gloo backend with `torchrun`:
+
    ```
-    python -m torch.distributed.launch --nproc_per_node=4 --use_env \
-        tests/ignite/distributed/check_idist_parallel.py --backend=gloo
+    torchrun --nproc_per_node=4 tests/ignite/distributed/check_idist_parallel.py --backend=gloo
    ```
 
-    - Launch 2 procs in 2 nodes using gloo backend with `torch.distributed.launch`:
+    - Launch 2 procs in 2 nodes using gloo backend with `torchrun` or `torch.distributed.launch`:
+
    ```
-    bash -c "python -m torch.distributed.launch --nnodes=2 --node_rank=0 \
-        --master_addr=localhost --master_port=3344 --nproc_per_node=2 --use_env \
+    bash -c "torchrun --nnodes=2 --node_rank=0 \
+        --master_addr=localhost --master_port=3344 --nproc_per_node=2 \
        tests/ignite/distributed/check_idist_parallel.py --backend=gloo &" \
-    && bash -c "python -m torch.distributed.launch --nnodes=2 --node_rank=1 \
-        --master_addr=localhost --master_port=3344 --nproc_per_node=2 --use_env \
+    && bash -c "torchrun --nnodes=2 --node_rank=1 \
+        --master_addr=localhost --master_port=3344 --nproc_per_node=2 \
        tests/ignite/distributed/check_idist_parallel.py --backend=gloo &"
    ```
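
The diff only touches this script's usage docstring; its body is not shown. For orientation, a hedged sketch of how such a check script is typically structured; apart from `--backend`, the argument names and assertions here are assumptions:

```python
# check_parallel_sketch.py -- illustrative skeleton, not the real test script
import argparse

import ignite.distributed as idist


def training(local_rank, config, **kwargs):
    # every worker should agree on the world size and know its own rank
    assert idist.get_world_size() >= 1
    print(f"backend={idist.backend()} rank={idist.get_rank()} local_rank={local_rank}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser("Check idist.Parallel")
    parser.add_argument("--backend", type=str, default=None)
    parser.add_argument("--nproc_per_node", type=int, default=None)
    args = parser.parse_args()

    spawn_kwargs = {"nproc_per_node": args.nproc_per_node} if args.nproc_per_node else {}
    with idist.Parallel(backend=args.backend, **spawn_kwargs) as parallel:
        parallel.run(training, {"key": "value"})
```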

tests/ignite/distributed/test_launcher.py

Lines changed: 2 additions & 6 deletions
@@ -63,15 +63,11 @@ def test_check_idist_parallel_no_dist(exec_filepath):
 
 
 def _test_check_idist_parallel_torch_launch(init_method, fp, backend, nprocs):
-    # python -m torch.distributed.launch --nproc_per_node=nprocs --use_env \
-    #   tests/ignite/distributed/check_idist_parallel.py --backend=backend
+    # torchrun --nproc_per_node=nprocs tests/ignite/distributed/check_idist_parallel.py --backend=backend
 
     cmd = [
-        sys.executable,
-        "-m",
-        "torch.distributed.launch",
+        "torchrun",
         f"--nproc_per_node={nprocs}",
-        "--use_env",
         fp,
         f"--backend={backend}",
     ]
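
The hunk shows only how the `cmd` list is built; the helper that executes it lives outside this diff. One plausible way a test could run such a command, sketched as an assumption rather than the repository's actual helper:

```python
# hedged sketch: executing the torchrun command assembled above from a test
import os
import subprocess


def run_cmd(cmd, timeout=120):
    env = dict(os.environ)  # torchrun must be resolvable on PATH in the test environment
    completed = subprocess.run(cmd, env=env, capture_output=True, text=True, timeout=timeout)
    assert completed.returncode == 0, completed.stderr
    return completed.stdout
```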
