HorizonRobotics · sqchai · Jul 22, 2024 · Aug 9, 2024
diff --git a/README.md b/README.md
@@ -117,6 +117,26 @@ frames using the following command:
 python -m alf.bin.play --root_dir=LOG_DIR
 ```
 
+To launch single-node multi-gpu training, set the 'multi-gpu' argument
+```bash
+python -m alf.bin.train --conf=CONF_FILE --root_dir=LOG_DIR --distributed multi-gpu
+```
+
+To launch multi-node multi-gpu training, we use torch distirbuted launch module. The 'local_rank' for each process can be obtained from 'PerProcessContext' class, which can be used to assign gpu for your environment if you wish. To start training, run the following command on the host machine:
+```bash
+python -m torch.distributed.launch \
+    --nproc_per_node=NGPU_ON_NODE \
+    --nnodes=NUMBER_OF_NODES \
+    --node_rank=NODE_RANK \
+    --master_addr=HOST_IP \
+    --master_port=12345 \
+    ./alf/bin/train.py \
+    --conf=CONF_FILE \
+    --root_dir=LOG_DIR \
+    --distributed multi-node-multi-gpu \
+```
+and simultaneously run the same command on each worker machine. For each worker machine, assign a NODE_RANK to it and update NGPU_ON_NODE if the number of GPUs is different from the host. Please make sure that all machines get a same copy of the codebase.
+
 #### **Deprecated**
 
 An older version of ALF used [gin](https://github.com/google/gin-config)
@@ -247,4 +267,4 @@ If you use ALF for research and find it useful, please consider citing:
 
 ## Contribute to ALF
 
-You are welcome to contribute to ALF. Please follow the guideline [here](https://alf.readthedocs.io/en/latest/contributing.html).
+You are welcome to contribute to ALF. Please follow the guideline [here](https://alf.readthedocs.io/en/latest/contributing.html).
diff --git a/alf/bin/train.py b/alf/bin/train.py
@@ -80,9 +80,10 @@ def _define_flags():
     flags.DEFINE_bool('store_snapshot', True,
                       'Whether store an ALF snapshot before training')
     flags.DEFINE_enum(
-        'distributed', 'none', ['none', 'multi-gpu'],
+        'distributed', 'none', ['none', 'multi-gpu', 'multi-node-multi-gpu'],
         'Set whether and how to run trainning in distributed mode.')
     flags.mark_flag_as_required('root_dir')
+    flags.DEFINE_integer('local-rank', None, 'Local rank passed from distributed launcher')
 
 
 FLAGS = flags.FLAGS
@@ -98,7 +99,6 @@ def _setup_logging(rank: int, log_dir: str):
     FLAGS.alsologtostderr = True
     logging.set_verbosity(logging.INFO)
     logging.get_absl_handler().use_absl_log_file(log_dir=log_dir)
-    logging.use_absl_handler()
 
 
 def _setup_device(rank: int = 0):
@@ -116,12 +116,13 @@ def _setup_device(rank: int = 0):
         torch.cuda.set_device(rank)
 
 
-def _train(root_dir, rank=0, world_size=1):
+def _train(root_dir, local_rank=-1, rank=0, world_size=1):
     """Launch the trainer after the conf file has been parsed. This function
     could be called by grid search after the config has been modified.
 
     Args:
         root_dir (str): Path to the directory for writing logs/summaries/checkpoints.
+        local_rank (int): The ID of the process within current node
         rank (int): The ID of the process among all of the DDP processes. For
             non-distributed training, this id should be 0.
         world_size (int): The number of processes in total. If set to 1, it is
@@ -133,6 +134,8 @@ def _train(root_dir, rank=0, world_size=1):
 
     if trainer_conf.ml_type == 'rl':
         ddp_rank = rank if world_size > 1 else -1
+        if ddp_rank > -1 and local_rank > -1:
+            ddp_rank = local_rank
         trainer = policy_trainer.RLTrainer(trainer_conf, ddp_rank)
     elif trainer_conf.ml_type == 'sl':
         # NOTE: SLTrainer does not support distributed training yet
@@ -176,13 +179,12 @@ def training_worker(rank: int,
             # Specialization for distributed mode
             dist.init_process_group('nccl', rank=rank, world_size=world_size)
             # Recover the flags when spawned as a sub process
-            if rank > 0:
-                _define_flags()
-                FLAGS(sys.argv, known_only=True)
-                FLAGS.mark_as_parsed()
+            _define_flags()
+            FLAGS(sys.argv, known_only=True)
+            FLAGS.mark_as_parsed()
             # Set the rank and total number of processes for distributed training.
             PerProcessContext().set_distributed(
-                rank=rank, num_processes=world_size)
+                rank=rank, local_rank=-1, num_processes=world_size)
             assert paras_queue is not None
             PerProcessContext().set_paras_queue(paras_queue)
 
@@ -191,7 +193,65 @@ def training_worker(rank: int,
 
         # Parse the configuration file, which will also implicitly bring up the environments.
         common.parse_conf_file(conf_file)
-        _train(root_dir, rank, world_size)
+        _train(root_dir=root_dir, rank=rank, world_size=world_size)
+    except KeyboardInterrupt:
+        pass
+    except Exception as e:
+        if world_size >= 1:
+            # If the training worker is running as a process in multiprocessing
+            # environment, this will make sure that the exception raised in this
+            # particular process is captured and shown.
+            logging.exception(f'{mp.current_process().name} - {e}')
+        raise e
+    finally:
+        # Note that each training worker will have its own child processes
+        # running the environments. In the case when training worker process
+        # finishes ealier (e.g. when it raises an exception), it will hang
+        # instead of quitting unless all child processes are killed.
+        alf.close_env()
+
+
+
+def training_worker_multi_node(local_rank: int, 
+                               rank: int,
+                               world_size: int,
+                               conf_file: str,
+                               root_dir: str,
+                               paras_queue: mp.Queue = None):
+    """An executable instance that trains and evaluate the algorithm
+
+    Args:
+        local_rank (int): The ID of the process within current node.
+        rank (int): The ID of the process among all of the DDP processes.
+        world_size (int): The number of processes in total. If set to 1, it is
+            interpreted as "non distributed mode".
+        conf_file (str): Path to the training configuration.
+        root_dir (str): Path to the directory for writing logs/summaries/checkpoints.
+        paras_queue: a shared Queue for checking the consistency of model parameters
+            in different worker processes, if multi-gpu training is used.
+    """
+    try:
+        _setup_logging(log_dir=root_dir, rank=rank)
+        _setup_device(local_rank)
+        if world_size > 1:
+            # Specialization for distributed mode
+            dist.init_process_group('nccl', rank=rank, world_size=world_size)
+            # Recover the flags when spawned as a sub process
+            # _define_flags()
+            FLAGS(sys.argv, known_only=True)
+            FLAGS.mark_as_parsed()
+            # Set the rank and total number of processes for distributed training.
+            PerProcessContext().set_distributed(
+                rank=rank, local_rank=local_rank, num_processes=world_size)
+            assert paras_queue is not None
+            PerProcessContext().set_paras_queue(paras_queue)
+
+        # Make PerProcessContext read-only.
+        PerProcessContext().finalize()
+
+        # Parse the configuration file, which will also implicitly bring up the environments.
+        common.parse_conf_file(conf_file)
+        _train(root_dir=root_dir, local_rank=local_rank, rank=rank, world_size=world_size)
     except KeyboardInterrupt:
         pass
     except Exception as e:
@@ -215,7 +275,7 @@ def main(_):
 
     conf_file = common.get_conf_file()
 
-    if FLAGS.store_snapshot:
+    if FLAGS.store_snapshot and int(os.environ['RANK']) == 0:
         common.generate_alf_snapshot(common.alf_root(), conf_file, root_dir)
 
     # FLAGS.distributed is guaranteed to be one of the possible values.
@@ -263,9 +323,41 @@ def main(_):
             # again. But we raise another error so that we will have a correct
             # exit code for the program.
             raise ChildProcessError(f'Training failed on subprocess exception')
+
+    elif FLAGS.distributed == 'multi-node-multi-gpu':
+        local_rank = int(os.environ['LOCAL_RANK'])
+        rank = int(os.environ['RANK'])
+        world_size = int(os.environ['WORLD_SIZE'])
+        print("local_rank: {} | rank: {} | world_size: {}".format(local_rank, rank, world_size))
+
+        if world_size == 1:
+            logging.warn(
+                'Fallback to single GPU mode as there is only one GPU')
+            training_worker(
+                rank=0, world_size=1, conf_file=conf_file, root_dir=root_dir)
+            return
+
+        try:
+            # Create a shared queue for checking the consistency of the parameters
+            # in different work processes.
+            manager = mp.Manager()
+            paras_queue = manager.Queue()
+            training_worker_multi_node(local_rank=local_rank, 
+                                       rank=rank, 
+                                       world_size=world_size,
+                                       conf_file=conf_file,
+                                       root_dir=root_dir, 
+                                       paras_queue=paras_queue)
+        except KeyboardInterrupt:
+            pass
+        except Exception as e:
+            # ``e`` has been printed in the subprocess, so here we won't print it
+            # again. But we raise another error so that we will have a correct
+            # exit code for the program.
+            raise ChildProcessError(f'Training failed on subprocess exception')
 
 
 if __name__ == '__main__':
     __spec__ = None  # see https://github.com/HorizonRobotics/alf/pull/1554 for explanation
     _define_flags()
-    app.run(main)
+    app.run(main)
diff --git a/alf/environments/process_environment.py b/alf/environments/process_environment.py
@@ -107,6 +107,7 @@ def _worker(conn: multiprocessing.connection,
             torch_num_threads_per_env: int = 1,
             ddp_num_procs: int = 1,
             ddp_rank: int = -1,
+            local_rank: int= -1,
             name: str = ''):
     """The process waits for actions and sends back environment results.
 
@@ -142,6 +143,7 @@ def _worker(conn: multiprocessing.connection,
                 SpawnedProcessContext(
                     ddp_num_procs=ddp_num_procs,
                     ddp_rank=ddp_rank,
+                    local_rank=local_rank,
                     env_id=env_id,
                     env_ctor=env_constructor,
                     pre_configs=pre_configs))
@@ -299,13 +301,14 @@ def start(self, wait_to_start=True):
 
         ddp_num_procs = PerProcessContext().num_processes
         ddp_rank = PerProcessContext().ddp_rank
+        local_rank = PerProcessContext().local_rank
 
         self._process = mp_ctx.Process(
             target=_worker,
             args=(conn, self._env_constructor, self._start_method,
                   alf.get_handled_pre_configs(), self._env_id, self._flatten,
                   self._fast, self._num_envs, self._torch_num_threads,
-                  ddp_num_procs, ddp_rank, self._name),
+                  ddp_num_procs, ddp_rank, local_rank, self._name),
             name=f"ProcessEnvironment-{self._env_id}")
         atexit.register(self.close)
         self._process.start()
@@ -476,3 +479,4 @@ def render(self, mode='human'):
             NotImplementedError: If the environment does not support rendering.
         """
         return self.call('render', mode)()
+
diff --git a/alf/utils/per_process_context.py b/alf/utils/per_process_context.py
@@ -34,6 +34,7 @@ def __new__(cls):
             cls._instance = super(PerProcessContext, cls).__new__(cls)
             cls._instance._read_only = False
             cls._instance._ddp_rank = -1
+            cls._instance._local_rank = -1
             cls._instance._num_processes = 1
         return cls._instance
 
@@ -42,7 +43,7 @@ def finalize(self) -> None:
         """
         self._read_only = True
 
-    def set_distributed(self, rank: int, num_processes: int) -> None:
+    def set_distributed(self, rank: int, local_rank: int, num_processes: int) -> None:
         """Set the distributed properties.
 
         Args:
@@ -53,6 +54,7 @@ def set_distributed(self, rank: int, num_processes: int) -> None:
             raise AttributeError(
                 'Cannot mutate PerProcessContext after it is finalized')
         self._ddp_rank = rank
+        self._local_rank = local_rank
         self._num_processes = num_processes
 
     def set_paras_queue(self, paras_queue: mp.Queue):
@@ -77,6 +79,10 @@ def is_distributed(self):
     @property
     def ddp_rank(self):
         return self._ddp_rank
+
+    @property
+    def local_rank(self):
+        return self._local_rank
 
     @property
     def num_processes(self):

diff --git a/alf/utils/spawned_process_utils.py b/alf/utils/spawned_process_utils.py
@@ -31,6 +31,7 @@ class SpawnedProcessContext(NamedTuple):
     """
     ddp_num_procs: int
     ddp_rank: int
+    local_rank: int
     env_id: int
     env_ctor: Callable[..., AlfEnvironment]
     pre_configs: List[Tuple[str, Any]]