
Commit 5c3de46

Liyang90 authored and carmocca committed

Fix for hanging issue on TPU Pod (#16844)

Co-authored-by: Carlos Mocholí <[email protected]>

1 parent f80f2f9

File tree: 6 files changed, +23 −15 lines

src/lightning_fabric/CHANGELOG.md

Lines changed: 3 additions & 1 deletion

@@ -7,7 +7,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ## [1.9.4] - 2023-02-28
 
-No changes.
+### Fixed
+
+- Fixed DDP spawn hang on TPU Pods ([#16844](https://github.com/Lightning-AI/lightning/pull/16844))
 
 
 ## [1.9.3] - 2023-02-21

src/lightning_fabric/strategies/launchers/xla.py

Lines changed: 4 additions & 3 deletions

@@ -77,20 +77,21 @@ def launch(self, function: Callable, *args: Any, **kwargs: Any) -> Any:
 
     def _wrapping_function(
         self,
+        # XLA's multiprocessing returns the global index, not the local index as torch's multiprocessing
+        # https://github.com/pytorch/xla/blob/v1.13.0/torch_xla/distributed/xla_multiprocessing.py#L321
        process_idx: int,
         function: Callable,
         args: Any,
         kwargs: Any,
         return_queue: SimpleQueue,
         global_states: Optional[_GlobalStateSnapshot] = None,
     ) -> None:
-        self._strategy._local_rank = process_idx
         results = function(*args, **kwargs)
 
-        if process_idx == 0:
+        if self._strategy.local_rank == 0:
             return_queue.put(move_data_to_device(results, "cpu"))
 
-        _rank_teardown(process_idx)
+        _rank_teardown(self._strategy.local_rank)
 
 
 def _rank_teardown(rank: int) -> None:
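
The two added comment lines carry the heart of the fix: XLA's multiprocessing hands each worker its pod-global ordinal, so on a multi-host TPU Pod `process_idx == 0` is true for exactly one worker in the entire pod, while every host's parent process waits on its own `return_queue`. Below is a minimal, self-contained sketch of that arithmetic (plain Python, no `torch_xla`; the 2-host, 8-cores-per-host pod shape is an illustrative assumption). The same reasoning applies to the `pytorch_lightning` launcher further down.

```python
# Sketch: why gating on the global index hangs a TPU Pod while gating
# on the local rank does not. The pod shape is a hypothetical example.
NUM_HOSTS = 2
PROCS_PER_HOST = 8

for host in range(NUM_HOSTS):
    # Each host's parent process blocks on return_queue.get(), so it
    # needs exactly one of *its own* workers to put a result.
    old_check_fires = False  # old gate: process_idx == 0
    new_check_fires = False  # fixed gate: local_rank == 0
    for local_rank in range(PROCS_PER_HOST):
        # XLA's multiprocessing hands every worker its global ordinal.
        process_idx = host * PROCS_PER_HOST + local_rank
        old_check_fires |= process_idx == 0
        new_check_fires |= local_rank == 0
    print(f"host {host}: old check fires={old_check_fires}, "
          f"new check fires={new_check_fires}")

# Output:
# host 0: old check fires=True, new check fires=True
# host 1: old check fires=False, new check fires=True
#   -> with the old check, host 1's parent waits on return_queue forever.
```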

src/lightning_fabric/strategies/xla.py

Lines changed: 0 additions & 7 deletions

@@ -59,7 +59,6 @@ def __init__(
         self._checkpoint_io: Optional[CheckpointIO]
         self._backward_sync_control = None  # XLA synchronizes gradients in the optimizer.step() call
         self._launched = False
-        self._local_rank = 0
 
     @property
     def root_device(self) -> torch.device:
@@ -73,10 +72,6 @@ def root_device(self) -> torch.device:
     def num_processes(self) -> int:
         return len(self.parallel_devices) if self.parallel_devices is not None else 0
 
-    @property
-    def local_rank(self) -> int:
-        return self._local_rank
-
     @property
     def checkpoint_io(self) -> CheckpointIO:
         if self._checkpoint_io is None:
@@ -214,8 +209,6 @@ def register_strategies(cls, strategy_registry: Dict) -> None:
     def _set_world_ranks(self) -> None:
         if self.cluster_environment is None:
             return
-        self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank)
-        self.cluster_environment.set_world_size(self.num_processes)
         rank_zero_only.rank = self.cluster_environment.global_rank()
 
     @staticmethod
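
With both the hard-coded `_local_rank` attribute and the property override deleted, `XLAStrategy` falls back to the `local_rank` it inherits from `ParallelStrategy`, which asks the cluster environment; on XLA that environment reads ranks straight from the runtime. A minimal sketch of that delegation chain, with names mirroring Lightning's but bodies that are illustrative assumptions based on `torch_xla`'s ordinal helpers, not library source:

```python
# Sketch of the delegation the deletions rely on. XLAEnvironmentSketch
# stands in for Lightning's XLAEnvironment; the bodies are assumptions.
import torch_xla.core.xla_model as xm

class XLAEnvironmentSketch:
    """Rank information comes from the XLA runtime itself."""

    def global_rank(self) -> int:
        return xm.get_ordinal()        # pod-wide index

    def local_rank(self) -> int:
        return xm.get_local_ordinal()  # per-host index

class ParallelStrategySketch:
    def __init__(self, cluster_environment=None):
        self.cluster_environment = cluster_environment

    @property
    def local_rank(self) -> int:
        env = self.cluster_environment
        return env.local_rank() if env is not None else 0
```

This is also why `_set_world_ranks` can drop its `set_global_rank`/`set_world_size` calls: the environment now derives both values from the runtime inside each spawned worker, so nothing needs to be pushed into it before spawning.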

src/pytorch_lightning/CHANGELOG.md

Lines changed: 3 additions & 1 deletion

@@ -7,7 +7,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ## [1.9.4] - 2023-02-28
 
-No changes.
+### Fixed
+
+- Fixed DDP spawn hang on TPU Pods ([#16844](https://github.com/Lightning-AI/lightning/pull/16844))
 
 
 ## [1.9.3] - 2023-02-21

src/pytorch_lightning/strategies/launchers/xla.py

Lines changed: 4 additions & 3 deletions

@@ -88,6 +88,8 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"]
 
     def _wrapping_function(
         self,
+        # XLA's multiprocessing returns the global index, not the local index as torch's multiprocessing
+        # https://github.com/pytorch/xla/blob/v1.13.0/torch_xla/distributed/xla_multiprocessing.py#L321
         process_idx: int,
         trainer: Optional["pl.Trainer"],
         function: Callable,
@@ -96,16 +98,15 @@ def _wrapping_function(
         return_queue: SimpleQueue,
         global_states: Optional[_GlobalStateSnapshot] = None,
     ) -> None:
-        self._strategy._local_rank = process_idx
         results = function(*args, **kwargs)
 
         if trainer is not None:
             results = self._collect_rank_zero_results(trainer, results)
 
-        if process_idx == 0:
+        if self._strategy.local_rank == 0:
             return_queue.put(move_data_to_device(results, "cpu"))
 
-        _rank_teardown(process_idx)
+        _rank_teardown(self._strategy.local_rank)
 
     def _collect_rank_zero_results(self, trainer: "pl.Trainer", results: Any) -> Optional["_WorkerOutput"]:
         rank_zero_debug("Collecting results from rank 0 process.")

src/pytorch_lightning/strategies/tpu_spawn.py

Lines changed: 9 additions & 0 deletions

@@ -97,6 +97,10 @@ def root_device(self) -> torch.device:
 
         return xm.xla_device()
 
+    @property
+    def local_rank(self) -> int:
+        return self.cluster_environment.local_rank() if self.cluster_environment is not None else 0
+
     @staticmethod
     def _validate_dataloader(dataloaders: Union[TRAIN_DATALOADERS, EVAL_DATALOADERS]) -> None:
         def check_has_len(dataloader: DataLoader) -> None:
@@ -234,6 +238,11 @@ def setup_distributed(self) -> None:
         self.set_world_ranks()
         rank_zero_only.rank = self.global_rank
 
+    def set_world_ranks(self) -> None:
+        if self.cluster_environment is None:
+            return
+        rank_zero_only.rank = self.cluster_environment.global_rank()
+
     def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
         assert self.model is not None
         with self.precision_plugin.val_step_context():
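
The `set_world_ranks` override mirrors the fabric change above: instead of pushing precomputed ranks into the cluster environment, it only refreshes `rank_zero_only.rank` from the environment. A small sketch of the contract behind that refresh, using a stand-in with the same behavior Lightning's `rank_zero_only` utility relies on (illustrative, not the library implementation):

```python
# Sketch of the rank_zero_only contract: a decorator whose .rank
# attribute is refreshed per process by set_world_ranks(); decorated
# functions run only where rank == 0. Stand-in, not Lightning source.
import functools

class _RankZeroOnly:
    rank = 0  # refreshed by set_world_ranks() in each process

    def __call__(self, fn):
        @functools.wraps(fn)
        def wrapped(*args, **kwargs):
            if self.rank == 0:
                return fn(*args, **kwargs)
            return None
        return wrapped

rank_zero_only = _RankZeroOnly()

@rank_zero_only
def log_once(msg: str) -> None:
    print(msg)

rank_zero_only.rank = 3               # e.g. a worker on a pod host
log_once("only rank 0 prints this")   # no output on rank 3
```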
