Commit fcb8e17
[TPU] Preserve the device with XLA's collectives (#18275)
1 parent 683faaa

File tree: 6 files changed (+55, −22)

src/lightning/fabric/CHANGELOG.md
Lines changed: 2 additions & 0 deletions

@@ -138,6 +138,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - DataLoader re-instantiation is now only performed when a distributed sampler is required ([#18191](https://github.com/Lightning-AI/lightning/pull/18191))
 
 
+- Broadcast and reduction of tensors with XLA-based strategies now preserve the input's device ([#18275](https://github.com/Lightning-AI/lightning/pull/18275))
+
 ### Deprecated
 
 - Deprecated the `DDPStrategy.is_distributed` property. This strategy is distributed by definition ([#17381](https://github.com/Lightning-AI/lightning/pull/17381))

src/lightning/fabric/strategies/xla.py
Lines changed: 11 additions & 5 deletions

@@ -161,13 +161,15 @@ def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bo
             )
         if tensor.dim() == 0:
             tensor = tensor.unsqueeze(0)
-        if tensor.device.type != "xla":
-            tensor = tensor.to(self.root_device)
+        original_device = tensor.device
+        tensor = tensor.to(self.root_device)
 
         import torch_xla.core.functions as xf
         import torch_xla.core.xla_model as xm
 
-        return xf.all_gather(tensor) if sync_grads else xm.all_gather(tensor)
+        tensor = xf.all_gather(tensor) if sync_grads else xm.all_gather(tensor)
+        tensor = tensor.to(original_device)
+        return tensor
 
     def all_reduce(
         self, output: Union[Tensor, Any], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None
@@ -211,8 +213,9 @@ def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
         if is_tensor:
             if obj.dim() == 0:
                 obj = obj.unsqueeze(0)
-            if obj.device.type != "xla":
-                obj = obj.to(self.root_device)
+            original_device = obj.device
+            # XLA distributed requires that the data is on the XLA device
+            obj = obj.to(self.root_device)
         else:
             # support for arbitrary pickle-ables
             buffer = io.BytesIO()
@@ -226,8 +229,11 @@ def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
         obj = obj[0]
 
         if not is_tensor:
+            # this will preserve the dtype and device of any tensors
            buffer = io.BytesIO(obj.cpu().byte().numpy())
            obj = torch.load(buffer)
+        else:
+            obj = obj.to(original_device)
 
         return obj

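Both collectives now follow the same pattern: record the input's device, run the collective on the XLA device, and move the result back. Below is a minimal, runnable sketch of that round trip, with a hypothetical `fake_collective` standing in for `xm.all_gather`/`xf.all_gather` (the differentiable variant selected when `sync_grads=True`) and the CPU standing in for a real XLA device:

```python
import torch


def fake_collective(tensor: torch.Tensor) -> torch.Tensor:
    # hypothetical single-process stand-in for xm.all_gather: "gathers" a world of size 1
    return tensor.unsqueeze(0)


def device_preserving_collective(tensor: torch.Tensor, root_device: torch.device) -> torch.Tensor:
    original_device = tensor.device    # remember where the input lives
    tensor = tensor.to(root_device)    # XLA's collectives require XLA tensors
    tensor = fake_collective(tensor)   # stand-in for the real collective
    return tensor.to(original_device)  # hand the result back on the caller's device


out = device_preserving_collective(torch.tensor([1.0]), torch.device("cpu"))
assert out.device.type == "cpu"  # matches the input's device, as in the diff above
```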
src/lightning/fabric/strategies/xla_fsdp.py
Lines changed: 11 additions & 5 deletions

@@ -269,13 +269,15 @@ def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bo
             )
         if tensor.dim() == 0:
             tensor = tensor.unsqueeze(0)
-        if tensor.device.type != "xla":
-            tensor = tensor.to(self.root_device)
+        original_device = tensor.device
+        tensor = tensor.to(self.root_device)
 
         import torch_xla.core.functions as xf
         import torch_xla.core.xla_model as xm
 
-        return xf.all_gather(tensor) if sync_grads else xm.all_gather(tensor)
+        tensor = xf.all_gather(tensor) if sync_grads else xm.all_gather(tensor)
+        tensor = tensor.to(original_device)
+        return tensor
 
     def all_reduce(
         self, output: Union[Tensor, Any], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None
@@ -319,8 +321,9 @@ def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
         if is_tensor:
             if obj.dim() == 0:
                 obj = obj.unsqueeze(0)
-            if obj.device.type != "xla":
-                obj = obj.to(self.root_device)
+            original_device = obj.device
+            # XLA distributed requires that the data is on the XLA device
+            obj = obj.to(self.root_device)
         else:
             # support for arbitrary pickle-ables
             buffer = io.BytesIO()
@@ -334,8 +337,11 @@ def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
         obj = obj[0]
 
         if not is_tensor:
+            # this will preserve the dtype and device of any tensors
            buffer = io.BytesIO(obj.cpu().byte().numpy())
            obj = torch.load(buffer)
+        else:
+            obj = obj.to(original_device)
 
         return obj

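The `xla_fsdp.py` changes are identical to those in `xla.py`. For the non-tensor branch, the new comment refers to `torch.load` restoring the dtype and device of any tensors embedded in the pickled object. A standalone sketch of that byte-tensor round trip, with the actual broadcast across ranks elided and `payload` chosen purely for illustration:

```python
import io

import torch

# an arbitrary picklable object containing a tensor with a non-default dtype
payload = ("ver_0.5", "logger_name", torch.tensor(3.0, dtype=torch.bfloat16))

# source rank: serialize to a uint8 tensor that a tensor collective can broadcast
buffer = io.BytesIO()
torch.save(payload, buffer)
byte_tensor = torch.tensor(bytearray(buffer.getvalue()), dtype=torch.uint8)

# receiving rank: decode the byte tensor; torch.load restores the embedded
# tensor's dtype (and device) exactly as they were on the source rank
restored = torch.load(io.BytesIO(byte_tensor.cpu().byte().numpy()))
assert restored[2].dtype == torch.bfloat16
```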
src/lightning/pytorch/CHANGELOG.md
Lines changed: 2 additions & 0 deletions

@@ -148,6 +148,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - The input tensors now get cast to the right precision type before transfer to the device ([#18264](https://github.com/Lightning-AI/lightning/pull/18264))
 
 
+- Broadcast and reduction of tensors with XLA-based strategies now preserve the input's device ([#18275](https://github.com/Lightning-AI/lightning/pull/18275))
+
 ### Deprecated
 
 - Deprecated the `SingleTPUStrategy` (`strategy="single_tpu"`) in favor of `SingleDeviceXLAStrategy` (`strategy="single_xla"`) ([#17383](https://github.com/Lightning-AI/lightning/pull/17383))

src/lightning/pytorch/strategies/xla.py
Lines changed: 11 additions & 5 deletions

@@ -186,8 +186,9 @@ def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
         if is_tensor:
             if obj.dim() == 0:
                 obj = obj.unsqueeze(0)
-            if obj.device.type != "xla":
-                obj = obj.to(self.root_device)
+            original_device = obj.device
+            # XLA distributed requires that the data is on the XLA device
+            obj = obj.to(self.root_device)
         else:
             # support for arbitrary pickle-ables
             buffer = io.BytesIO()
@@ -201,8 +202,11 @@ def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
         obj = obj[0]
 
         if not is_tensor:
+            # this will preserve the dtype and device of any tensors
            buffer = io.BytesIO(obj.cpu().byte().numpy())
            obj = torch.load(buffer)
+        else:
+            obj = obj.to(original_device)
 
         return obj
 
@@ -290,13 +294,15 @@ def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bo
             )
         if tensor.dim() == 0:
             tensor = tensor.unsqueeze(0)
-        if tensor.device.type != "xla":
-            tensor = tensor.to(self.root_device)
+        original_device = tensor.device
+        tensor = tensor.to(self.root_device)
 
         import torch_xla.core.functions as xf
         import torch_xla.core.xla_model as xm
 
-        return xf.all_gather(tensor) if sync_grads else xm.all_gather(tensor)
+        tensor = xf.all_gather(tensor) if sync_grads else xm.all_gather(tensor)
+        tensor = tensor.to(original_device)
+        return tensor
 
     def teardown(self) -> None:
         super().teardown()

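From the user's perspective, the visible effect of the `lightning.pytorch` change is that collectives called from hooks no longer move results onto the XLA device as a side effect. A hedged illustration of the intended behavior on a TPU runtime; the module and hook body are assumptions for illustration, only `self.all_gather` is standard `LightningModule` API:

```python
import torch
from lightning.pytorch import LightningModule


class MyModule(LightningModule):  # hypothetical user module
    def on_train_epoch_end(self) -> None:
        metric = torch.tensor(1.0)          # created on the CPU
        gathered = self.all_gather(metric)  # collective still runs on the XLA device
        # after this change, the result comes back on the input's device
        assert gathered.device.type == "cpu"
```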
tests/tests_fabric/strategies/test_xla.py
Lines changed: 18 additions & 7 deletions

@@ -14,13 +14,13 @@
 import os
 from functools import partial
 from unittest import mock
-from unittest.mock import MagicMock, Mock
+from unittest.mock import ANY, MagicMock, Mock
 
 import pytest
 import torch
 from torch.utils.data import DataLoader
 
-from lightning.fabric.accelerators.xla import _XLA_GREATER_EQUAL_2_1, XLAAccelerator
+from lightning.fabric.accelerators.xla import _using_pjrt, _XLA_GREATER_EQUAL_2_1, XLAAccelerator
 from lightning.fabric.strategies import XLAStrategy
 from lightning.fabric.strategies.launchers.xla import _XLALauncher
 from lightning.fabric.utilities.distributed import ReduceOp
@@ -52,19 +52,30 @@ def xla_launch(fn, strategy=None):
 def broadcast_on_tpu_fn(strategy):
     # test broadcasting a tensor
     obj = torch.tensor(strategy.global_rank)
+    assert obj.device.type == "cpu"
     # In PjRT, the local rank and global rank have no solid relation.
     # global rank may not even be contiguous on a host, because it depends on the 3D mesh structure that is formed by
     # the TPUs on all hosts in a pod. So checking a different src is not reliable
     # https://github.com/pytorch/xla/blob/v2.0.0/torch_xla/experimental/pjrt.py#L161-L163
     src = 0
     result = strategy.broadcast(obj, src)
     assert result.item() == src
-    assert result.device.type == "xla"
+    assert result.device.type == "cpu"  # the original device is preserved
 
     # test broadcasting an arbitrary object
-    obj = ("ver_0.5", "logger_name", strategy.global_rank)
-    result = strategy.broadcast(obj, src=src)
-    assert result == ("ver_0.5", "logger_name", src)
+    if _using_pjrt():
+        tensor = torch.tensor(strategy.global_rank, device=strategy.root_device, dtype=torch.bfloat16)
+        obj = ("ver_0.5", "logger_name", strategy.global_rank, tensor)
+        result = strategy.broadcast(obj, src=src)
+        assert result == ("ver_0.5", "logger_name", src, ANY)
+        assert result[3].device.type == "xla"  # the original device is preserved
+        assert result[3].dtype == torch.bfloat16
+    else:
+        # XRT fails to unpickle tensors, segfaults with
+        # RuntimeError: vector::_M_range_check: __n (which is 1) >= this->size() (which is 1)
+        obj = ("ver_0.5", "logger_name", strategy.global_rank)
+        result = strategy.broadcast(obj, src=src)
+        assert result == ("ver_0.5", "logger_name", src)
 
 
 @RunIf(tpu=True)
@@ -134,7 +145,7 @@ def tpu_all_gather_fn(strategy):
     tensor = torch.tensor(1.0, requires_grad=True)
     result = strategy.all_gather(tensor, sync_grads=sync_grads)
     summed = result.sum()
-    assert summed.device.type == "xla"
+    assert summed.device.type == "cpu"  # the original device is preserved
     assert torch.equal(summed, torch.tensor(strategy.world_size, dtype=torch.float32))
     if not _XLA_GREATER_EQUAL_2_1:
         summed.backward()

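The updated test leans on `unittest.mock.ANY`, which compares equal to any object, so the tuple assertion can match the tensor element positionally before its dtype and device are checked explicitly. A self-contained illustration of that mechanic, with a CPU tensor standing in for the XLA one:

```python
from unittest.mock import ANY

import torch

result = ("ver_0.5", "logger_name", 0, torch.tensor(0, dtype=torch.bfloat16))
# mock.ANY compares equal to anything, so the tensor slot matches positionally
assert result == ("ver_0.5", "logger_name", 0, ANY)
# the tensor's properties are then asserted separately
assert result[3].dtype == torch.bfloat16
```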