
Commit 467b935

integrate xla strategies
1 parent c63d855 commit 467b935

File tree

3 files changed: +71, -572 lines changed


src/lightning/fabric/strategies/single_xla.py

Lines changed: 5 additions & 10 deletions
@@ -13,15 +13,14 @@
 # limitations under the License.
 from typing import Optional
 
-import torch
 from typing_extensions import override
 
 from lightning.fabric.accelerators import Accelerator
-from lightning.fabric.accelerators.xla import _XLA_AVAILABLE
 from lightning.fabric.plugins import CheckpointIO, Precision, XLAPrecision
 from lightning.fabric.plugins.io.xla import XLACheckpointIO
 from lightning.fabric.strategies import _StrategyRegistry
 from lightning.fabric.strategies.single_device import SingleDeviceStrategy
+from lightning.fabric.utilities.imports import _raise_enterprise_not_available
 from lightning.fabric.utilities.types import _DEVICE
 
 
@@ -35,20 +34,16 @@ def __init__(
         checkpoint_io: Optional[XLACheckpointIO] = None,
         precision: Optional[XLAPrecision] = None,
     ):
-        if not _XLA_AVAILABLE:
-            raise ModuleNotFoundError(str(_XLA_AVAILABLE))
-        if isinstance(device, torch.device):
-            # unwrap the `torch.device` in favor of `xla_device`
-            device = device.index
-
-        import torch_xla.core.xla_model as xm
+        _raise_enterprise_not_available()
+        from pytorch_lightning_enterprise.fabric.strategies.xla.single import validate_xla_strategy
 
         super().__init__(
             accelerator=accelerator,
-            device=xm.xla_device(device),
+            device=device,
             checkpoint_io=checkpoint_io,
             precision=precision,
         )
+        validate_xla_strategy(strategy=self, device=device)
 
     @property
     @override
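The change above replaces the strategy's own XLA availability check and device resolution with two calls: a gate that fails fast when the enterprise package is missing, and a post-init validation hook. The sketch below illustrates that gate-and-validate shape. Only the names `_raise_enterprise_not_available` and `validate_xla_strategy` appear in the diff; their bodies here are assumptions for illustration, not the real package code.

```python
# Illustrative sketch only; the real implementations live in
# lightning.fabric.utilities.imports and pytorch_lightning_enterprise.
import importlib.util

import torch


def _raise_enterprise_not_available() -> None:
    # Fail fast with an actionable error when the optional enterprise
    # package is not installed (assumed behavior of the real helper).
    if importlib.util.find_spec("pytorch_lightning_enterprise") is None:
        raise ModuleNotFoundError(
            "This strategy requires the `pytorch_lightning_enterprise` package to be installed."
        )


def validate_xla_strategy(strategy, device) -> None:
    # Hypothetical stand-in: after the base SingleDeviceStrategy.__init__ has
    # stored the requested device, confirm it resolves to a real XLA device.
    import torch_xla.core.xla_model as xm  # real torch_xla API

    index = device.index if isinstance(device, torch.device) else device
    xm.xla_device(index)  # raises if the XLA device cannot be created
```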

src/lightning/fabric/strategies/xla.py

Lines changed: 22 additions & 130 deletions
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import io
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
 import torch
@@ -22,14 +21,13 @@
 from typing_extensions import override
 
 from lightning.fabric.accelerators import Accelerator
-from lightning.fabric.accelerators.xla import _XLA_GREATER_EQUAL_2_1
 from lightning.fabric.plugins import CheckpointIO, Precision, XLAPrecision
 from lightning.fabric.plugins.environments import XLAEnvironment
 from lightning.fabric.plugins.io.xla import XLACheckpointIO
 from lightning.fabric.strategies import ParallelStrategy, _StrategyRegistry
 from lightning.fabric.strategies.launchers.xla import _XLALauncher
 from lightning.fabric.strategies.strategy import TBroadcast
-from lightning.fabric.utilities.rank_zero import rank_zero_only
+from lightning.fabric.utilities.imports import _raise_enterprise_not_available
 from lightning.fabric.utilities.types import _PATH, ReduceOp
 
 if TYPE_CHECKING:
@@ -55,22 +53,19 @@ def __init__(
             checkpoint_io=checkpoint_io,
             precision=precision,
         )
-        self._backward_sync_control = None  # XLA synchronizes gradients in the optimizer.step() call
-        self._launched = False
-        self._sync_module_states = sync_module_states
+        _raise_enterprise_not_available()
+        from pytorch_lightning_enterprise.fabric.strategies.xla.ddp import XLAStrategyFabric as EnterpriseXLAStrategy
+
+        self.xla_strategy_impl = EnterpriseXLAStrategy(outer_object=self, sync_module_states=sync_module_states)
 
     @property
     @override
     def root_device(self) -> torch.device:
-        if not self._launched:
-            raise RuntimeError("Accessing the XLA device before processes have spawned is not allowed.")
-        import torch_xla.core.xla_model as xm
-
-        return xm.xla_device()
+        return self.xla_strategy_impl.root_device
 
     @property
     def num_processes(self) -> int:
-        return len(self.parallel_devices) if self.parallel_devices is not None else 0
+        return self.xla_strategy_impl.num_processes
 
     @property
     @override
@@ -107,71 +102,42 @@ def precision(self, precision: Optional[Precision]) -> None:
     @property
     @override
     def global_rank(self) -> int:
-        return super().global_rank if self._launched else 0
+        return self.xla_strategy_impl.global_rank
 
     @property
     @override
     def local_rank(self) -> int:
-        return super().local_rank if self._launched else 0
+        return self.xla_strategy_impl.local_rank
 
     @property
     @override
     def node_rank(self) -> int:
-        return super().node_rank if self._launched else 0
+        return self.xla_strategy_impl.node_rank
 
     @property
     @override
     def world_size(self) -> int:
-        return super().world_size if self._launched else 1
+        return self.xla_strategy_impl.world_size
 
     @override
     def _configure_launcher(self) -> None:
         self._launcher = _XLALauncher(self)
 
     @override
     def setup_environment(self) -> None:
-        assert self.parallel_devices is not None
-        if len(self.parallel_devices) == 1:
-            # spawning only 1 device with PjRT is not supported:
-            # https://github.com/Lightning-AI/pytorch-lightning/pull/17408#discussion_r1170671732
-            raise NotImplementedError(
-                f"The {type(self).__name__} does not support running on a single device with the PjRT runtime."
-                " Try using all devices or the `SingleDeviceXLAStrategy` strategy"
-            )
-
-        self._launched = True
-        rank_zero_only.rank = self.global_rank
-        super().setup_environment()
+        return self.xla_strategy_impl.setup_environment()
 
     @override
     def setup_module(self, module: Module) -> Module:
-        if self._sync_module_states:
-            if _XLA_GREATER_EQUAL_2_1:
-                from torch_xla.core.xla_model import broadcast_master_param
-            else:
-                from torch_xla.experimental.pjrt import broadcast_master_param
-
-            broadcast_master_param(module)
-
-        return module
+        return self.xla_strategy_impl.setup_module(module=module)
 
     @override
     def module_to_device(self, module: Module) -> None:
-        module.to(self.root_device)
+        return self.xla_strategy_impl.module_to_device(module=module)
 
     @override
     def process_dataloader(self, dataloader: DataLoader) -> "MpDeviceLoader":
-        from torch_xla.distributed.parallel_loader import MpDeviceLoader
-
-        if isinstance(dataloader, MpDeviceLoader):
-            # dataloader is already wrapped by MpDeviceLoader
-            return dataloader
-
-        dataloader = MpDeviceLoader(dataloader, self.root_device)
-        # Mimic interface to torch.utils.data.DataLoader
-        dataloader.dataset = dataloader._loader.dataset
-        dataloader.batch_sampler = getattr(dataloader._loader, "batch_sampler", None)
-        return dataloader
+        return self.xla_strategy_impl.process_dataloader(dataloader=dataloader)
 
     @override
     def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor:
@@ -185,92 +151,21 @@ def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bo
             A tensor of shape (world_size, ...)
 
         """
-        if not self._launched:
-            return tensor
-        if not isinstance(tensor, Tensor):
-            raise NotImplementedError(
-                f"`{type(self).__name__}.all_gather` is only implemented for tensors. Given {tensor}"
-            )
-        if tensor.dim() == 0:
-            tensor = tensor.unsqueeze(0)
-        original_device = tensor.device
-        tensor = tensor.to(self.root_device)
-
-        import torch_xla.core.functions as xf
-        import torch_xla.core.xla_model as xm
-
-        tensor = xf.all_gather(tensor) if sync_grads else xm.all_gather(tensor)
-        tensor = tensor.to(original_device)
-        return tensor
+        return self.xla_strategy_impl.all_gather(tensor=tensor, group=group, sync_grads=sync_grads)
 
     @override
     def all_reduce(
         self, output: Union[Tensor, Any], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None
     ) -> Tensor:
-        if not isinstance(output, Tensor):
-            output = torch.tensor(output, device=self.root_device)
-
-        invalid_reduce_op = isinstance(reduce_op, ReduceOp) and reduce_op != ReduceOp.SUM
-        invalid_reduce_op_str = isinstance(reduce_op, str) and reduce_op.lower() not in ("sum", "mean", "avg")
-        if invalid_reduce_op or invalid_reduce_op_str:
-            raise ValueError(
-                "Currently, the XLAStrategy only supports `sum`, `mean`, `avg` for the reduce operation, got:"
-                f" {reduce_op}"
-            )
-        import torch_xla.core.xla_model as xm
-
-        output = xm.mesh_reduce("reduce", output, sum)
-
-        if isinstance(reduce_op, str) and reduce_op.lower() in ("avg", "mean"):
-            output = output / self.world_size
-
-        return output
+        return self.xla_strategy_impl.all_reduce(output=output, group=group, reduce_op=reduce_op)
 
     @override
     def barrier(self, name: Optional[str] = None, *args: Any, **kwargs: Any) -> None:
-        if not self._launched:
-            return
-        import torch_xla.core.xla_model as xm
-
-        if name is None:
-            # `None` is not supported: "TypeError: _xla_rendezvous(): incompatible function arguments"
-            name = ""
-        xm.rendezvous(name)
+        return self.xla_strategy_impl.barrier(name=name, *args, **kwargs)
 
     @override
     def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
-        if not self._launched:
-            return obj
-
-        import torch_xla.core.xla_model as xm
-
-        is_tensor = isinstance(obj, Tensor)
-        if is_tensor:
-            if obj.dim() == 0:
-                obj = obj.unsqueeze(0)
-            original_device = obj.device
-            # XLA distributed requires that the data is on the XLA device
-            obj = obj.to(self.root_device)
-        else:
-            # support for arbitrary pickle-ables
-            buffer = io.BytesIO()
-            torch.save(obj, buffer)
-            obj = torch.tensor(  # type: ignore[assignment]
-                bytearray(buffer.getbuffer()), device=self.root_device, dtype=torch.float
-            )
-
-        obj = [obj]
-        xm.collective_broadcast(obj, root_ordinal=src)
-        obj = obj[0]
-
-        if not is_tensor:
-            # this will preserve the dtype and device of any tensors
-            buffer = io.BytesIO(obj.cpu().byte().numpy())
-            obj = torch.load(buffer)
-        else:
-            obj = obj.to(original_device)
-
-        return obj
+        return self.xla_strategy_impl.broadcast(obj=obj, src=src)
 
     @override
     def save_checkpoint(
@@ -291,12 +186,9 @@ def save_checkpoint(
             boolean indicating whether the given parameter should be saved (``True``) or filtered out (``False``).
 
         """
-        import torch_xla.core.xla_model as xm
-
-        # sync any pending lazy tensors on all ranks before saving to prevent potential collective hangs
-        xm.mark_step()
-        # save on global rank zero only
-        super().save_checkpoint(path, state, storage_options=storage_options, filter=filter)
+        return self.xla_strategy_impl.save_checkpoint(
+            path=path, state=state, storage_options=storage_options, filter=filter
+        )
 
     @classmethod
     @override
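The xla.py diff above converts `XLAStrategy` from a class that talks to `torch_xla` directly into a thin forwarding layer over a `xla_strategy_impl` object created from the enterprise package. The sketch below shows that composition pattern in isolation. `EnterpriseXLAStrategy`, `outer_object`, `sync_module_states`, and `xla_strategy_impl` come from the diff; the stub implementation and reduced interface are assumptions so the example runs without `torch_xla` or the enterprise package installed.

```python
# Illustrative sketch of the delegation pattern only, not the real classes.
from typing import Any, Optional

import torch


class _StubXLAImpl:
    """Stand-in for the enterprise implementation object (assumed API)."""

    def __init__(self, outer_object: "OuterXLAStrategy", sync_module_states: bool = True) -> None:
        self._outer = outer_object
        self._sync_module_states = sync_module_states

    @property
    def root_device(self) -> torch.device:
        return torch.device("cpu")  # the real impl would return the XLA device

    @property
    def world_size(self) -> int:
        return 1  # the real impl would report the spawned world size

    def barrier(self, name: Optional[str] = None) -> None:
        pass  # the real impl would rendezvous across XLA processes


class OuterXLAStrategy:
    """Mirrors the refactored XLAStrategy: public API kept, behavior delegated."""

    def __init__(self, sync_module_states: bool = True) -> None:
        # Composition over inheritance, mirroring
        # `self.xla_strategy_impl = EnterpriseXLAStrategy(outer_object=self, ...)`.
        self.xla_strategy_impl = _StubXLAImpl(outer_object=self, sync_module_states=sync_module_states)

    @property
    def root_device(self) -> torch.device:
        return self.xla_strategy_impl.root_device

    @property
    def world_size(self) -> int:
        return self.xla_strategy_impl.world_size

    def barrier(self, name: Optional[str] = None, *args: Any, **kwargs: Any) -> None:
        return self.xla_strategy_impl.barrier(name=name)


strategy = OuterXLAStrategy()
print(strategy.root_device, strategy.world_size)  # -> cpu 1 with the stub impl
```

With this split, the open-source class keeps its registered name, constructor signature, and launcher wiring, while device handling, collectives, and checkpoint syncing live entirely in the injected implementation.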
