Commit 975c098

Commit message: update
1 parent 3aa1981 · commit 975c098

Summary: this commit adds a `_XLA_AVAILABLE` requirement flag to fabric's XLA accelerator and moves the PyTorch XLA launcher internals behind the pytorch_lightning_enterprise package.

File tree
  • src/lightning

2 files changed: 15 additions, 92 deletions

src/lightning/fabric/accelerators/xla.py
3 additions, 0 deletions

@@ -16,12 +16,15 @@
 from typing import Any, Union
 
 import torch
+from lightning_utilities.core.imports import RequirementCache
 from typing_extensions import override
 
 from lightning.fabric.accelerators.accelerator import Accelerator
 from lightning.fabric.accelerators.registry import _AcceleratorRegistry
 from lightning.fabric.utilities.imports import _raise_enterprise_not_available
 
+_XLA_AVAILABLE = RequirementCache("torch_xla>=1.13", "torch_xla")
+
 
 class XLAAccelerator(Accelerator):
     """Accelerator for XLA devices, normally TPUs.

src/lightning/pytorch/strategies/launchers/xla.py
12 additions, 92 deletions

@@ -11,23 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
 import queue
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
 import torch.multiprocessing as mp
 from typing_extensions import override
 
-from lightning.fabric.accelerators.xla import _XLA_AVAILABLE
-from lightning.fabric.strategies.launchers.xla import _rank_teardown
-from lightning.fabric.utilities import move_data_to_device
 from lightning.pytorch.strategies.launchers.multiprocessing import (
     _GlobalStateSnapshot,
     _MultiProcessingLauncher,
     _WorkerOutput,
 )
-from lightning.pytorch.trainer.states import TrainerFn
-from lightning.pytorch.utilities.rank_zero import rank_zero_debug
+from lightning.pytorch.utilities.imports import _raise_if_not_enterprise_not_available
 
 if TYPE_CHECKING:
     import lightning.pytorch as pl
@@ -51,14 +46,16 @@ class _XLALauncher(_MultiProcessingLauncher):
     """
 
     def __init__(self, strategy: "pl.strategies.XLAStrategy") -> None:
-        if not _XLA_AVAILABLE:
-            raise ModuleNotFoundError(str(_XLA_AVAILABLE))
-        super().__init__(strategy=strategy, start_method="fork")
+        super().__init__(strategy)
+        _raise_if_not_enterprise_not_available()
+        from pytorch_lightning_enterprise.strategies.xla.launcher import _XLALauncherTrainer as EnterpriseXLALauncher
+
+        self.xla_launcher_impl = EnterpriseXLALauncher(strategy)
 
     @property
     @override
     def is_interactive_compatible(self) -> bool:
-        return True
+        return self.xla_launcher_impl.is_interactive_compatible()
 
     @override
     def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] = None, **kwargs: Any) -> Any:
@@ -75,46 +72,7 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"]
         **kwargs: Optional keyword arguments to be passed to the given function.
 
         """
-        if self._already_fit and trainer is not None and trainer.state.fn == TrainerFn.FITTING:
-            # resolving https://github.com/Lightning-AI/pytorch-lightning/issues/18775 will lift this restriction
-            raise NotImplementedError(
-                "Calling `trainer.fit()` twice on the same Trainer instance using a spawn-based strategy is not"
-                " supported. You can work around this by creating a new Trainer instance and passing the"
-                " `fit(ckpt_path=...)` argument."
-            )
-
-        # pjrt requires that the queue is serializable
-        return_queue = mp.Manager().Queue()
-
-        import torch_xla.distributed.xla_multiprocessing as xmp
-
-        spawn_kwargs = {}
-        nprocs = self._strategy.num_processes
-        if nprocs == 1:
-            # avoid warning: "Unsupported nprocs". If it's 1, it will call the launched function directly.
-            # otherwise it will use all devices
-            spawn_kwargs["nprocs"] = nprocs
-
-        process_context = xmp.spawn(
-            self._wrapping_function,
-            args=(trainer, function, args, kwargs, return_queue),
-            start_method=self._start_method,
-            join=False,  # we will join ourselves to get the process references
-            **spawn_kwargs,
-        )
-        # xla will not actually create processes if only 1 device
-        if process_context is not None:
-            self.procs = process_context.processes
-            while not process_context.join():
-                pass
-
-        worker_output = return_queue.get()
-        if trainer is None:
-            return worker_output
-
-        self._already_fit |= trainer.state.fn == TrainerFn.FITTING
-        self._recover_results_in_main_process(worker_output, trainer)
-        return worker_output.trainer_results
+        return self.xla_launcher_impl.launch(function, *args, trainer=trainer, **kwargs)
 
     @override
     def _wrapping_function(
@@ -129,48 +87,10 @@ def _wrapping_function(
         return_queue: Union[mp.SimpleQueue, queue.Queue],
         global_states: Optional[_GlobalStateSnapshot] = None,
     ) -> None:
-        import torch_xla.core.xla_model as xm
-
-        if len(xm.get_xla_supported_devices()) > 1:
-            # `get_xla_supported_devices` in the spawned process returns the logical devices (2 for v2/v3 and 1 for v4)
-            # so when there's more than one (multithreading), objects need to be deep-copied
-            import copy
-
-            trainer, function, args, kwargs = copy.deepcopy((trainer, function, args, kwargs))
-
-        results = function(*args, **kwargs)
-
-        if trainer is not None:
-            results = self._collect_rank_zero_results(trainer, results)
-
-        if self._strategy.local_rank == 0:
-            return_queue.put(move_data_to_device(results, "cpu"))
-
-        _rank_teardown(self._strategy.local_rank)
+        return self.xla_launcher_impl._wrapping_function(
+            process_idx, trainer, function, args, kwargs, return_queue, global_states
+        )
 
     @override
     def _collect_rank_zero_results(self, trainer: "pl.Trainer", results: Any) -> Optional["_WorkerOutput"]:
-        rank_zero_debug("Collecting results from rank 0 process.")
-        checkpoint_callback = trainer.checkpoint_callback
-        best_model_path = (
-            checkpoint_callback.best_model_path
-            if checkpoint_callback and hasattr(checkpoint_callback, "best_model_path")
-            else None
-        )
-
-        # save the last weights
-        weights_path = None
-        if trainer.state.fn == TrainerFn.FITTING:
-            # requires to compute the state_dict on all processes in case Metrics are present
-            state_dict = self._strategy.lightning_module_state_dict()
-            weights_path = os.path.join(trainer.default_root_dir, ".temp.ckpt")
-            self._strategy.checkpoint_io.save_checkpoint(state_dict, weights_path)
-
-        # We use `local_rank` here as separate filesystems are used for each VM for TPU Pod Training
-        if self._strategy.local_rank != 0:
-            return None
-
-        # add extra result data from trainer to send to main process
-        extra = self.get_extra_results(trainer)
-
-        return _WorkerOutput(best_model_path, weights_path, trainer.state, results, extra)
+        return self.xla_launcher_impl._collect_rank_zero_results(trainer, results)
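The delegation above implies a small contract on the enterprise launcher. A hypothetical sketch of that interface, inferred only from the calls made in this diff (the actual `_XLALauncherTrainer` definition lives in pytorch_lightning_enterprise and is not part of this commit):

    from typing import Any, Callable, Optional, Protocol

    class _XLALauncherContract(Protocol):
        # A plain method, not a property: the diff calls it with parentheses.
        def is_interactive_compatible(self) -> bool: ...

        # Inferred from `launch` forwarding its function, args, trainer, and kwargs.
        def launch(self, function: Callable, *args: Any, trainer: Optional[Any] = None, **kwargs: Any) -> Any: ...

        # Inferred from `_wrapping_function` forwarding all seven arguments positionally.
        def _wrapping_function(self, process_idx: int, trainer: Any, function: Callable,
                               args: Any, kwargs: Any, return_queue: Any, global_states: Any) -> None: ...

        # Inferred from `_collect_rank_zero_results` forwarding trainer and results.
        def _collect_rank_zero_results(self, trainer: Any, results: Any) -> Optional[Any]: ...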
