@@ -16,12 +16,12 @@
 import logging
 import os
 import platform
-from collections.abc import Mapping
+from collections.abc import Callable, Mapping
 from contextlib import AbstractContextManager, ExitStack
 from datetime import timedelta
 from itertools import chain
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 from lightning_utilities.core.imports import RequirementCache
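A note on the import shuffle above: since Python 3.9 (PEP 585), collections.abc.Callable is subscriptable and typing.Callable is deprecated in its favor, so the callable annotations used later in this file keep working unchanged. A standalone sketch of the equivalent spelling (the FilterFn alias and keep_all function are illustrative, not part of the patch):

from collections.abc import Callable
from typing import Any

# The same parameterized form typing.Callable used to provide:
# a callable taking (str, Any) and returning bool.
FilterFn = Callable[[str, Any], bool]

def keep_all(name: str, value: Any) -> bool:
    return True

f: FilterFn = keep_all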
@@ -57,10 +57,10 @@ class DeepSpeedStrategy(DDPStrategy, _Sharded):
 
     def __init__(
         self,
-        accelerator: Optional[Accelerator] = None,
+        accelerator: Accelerator | None = None,
         zero_optimization: bool = True,
         stage: int = 2,
-        remote_device: Optional[str] = None,
+        remote_device: str | None = None,
         offload_optimizer: bool = False,
         offload_parameters: bool = False,
         offload_params_device: str = "cpu",
@@ -84,11 +84,11 @@ def __init__(
         allgather_bucket_size: int = 200_000_000,
         reduce_bucket_size: int = 200_000_000,
         zero_allow_untested_optimizer: bool = True,
-        logging_batch_size_per_gpu: Optional[int] = None,
-        config: Optional[Union[_PATH, dict[str, Any]]] = None,
+        logging_batch_size_per_gpu: int | None = None,
+        config: _PATH | dict[str, Any] | None = None,
         logging_level: int = logging.WARN,
-        parallel_devices: Optional[list[torch.device]] = None,
-        cluster_environment: Optional[ClusterEnvironment] = None,
+        parallel_devices: list[torch.device] | None = None,
+        cluster_environment: ClusterEnvironment | None = None,
         loss_scale: float = 0,
         initial_scale_power: int = 16,
         loss_scale_window: int = 1000,
@@ -99,9 +99,9 @@ def __init__(
         contiguous_memory_optimization: bool = False,
         synchronize_checkpoint_boundary: bool = False,
         load_full_weights: bool = False,
-        precision: Optional[Precision] = None,
-        process_group_backend: Optional[str] = None,
-        timeout: Optional[timedelta] = default_pg_timeout,
+        precision: Precision | None = None,
+        process_group_backend: str | None = None,
+        timeout: timedelta | None = default_pg_timeout,
         exclude_frozen_parameters: bool = False,
     ) -> None:
         """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large
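The pattern running through these signature hunks is PEP 604: Optional[X] becomes X | None and Union[A, B] becomes A | B. Since several of the rewritten annotations are evaluated when the module is imported (annotations in def headers are evaluated at function definition time, and the attribute annotation on self._timeout further down is evaluated on each call), this spelling presumably assumes a Python 3.10+ floor, where | on types builds a types.UnionType; on 3.9 it would need from __future__ import annotations. A quick standalone equivalence check:

from typing import Optional, Union, get_args

# On Python 3.10+, both spellings compare equal and carry the same args.
assert (int | None) == Optional[int]
assert (str | int | None) == Optional[Union[str, int]]
assert get_args(int | None) == (int, type(None))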
@@ -262,7 +262,7 @@ def __init__(
             process_group_backend=process_group_backend,
         )
         self._backward_sync_control = None  # DeepSpeed handles gradient accumulation internally
-        self._timeout: Optional[timedelta] = timeout
+        self._timeout: timedelta | None = timeout
 
         self.config = self._load_config(config)
         if self.config is None:
@@ -316,7 +316,7 @@ def __init__(
         self.hysteresis = hysteresis
         self.min_loss_scale = min_loss_scale
 
-        self._deepspeed_engine: Optional[DeepSpeedEngine] = None
+        self._deepspeed_engine: DeepSpeedEngine | None = None
 
     @property
     def zero_stage_3(self) -> bool:
@@ -374,7 +374,7 @@ def setup_optimizer(self, optimizer: Optimizer) -> Optimizer:
         raise NotImplementedError(self._err_msg_joint_setup_required())
 
     @override
-    def module_init_context(self, empty_init: Optional[bool] = None) -> AbstractContextManager:
+    def module_init_context(self, empty_init: bool | None = None) -> AbstractContextManager:
         if self.zero_stage_3 and empty_init is False:
             raise NotImplementedError(
                 f"`{empty_init=}` is not a valid choice with `DeepSpeedStrategy` when ZeRO stage 3 is enabled."
@@ -404,9 +404,9 @@ def module_sharded_context(self) -> AbstractContextManager:
     def save_checkpoint(
         self,
         path: _PATH,
-        state: dict[str, Union[Module, Optimizer, Any]],
-        storage_options: Optional[Any] = None,
-        filter: Optional[dict[str, Callable[[str, Any], bool]]] = None,
+        state: dict[str, Module | Optimizer | Any],
+        storage_options: Any | None = None,
+        filter: dict[str, Callable[[str, Any], bool]] | None = None,
     ) -> None:
         """Save model, optimizer, and other state in a checkpoint directory.
 
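For context on the filter parameter retyped above: each dict entry pairs a state key with a predicate that receives a (name, value) item from that object's state dict and returns whether to keep it. A hedged sketch of a predicate matching the annotated Callable[[str, Any], bool] shape (the "model" key and the predicate itself are illustrative, not from this diff):

from typing import Any
import torch

def keep_float_tensors(name: str, value: Any) -> bool:
    # Keep only floating-point tensors; drop entries such as integer counters.
    return isinstance(value, torch.Tensor) and value.is_floating_point()

checkpoint_filter = {"model": keep_float_tensors}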
@@ -471,9 +471,9 @@ def save_checkpoint(
     def load_checkpoint(
         self,
         path: _PATH,
-        state: Optional[Union[Module, Optimizer, dict[str, Union[Module, Optimizer, Any]]]] = None,
+        state: Module | Optimizer | dict[str, Module | Optimizer | Any] | None = None,
         strict: bool = True,
-        weights_only: Optional[bool] = None,
+        weights_only: bool | None = None,
     ) -> dict[str, Any]:
         """Load the contents from a checkpoint and restore the state of the given objects.
 
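The state hunk above also flattens a nested Optional[Union[...]] into a single | chain. Unions flatten and deduplicate, so the nesting carries no extra meaning and the flat spelling is equivalent; a standalone sanity check:

from typing import Optional, Union

# Nested unions flatten, and Optional is just "| None":
assert Union[int, Union[str, bytes]] == Union[int, str, bytes]
assert Optional[Union[int, str]] == (int | str | None)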
@@ -554,8 +554,8 @@ def clip_gradients_norm(
         self,
         module: "DeepSpeedEngine",
         optimizer: Optimizer,
-        max_norm: Union[float, int],
-        norm_type: Union[float, int] = 2.0,
+        max_norm: float | int,
+        norm_type: float | int = 2.0,
         error_if_nonfinite: bool = True,
     ) -> torch.Tensor:
         raise NotImplementedError(
@@ -564,9 +564,7 @@ def clip_gradients_norm(
         )
 
     @override
-    def clip_gradients_value(
-        self, module: "DeepSpeedEngine", optimizer: Optimizer, clip_val: Union[float, int]
-    ) -> None:
+    def clip_gradients_value(self, module: "DeepSpeedEngine", optimizer: Optimizer, clip_val: float | int) -> None:
         raise NotImplementedError(
             "DeepSpeed handles gradient clipping automatically within the optimizer. "
             "Make sure to set the `gradient_clipping` value in your Config."
@@ -614,7 +612,7 @@ def register_strategies(cls, strategy_registry: _StrategyRegistry) -> None:
         )
 
     def _initialize_engine(
-        self, model: Module, optimizer: Optional[Optimizer] = None, scheduler: Optional["_LRScheduler"] = None
+        self, model: Module, optimizer: Optimizer | None = None, scheduler: Optional["_LRScheduler"] = None
     ) -> tuple["DeepSpeedEngine", Optimizer, Any]:
         """Initialize one model and one optimizer with an optional learning rate scheduler.
 
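One holdout in the hunk above: scheduler keeps Optional["_LRScheduler"] while its neighbors move to |. The likely reason (assuming _LRScheduler is only imported under TYPE_CHECKING, as is common for such names) is that a quoted forward reference cannot be an operand of | in an annotation that gets evaluated at runtime, since str defines no | operator. A minimal standalone reproduction:

from typing import Optional

# Fine: typing wraps the string in a lazy ForwardRef, resolved only on demand.
def f(scheduler: Optional["_LRScheduler"] = None) -> None: ...

# TypeError at definition time: unsupported operand type(s) for |: 'str' and 'NoneType'
# def g(scheduler: "_LRScheduler" | None = None) -> None: ...

# Also fine: quoting the entire annotation defers evaluation altogether.
def h(scheduler: "_LRScheduler | None" = None) -> None: ...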
@@ -716,7 +714,7 @@ def _create_default_config(
         self,
         zero_optimization: bool,
         zero_allow_untested_optimizer: bool,
-        logging_batch_size_per_gpu: Optional[int],
+        logging_batch_size_per_gpu: int | None,
         partition_activations: bool,
         cpu_checkpointing: bool,
         contiguous_memory_optimization: bool,
@@ -825,7 +823,7 @@ def load(module: torch.nn.Module, prefix: str = "") -> None:
 
         load(module, prefix="")
 
-    def _load_config(self, config: Optional[Union[_PATH, dict[str, Any]]]) -> Optional[dict[str, Any]]:
+    def _load_config(self, config: _PATH | dict[str, Any] | None) -> dict[str, Any] | None:
         if config is None and self.DEEPSPEED_ENV_VAR in os.environ:
             rank_zero_info(f"Loading DeepSpeed config from set {self.DEEPSPEED_ENV_VAR} environment variable")
             config = os.environ[self.DEEPSPEED_ENV_VAR]