Commit 99662c5

support Musa device
1 parent 25c9922 commit 99662c5
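
Below is a minimal usage sketch (not part of the commit) showing how the new accelerator could be selected once it is registered under the name "musa". It assumes a PyTorch build that exposes the `torch.musa` backend and at least one MUSA device.

import torch
from lightning.fabric import Fabric

# "musa" is the name MUSAAccelerator registers itself under; "auto" and "gpu" also
# resolve to it when neither MPS nor CUDA is available (see connector.py below).
fabric = Fabric(accelerator="musa", devices=1)
fabric.launch()

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = fabric.setup(model, optimizer)

x = torch.randn(8, 4, device=fabric.device)  # tensors are placed on a `musa` device
loss = model(x).sum()
fabric.backward(loss)
optimizer.step()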

13 files changed: +355, -20 lines

docs/source-fabric/api/accelerators.rst

Lines changed: 1 addition & 0 deletions

@@ -20,3 +20,4 @@ Accelerators
     CUDAAccelerator
     MPSAccelerator
     XLAAccelerator
+    MUSAAccelerator

docs/source-pytorch/api_references.rst

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ accelerators
     CPUAccelerator
     CUDAAccelerator
     XLAAccelerator
+    MUSAAccelerator

 callbacks
 ---------

docs/source-pytorch/extensions/accelerator.rst

Lines changed: 1 addition & 0 deletions

@@ -128,3 +128,4 @@ Accelerator API
     CUDAAccelerator
     MPSAccelerator
     XLAAccelerator
+    MUSAAccelerator

src/lightning/fabric/accelerators/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -18,6 +18,7 @@
 from lightning.fabric.accelerators.mps import MPSAccelerator  # noqa: F401
 from lightning.fabric.accelerators.registry import _AcceleratorRegistry
 from lightning.fabric.accelerators.xla import XLAAccelerator  # noqa: F401
+from lightning.fabric.accelerators.musa import MUSAAccelerator  # noqa: F401
 from lightning.fabric.utilities.registry import _register_classes

 ACCELERATOR_REGISTRY = _AcceleratorRegistry()

src/lightning/fabric/accelerators/musa.py (new file)

Lines changed: 186 additions & 0 deletions

@@ -0,0 +1,186 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import lru_cache
+from typing import Optional, Union
+
+import torch
+from typing_extensions import override
+
+from lightning.fabric.accelerators.accelerator import Accelerator
+from lightning.fabric.accelerators.registry import _AcceleratorRegistry
+from lightning.fabric.utilities.rank_zero import rank_zero_info
+
+
+class MUSAAccelerator(Accelerator):
+    """Accelerator for MUSA devices."""
+
+    @override
+    def setup_device(self, device: torch.device) -> None:
+        """
+        Raises:
+            ValueError:
+                If the selected device is not of type MUSA.
+        """
+        if device.type != "musa":
+            raise ValueError(f"Device should be MUSA, got {device} instead.")
+        _check_musa_matmul_precision(device)
+        torch.musa.set_device(device)
+
+    @override
+    def teardown(self) -> None:
+        _clear_musa_memory()
+
+    @staticmethod
+    @override
+    def parse_devices(devices: Union[int, str, list[int]]) -> Optional[list[int]]:
+        """Accelerator device parsing logic."""
+        from lightning.fabric.utilities.device_parser import _parse_gpu_ids
+
+        return _parse_gpu_ids(devices, include_musa=True)
+
+    @staticmethod
+    @override
+    def get_parallel_devices(devices: list[int]) -> list[torch.device]:
+        """Gets parallel devices for the Accelerator."""
+        return [torch.device("musa", i) for i in devices]
+
+    @staticmethod
+    @override
+    def auto_device_count() -> int:
+        """Get the devices when set to auto."""
+        return num_musa_devices()
+
+    @staticmethod
+    @override
+    def is_available() -> bool:
+        return num_musa_devices() > 0
+
+    @staticmethod
+    @override
+    def name() -> str:
+        return "musa"
+
+    @classmethod
+    @override
+    def register_accelerators(cls, accelerator_registry: _AcceleratorRegistry) -> None:
+        accelerator_registry.register(
+            cls.name(),
+            cls,
+            description=cls.__name__,
+        )
+
+
+def find_usable_musa_devices(num_devices: int = -1) -> list[int]:
+    """Returns a list of all available and usable MUSA GPU devices.
+
+    A GPU is considered usable if we can successfully move a tensor to the device, and this is what this function
+    tests for each GPU on the system until the target number of usable devices is found.
+
+    A subset of GPUs on the system might be used by other processes, and if the GPU is configured to operate in
+    'exclusive' mode (configurable by the admin), then only one process is allowed to occupy it.
+
+    Args:
+        num_devices: The number of devices you want to request. By default, this function will return as many as there
+            are usable MUSA GPU devices available.
+
+    Warning:
+        If multiple processes call this function at the same time, there can be race conditions in the case where
+        both processes determine that the device is unoccupied, leading to one of them crashing later on.
+
+    """
+    if num_devices == 0:
+        return []
+    visible_devices = _get_all_visible_musa_devices()
+    if not visible_devices:
+        raise ValueError(
+            f"You requested to find {num_devices} devices but there are no visible MUSA devices on this machine."
+        )
+    if num_devices > len(visible_devices):
+        raise ValueError(
+            f"You requested to find {num_devices} devices but this machine only has {len(visible_devices)} GPUs."
+        )
+
+    available_devices = []
+    unavailable_devices = []
+
+    for gpu_idx in visible_devices:
+        try:
+            torch.tensor(0, device=torch.device("musa", gpu_idx))
+        except RuntimeError:
+            unavailable_devices.append(gpu_idx)
+            continue
+
+        available_devices.append(gpu_idx)
+        if len(available_devices) == num_devices:
+            # exit early if we found the right number of GPUs
+            break
+
+    if num_devices != -1 and len(available_devices) != num_devices:
+        raise RuntimeError(
+            f"You requested to find {num_devices} devices but only {len(available_devices)} are currently available."
+            f" The devices {unavailable_devices} are occupied by other processes and can't be used at the moment."
+        )
+    return available_devices
+
+
+def _get_all_visible_musa_devices() -> list[int]:
+    """Returns a list of all visible MUSA GPU devices.
+
+    Devices masked by the environment variable ``MUSA_VISIBLE_DEVICES`` won't be returned here. For example, assume you
+    have 8 physical GPUs. If ``MUSA_VISIBLE_DEVICES="1,3,6"``, then this function will return the list ``[0, 1, 2]``
+    because these are the three visible GPUs after applying the mask ``MUSA_VISIBLE_DEVICES``.
+
+    """
+    return list(range(num_musa_devices()))
+
+
+def num_musa_devices() -> int:
+    """Returns the number of available MUSA devices."""
+    return torch.musa.device_count()
+
+
+def is_musa_available() -> bool:
+    """Returns a bool indicating if MUSA is currently available."""
+    return torch.musa.is_available()
+
+
+def _is_ampere_or_later(device: Optional[torch.device] = None) -> bool:
+    major, _ = torch.musa.get_device_capability(device)
+    return major >= 8  # Ampere and later leverage tensor cores, where this setting becomes useful
+
+
+@lru_cache(1)  # show the warning only ever once
+def _check_musa_matmul_precision(device: torch.device) -> None:
+    if not torch.musa.is_available() or not _is_ampere_or_later(device):
+        return
+    # check that the user hasn't changed the precision already, this works for both `allow_tf32 = True` and
+    # `set_float32_matmul_precision`
+    if torch.get_float32_matmul_precision() == "highest":  # default
+        rank_zero_info(
+            f"You are using a MUSA device ({torch.musa.get_device_name(device)!r}) that has Tensor Cores. To properly"
+            " utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off"
+            " precision for performance. For more details, read https://pytorch.org/docs/stable/generated/"
+            "torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision"
+        )
+    # note: there is no need to change `torch.backends.cudnn.allow_tf32` as it's enabled by default:
+    # https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+
+
+def _clear_musa_memory() -> None:
+    # strangely, the attribute can be undefined when torch.compile is used
+    if hasattr(torch._C, "_musa_clearMublasWorkspaces"):
+        # https://github.com/pytorch/pytorch/issues/95668
+        torch._C._musa_clearMublasWorkspaces()
+    torch.musa.empty_cache()
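
For illustration, a short sketch (not part of the diff) of how the helpers defined above could be used; it assumes a `torch.musa`-enabled build with at least one usable MUSA device.

from lightning.fabric.accelerators.musa import (
    MUSAAccelerator,
    find_usable_musa_devices,
    is_musa_available,
    num_musa_devices,
)

if is_musa_available():
    print(f"visible MUSA devices: {num_musa_devices()}")
    # find_usable_musa_devices probes each device by allocating a small tensor on it (see above)
    usable = find_usable_musa_devices(num_devices=1)
    devices = MUSAAccelerator.get_parallel_devices(usable)
    print(devices)  # e.g. [device(type='musa', index=0)]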

src/lightning/fabric/cli.py

Lines changed: 4 additions & 2 deletions

@@ -21,7 +21,7 @@
 from lightning_utilities.core.imports import RequirementCache
 from typing_extensions import get_args

-from lightning.fabric.accelerators import CPUAccelerator, CUDAAccelerator, MPSAccelerator
+from lightning.fabric.accelerators import CPUAccelerator, CUDAAccelerator, MPSAccelerator, MUSAAccelerator
 from lightning.fabric.plugins.precision.precision import _PRECISION_INPUT_STR, _PRECISION_INPUT_STR_ALIAS
 from lightning.fabric.strategies import STRATEGY_REGISTRY
 from lightning.fabric.utilities.consolidate_checkpoint import _process_cli_args
@@ -196,9 +196,11 @@ def _get_num_processes(accelerator: str, devices: str) -> int:
     else:
         raise ValueError(f"Cannot default to '1' device for accelerator='{accelerator}'")
     if accelerator == "gpu":
-        parsed_devices = _parse_gpu_ids(devices, include_cuda=True, include_mps=True)
+        parsed_devices = _parse_gpu_ids(devices, include_cuda=True, include_mps=True, include_musa=True)
     elif accelerator == "cuda":
         parsed_devices = CUDAAccelerator.parse_devices(devices)
+    elif accelerator == "musa":
+        parsed_devices = MUSAAccelerator.parse_devices(devices)
     elif accelerator == "mps":
         parsed_devices = MPSAccelerator.parse_devices(devices)
     elif accelerator == "tpu":
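
As an illustration of the new branch (not part of the diff), the launcher's private process-count helper now accepts the "musa" accelerator string; the concrete result assumes a machine with at least two usable MUSA devices.

from lightning.fabric.cli import _get_num_processes

# e.g. what a launch with --accelerator=musa --devices=2 would use to decide
# how many local processes to spawn (assuming two usable MUSA devices):
print(_get_num_processes(accelerator="musa", devices="2"))  # -> 2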

src/lightning/fabric/connector.py

Lines changed: 7 additions & 2 deletions

@@ -24,6 +24,7 @@
 from lightning.fabric.accelerators.cuda import CUDAAccelerator
 from lightning.fabric.accelerators.mps import MPSAccelerator
 from lightning.fabric.accelerators.xla import XLAAccelerator
+from lightning.fabric.accelerators.musa import MUSAAccelerator
 from lightning.fabric.plugins import (
     BitsandbytesPrecision,
     CheckpointIO,
@@ -322,6 +323,8 @@ def _choose_auto_accelerator() -> str:
             return "mps"
         if CUDAAccelerator.is_available():
             return "cuda"
+        if MUSAAccelerator.is_available():
+            return "musa"
         return "cpu"

     @staticmethod
@@ -330,6 +333,8 @@ def _choose_gpu_accelerator_backend() -> str:
             return "mps"
         if CUDAAccelerator.is_available():
             return "cuda"
+        if MUSAAccelerator.is_available():
+            return "musa"
         raise RuntimeError("No supported gpu backend found!")

     def _set_parallel_devices_and_init_accelerator(self) -> None:
@@ -400,8 +405,8 @@ def _choose_strategy(self) -> Union[Strategy, str]:
         if self._num_nodes_flag > 1:
             return "ddp"
         if len(self._parallel_devices) <= 1:
-            if isinstance(self._accelerator_flag, (CUDAAccelerator, MPSAccelerator)) or (
-                isinstance(self._accelerator_flag, str) and self._accelerator_flag in ("cuda", "gpu", "mps")
+            if isinstance(self._accelerator_flag, (CUDAAccelerator, MPSAccelerator, MUSAAccelerator)) or (
+                isinstance(self._accelerator_flag, str) and self._accelerator_flag in ("cuda", "gpu", "mps", "musa")
             ):
                 device = _determine_root_gpu_device(self._parallel_devices)
             else:
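
To make the new fallback order concrete, a sketch using the private connector helpers changed above; the outputs assume a machine with no MPS or CUDA support but at least one MUSA device.

from lightning.fabric.connector import _Connector

# MPS and CUDA are checked first, so "musa" is only chosen when neither is available.
print(_Connector._choose_auto_accelerator())         # -> "musa"
print(_Connector._choose_gpu_accelerator_backend())  # -> "musa"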

src/lightning/fabric/utilities/device_parser.py

Lines changed: 18 additions & 11 deletions

@@ -50,6 +50,7 @@ def _parse_gpu_ids(
     gpus: Optional[Union[int, str, list[int]]],
     include_cuda: bool = False,
     include_mps: bool = False,
+    include_musa: bool = False,
 ) -> Optional[list[int]]:
     """Parses the GPU IDs given in the format as accepted by the :class:`~lightning.pytorch.trainer.trainer.Trainer`.

@@ -61,6 +62,7 @@ def _parse_gpu_ids(
             Any int N > 0 indicates that GPUs [0..N) should be used.
         include_cuda: A boolean value indicating whether to include CUDA devices for GPU parsing.
         include_mps: A boolean value indicating whether to include MPS devices for GPU parsing.
+        include_musa: A boolean value indicating whether to include MUSA devices for GPU parsing.

     Returns:
         A list of GPUs to be used or ``None`` if no GPUs were requested
@@ -70,7 +72,7 @@ def _parse_gpu_ids(
             If no GPUs are available but the value of gpus variable indicates request for GPUs

     .. note::
-        ``include_cuda`` and ``include_mps`` default to ``False`` so that you only
+        ``include_cuda``, ``include_mps``, and ``include_musa`` default to ``False`` so that you only
         have to specify which device type to use and all other devices are not disabled.

     """
@@ -84,23 +86,23 @@ def _parse_gpu_ids(
     # We know the user requested GPUs therefore if some of the
     # requested GPUs are not available an exception is thrown.
     gpus = _normalize_parse_gpu_string_input(gpus)
-    gpus = _normalize_parse_gpu_input_to_list(gpus, include_cuda=include_cuda, include_mps=include_mps)
+    gpus = _normalize_parse_gpu_input_to_list(gpus, include_cuda=include_cuda, include_mps=include_mps, include_musa=include_musa)
     if not gpus:
         raise MisconfigurationException("GPUs requested but none are available.")

     if (
         torch.distributed.is_available()
         and torch.distributed.is_torchelastic_launched()
         and len(gpus) != 1
-        and len(_get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps)) == 1
+        and len(_get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps, include_musa=include_musa)) == 1
     ):
         # Omit sanity check on torchelastic because by default it shows one visible GPU per process
         return gpus

     # Check that GPUs are unique. Duplicate GPUs are not supported by the backend.
     _check_unique(gpus)

-    return _sanitize_gpu_ids(gpus, include_cuda=include_cuda, include_mps=include_mps)
+    return _sanitize_gpu_ids(gpus, include_cuda=include_cuda, include_mps=include_mps, include_musa=include_musa)


 def _normalize_parse_gpu_string_input(s: Union[int, str, list[int]]) -> Union[int, list[int]]:
@@ -113,7 +115,7 @@ def _normalize_parse_gpu_string_input(s: Union[int, str, list[int]]) -> Union[in
         return int(s.strip())


-def _sanitize_gpu_ids(gpus: list[int], include_cuda: bool = False, include_mps: bool = False) -> list[int]:
+def _sanitize_gpu_ids(gpus: list[int], include_cuda: bool = False, include_mps: bool = False, include_musa: bool = False) -> list[int]:
     """Checks that each of the GPUs in the list is actually available. Raises a MisconfigurationException if any of the
     GPUs is not available.

@@ -128,9 +130,9 @@ def _sanitize_gpu_ids(gpus: list[int], include_cuda: bool = False, include_mps:
             If machine has fewer available GPUs than requested.

     """
-    if sum((include_cuda, include_mps)) == 0:
+    if sum((include_cuda, include_mps, include_musa)) == 0:
         raise ValueError("At least one gpu type should be specified!")
-    all_available_gpus = _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps)
+    all_available_gpus = _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps, include_musa=include_musa)
     for gpu in gpus:
         if gpu not in all_available_gpus:
             raise MisconfigurationException(
@@ -140,7 +142,7 @@ def _sanitize_gpu_ids(gpus: list[int], include_cuda: bool = False, include_mps:


 def _normalize_parse_gpu_input_to_list(
-    gpus: Union[int, list[int], tuple[int, ...]], include_cuda: bool, include_mps: bool
+    gpus: Union[int, list[int], tuple[int, ...]], include_cuda: bool, include_mps: bool, include_musa: bool
 ) -> Optional[list[int]]:
     assert gpus is not None
     if isinstance(gpus, (MutableSequence, tuple)):
@@ -150,22 +152,24 @@ def _normalize_parse_gpu_input_to_list(
     if not gpus:  # gpus==0
         return None
     if gpus == -1:
-        return _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps)
+        return _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps, include_musa=include_musa)

     return list(range(gpus))


-def _get_all_available_gpus(include_cuda: bool = False, include_mps: bool = False) -> list[int]:
+def _get_all_available_gpus(include_cuda: bool = False, include_mps: bool = False, include_musa: bool = False) -> list[int]:
     """
     Returns:
         A list of all available GPUs
     """
     from lightning.fabric.accelerators.cuda import _get_all_visible_cuda_devices
     from lightning.fabric.accelerators.mps import _get_all_available_mps_gpus
+    from lightning.fabric.accelerators.musa import _get_all_visible_musa_devices

     cuda_gpus = _get_all_visible_cuda_devices() if include_cuda else []
     mps_gpus = _get_all_available_mps_gpus() if include_mps else []
-    return cuda_gpus + mps_gpus
+    musa_gpus = _get_all_visible_musa_devices() if include_musa else []
+    return cuda_gpus + mps_gpus + musa_gpus


 def _check_unique(device_ids: list[int]) -> None:
@@ -211,11 +215,14 @@ def _select_auto_accelerator() -> str:
     from lightning.fabric.accelerators.cuda import CUDAAccelerator
     from lightning.fabric.accelerators.mps import MPSAccelerator
     from lightning.fabric.accelerators.xla import XLAAccelerator
+    from lightning.fabric.accelerators.musa import MUSAAccelerator

     if XLAAccelerator.is_available():
         return "tpu"
     if MPSAccelerator.is_available():
         return "mps"
     if CUDAAccelerator.is_available():
         return "cuda"
+    if MUSAAccelerator.is_available():
+        return "musa"
     return "cpu"
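
Finally, a sketch of the extended parsing path (private helpers, shown for illustration only); the return values assume a machine with four visible MUSA devices.

from lightning.fabric.utilities.device_parser import _parse_gpu_ids

print(_parse_gpu_ids(-1, include_musa=True))     # all MUSA devices, e.g. [0, 1, 2, 3]
print(_parse_gpu_ids("0,2", include_musa=True))  # a specific subset -> [0, 2]
print(_parse_gpu_ids(2, include_musa=True))      # the first two devices -> [0, 1]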
