Commit 62c0544

Add Ascend NPU device support. (#955)
## Summary

This PR is the first step in adapting Liger Kernel to the Ascend NPU: adding NPU device support. For details, refer to [[RFC] Native Ascend NPU Support for Liger Kernel](#954), Section 2.1: **Device Support Integration**.

## Details

Key modifications:

1. Add the installation method and basic function adaptation for NPU.
2. Import directly from `triton.language.math` on NPU to avoid errors caused by interfaces that do not exist there.

## Testing Done

We verified on **Atlas 800T A3**; basic test cases such as `test_softmax` and `test_swiglu` pass. We will continue to improve coverage going forward.

(Screenshot of the passing test run attached to the PR.)

- Hardware Type: <BLANK>
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence
1 parent 7d54ff8 commit 62c0544
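
For readers following the installation step described in the Details above, the sketch below is a minimal smoke check that is not part of this commit. It assumes `torch_npu==2.6.0` and `triton-ascend` are installed as listed in the `setup.py` change, and that torch_npu mirrors the `torch.cuda` device API under `torch.npu`.

```python
# Hedged sketch (not part of this commit): confirm the Ascend NPU is visible
# to PyTorch before running the Liger Kernel NPU tests.
import torch
import torch_npu  # noqa: F401  # importing torch_npu registers the torch.npu namespace

if torch.npu.is_available():
    print(f"Found {torch.npu.device_count()} Ascend NPU device(s)")
    x = torch.randn(4, 8, device="npu")
    # Rows of a softmax should sum to ~1.0; a quick sanity check on the device.
    print(torch.softmax(x, dim=-1).sum(dim=-1))
else:
    print("No Ascend NPU visible; Liger Kernel will fall back to other backends.")
```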

File tree

13 files changed: +84 -10 lines


setup.py

Lines changed: 20 additions & 1 deletion

```diff
@@ -24,6 +24,8 @@ def get_default_dependencies():
         return [
             "torch>=2.6.0",
         ]
+    elif platform == "npu":
+        return ["torch_npu==2.6.0", "triton-ascend"]


 def get_optional_dependencies():
@@ -67,7 +69,21 @@ def is_xpu_available():
     return False


-def get_platform() -> Literal["cuda", "rocm", "cpu", "xpu"]:
+def is_ascend_available() -> bool:
+    """Best-effort Ascend detection.
+
+    Checks for common Ascend environment variables and a possible `npu-smi`
+    utility if present.
+    """
+    try:
+        subprocess.run(["npu-smi", "info"], check=True)
+        return True
+    except (subprocess.SubprocessError, FileNotFoundError):
+        pass
+    return False
+
+
+def get_platform() -> Literal["cuda", "rocm", "cpu", "xpu", "npu"]:
     """
     Detect whether the system has NVIDIA or AMD GPU without torch dependency.
     """
@@ -86,6 +102,9 @@ def get_platform() -> Literal["cuda", "rocm", "cpu", "xpu"]:
     if is_xpu_available():
         print("Intel GPU detected")
         return "xpu"
+    elif is_ascend_available():
+        print("Ascend NPU detected")
+        return "npu"
     else:
         print("No GPU detected")
         return "cpu"
```

src/liger_kernel/ops/cross_entropy.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -10,8 +10,9 @@
 from liger_kernel.ops.utils import element_mul_kernel
 from liger_kernel.ops.utils import is_hip
 from liger_kernel.utils import infer_device
+from liger_kernel.utils import is_npu_available

-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh
```
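
This is the guarded-import pattern the rest of the diff repeats: when `is_npu_available()` is true, the `triton.language.extra.libdevice` path is skipped so that triton-ascend never touches an interface it does not provide. A minimal sketch of the full pattern follows, assuming the NPU case falls through to `triton.language.math` as described in the commit message (the else branch itself is not shown in this hunk).

```python
import operator

from liger_kernel.ops.utils import compare_version
from liger_kernel.utils import is_npu_available

if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
    try:
        # Typical import path when libdevice dispatch is available (CUDA/ROCm/XPU).
        from triton.language.extra.libdevice import tanh
    except ModuleNotFoundError:
        # Older Triton 3.x layouts keep libdevice under the CUDA namespace.
        from triton.language.extra.cuda.libdevice import tanh
else:
    # On Ascend NPU (or Triton < 3.0.0), import the plain math implementation,
    # which triton-ascend does provide.
    from triton.language.math import tanh
```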

src/liger_kernel/ops/dyt.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -7,8 +7,10 @@
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
 from liger_kernel.ops.utils import infer_device
+from liger_kernel.utils import get_npu_multi_processor_count
+from liger_kernel.utils import is_npu_available

-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh
@@ -125,7 +127,8 @@ def liger_dyt_bwd(dy, x, alpha, gamma, beta):
         NUM_SMS = torch.cuda.get_device_properties(x.device).multi_processor_count
     elif device == "xpu":
         NUM_SMS = torch.xpu.get_device_properties(x.device).gpu_subslice_count
-
+    elif device == "npu":
+        NUM_SMS = get_npu_multi_processor_count()
     da = torch.zeros(NUM_SMS, triton.cdiv(N, 512), dtype=torch.float32, device=x.device)
     dg = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device)
     db = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device) if HAVE_BETA else None
```
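
Several backward kernels in this diff size their reduction buffers by the number of streaming multiprocessors (or the closest NPU analogue). The helper `get_npu_multi_processor_count()` lives in `liger_kernel.utils` and its body is not part of this excerpt; the sketch below is only a hypothetical illustration of one way such a helper could be written, assuming torch_npu mirrors the CUDA device-properties API. The attribute name and the fallback constant are assumptions.

```python
import torch


def get_npu_multi_processor_count(device=None) -> int:
    """Hypothetical sketch: report how many compute cores the current NPU exposes."""
    _FALLBACK_CORES = 20  # assumption: conservative default, not a measured value
    try:
        props = torch.npu.get_device_properties(device or torch.npu.current_device())
        # Assumption: torch_npu exposes a CUDA-style multi_processor_count field;
        # fall back to the default if it does not.
        return int(getattr(props, "multi_processor_count", _FALLBACK_CORES))
    except (AttributeError, RuntimeError):
        return _FALLBACK_CORES
```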

src/liger_kernel/ops/fused_add_rms_norm.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -9,8 +9,10 @@
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
 from liger_kernel.ops.utils import torch_to_triton_dtype
+from liger_kernel.utils import get_npu_multi_processor_count
+from liger_kernel.utils import is_npu_available

-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
@@ -293,6 +295,8 @@ def fused_add_rms_norm_backward(dY, dS_out, S, W, RSTD, offset, casting_mode, BL
         sm_count = torch.cuda.get_device_properties(S.device).multi_processor_count
     elif S.device.type == "xpu":
         sm_count = torch.xpu.get_device_properties(S.device).gpu_eu_count
+    elif S.device.type == "npu":
+        sm_count = get_npu_multi_processor_count()

     # fp32 for numerical stability especially.
     _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
```

src/liger_kernel/ops/geglu.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -7,8 +7,9 @@
 from liger_kernel.ops.utils import calculate_settings
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.utils import is_npu_available

-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh
```

src/liger_kernel/ops/group_norm.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -6,8 +6,9 @@

 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.utils import is_npu_available

-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
```

src/liger_kernel/ops/layer_norm.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -8,8 +8,9 @@
 from liger_kernel.ops.utils import calculate_settings
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.utils import is_npu_available

-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
```

src/liger_kernel/ops/poly_norm.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -7,8 +7,10 @@
 from liger_kernel.ops.utils import calculate_settings
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.utils import get_npu_multi_processor_count
+from liger_kernel.utils import is_npu_available

-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         from triton.language.extra.libdevice import rsqrt
     except ModuleNotFoundError:
@@ -290,6 +292,8 @@ def poly_norm_backward(dY, X, W, RSTD, BLOCK_SIZE, num_warps, in_place):
         sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
     elif X.device.type == "xpu":
         sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
+    elif X.device.type == "npu":
+        sm_count = get_npu_multi_processor_count()

     # Allocate or reuse gradients
     if in_place is True:
```

src/liger_kernel/ops/rms_norm.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -21,8 +21,10 @@
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
 from liger_kernel.ops.utils import torch_to_triton_dtype
+from liger_kernel.utils import get_npu_multi_processor_count
+from liger_kernel.utils import is_npu_available

-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
@@ -450,6 +452,8 @@ def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warp
         sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
     elif X.device.type == "xpu":
         sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
+    elif X.device.type == "npu":
+        sm_count = get_npu_multi_processor_count()

     # fp32 for numerical stability especially.
     _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
```

src/liger_kernel/ops/utils.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -78,6 +78,8 @@ def get_amp_custom_fwd_bwd() -> Callable:
             functools.partial(torch.amp.custom_fwd, device_type=device),
             functools.partial(torch.amp.custom_bwd, device_type=device),
         )
+    if hasattr(torch, "npu") and getattr(torch.npu, "amp", None) is not None:
+        return torch.npu.amp.custom_fwd, torch.npu.amp.custom_bwd
     return torch.cuda.amp.custom_fwd, torch.cuda.amp.custom_bwd

```
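
The returned pair is used to decorate custom autograd functions so that autocast casting rules apply on whichever backend is active; on NPU builds the new branch hands back `torch.npu.amp.custom_fwd`/`custom_bwd`. A minimal usage sketch follows; the `IllustrativeScale` function is a toy example, not a kernel from this repository.

```python
import torch

from liger_kernel.ops.utils import get_amp_custom_fwd_bwd

# Pick the autocast decorators that match the detected backend.
amp_custom_fwd, amp_custom_bwd = get_amp_custom_fwd_bwd()


class IllustrativeScale(torch.autograd.Function):
    """Toy example only: scales its input while honouring autocast on any backend."""

    @staticmethod
    @amp_custom_fwd
    def forward(ctx, x: torch.Tensor, scale: float) -> torch.Tensor:
        ctx.scale = scale
        return x * scale

    @staticmethod
    @amp_custom_bwd
    def backward(ctx, grad_output: torch.Tensor):
        # Gradient for x only; the scale argument gets no gradient.
        return grad_output * ctx.scale, None
```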
