Skip to content

Commit bb88671

Browse files
authored
Unify NPU vector core count helpers (#1052)
## Summary Both `get_npu_core_count` and `get_npu_multi_processor_count` currently serve the same purpose: retrieving the number of NPU vector cores. In addition, `get_npu_multi_processor_count` requires torch_npu >= v7.2.0. This PR keeps a single implementation (`get_npu_core_count`) for consistency. In the future, we may further simplify this logic by replacing the helper with `torch.npu.get_device_properties()`, aligning the NPU with other backends. ## Testing Done All affected operators were tested with `pytest`. Hardware Type: Ascend 910B4 - [x] run `make test` to ensure correctness - [x] run `make checkstyle` to ensure code style - [ ] run `make test-convergence` to ensure convergence
1 parent a5be02e commit bb88671

File tree

6 files changed

+10
-21
lines changed

6 files changed

+10
-21
lines changed

src/liger_kernel/ops/dyt.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77
from liger_kernel.ops.utils import compare_version
88
from liger_kernel.ops.utils import ensure_contiguous
9+
from liger_kernel.ops.utils import get_npu_core_count
910
from liger_kernel.ops.utils import infer_device
10-
from liger_kernel.utils import get_npu_multi_processor_count
1111
from liger_kernel.utils import is_npu_available
1212

1313
if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
@@ -128,7 +128,7 @@ def liger_dyt_bwd(dy, x, alpha, gamma, beta):
128128
elif device == "xpu":
129129
NUM_SMS = torch.xpu.get_device_properties(x.device).gpu_subslice_count
130130
elif device == "npu":
131-
NUM_SMS = get_npu_multi_processor_count()
131+
NUM_SMS = get_npu_core_count()
132132
da = torch.zeros(NUM_SMS, triton.cdiv(N, 512), dtype=torch.float32, device=x.device)
133133
dg = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device)
134134
db = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device) if HAVE_BETA else None

src/liger_kernel/ops/fused_add_rms_norm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
from liger_kernel.ops.utils import calculate_settings
99
from liger_kernel.ops.utils import compare_version
1010
from liger_kernel.ops.utils import ensure_contiguous
11+
from liger_kernel.ops.utils import get_npu_core_count
1112
from liger_kernel.ops.utils import set_large_grf_mode
1213
from liger_kernel.ops.utils import torch_to_triton_dtype
13-
from liger_kernel.utils import get_npu_multi_processor_count
1414
from liger_kernel.utils import is_npu_available
1515

1616
if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
@@ -290,7 +290,7 @@ def fused_add_rms_norm_backward(dY, dS_out, S, W, RSTD, offset, casting_mode, BL
290290
elif S.device.type == "xpu":
291291
sm_count = torch.xpu.get_device_properties(S.device).gpu_eu_count
292292
elif S.device.type == "npu":
293-
sm_count = get_npu_multi_processor_count()
293+
sm_count = get_npu_core_count()
294294

295295
# fp32 for numerical stability especially.
296296
_dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)

src/liger_kernel/ops/layer_norm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
from liger_kernel.ops.utils import calculate_settings
99
from liger_kernel.ops.utils import compare_version
1010
from liger_kernel.ops.utils import ensure_contiguous
11+
from liger_kernel.ops.utils import get_npu_core_count
1112
from liger_kernel.ops.utils import set_large_grf_mode
12-
from liger_kernel.utils import get_npu_multi_processor_count
1313
from liger_kernel.utils import is_npu_available
1414

1515
if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
@@ -251,7 +251,7 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
251251
elif X.device.type == "xpu":
252252
sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
253253
elif X.device.type == "npu":
254-
sm_count = get_npu_multi_processor_count()
254+
sm_count = get_npu_core_count()
255255

256256
# fp32 for numerical stability especially.
257257
_DW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)

src/liger_kernel/ops/poly_norm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
from liger_kernel.ops.utils import calculate_settings
88
from liger_kernel.ops.utils import compare_version
99
from liger_kernel.ops.utils import ensure_contiguous
10+
from liger_kernel.ops.utils import get_npu_core_count
1011
from liger_kernel.ops.utils import set_large_grf_mode
11-
from liger_kernel.utils import get_npu_multi_processor_count
1212
from liger_kernel.utils import is_npu_available
1313

1414
if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
@@ -287,7 +287,7 @@ def poly_norm_backward(dY, X, W, RSTD, BLOCK_SIZE, num_warps, in_place):
287287
elif X.device.type == "xpu":
288288
sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
289289
elif X.device.type == "npu":
290-
sm_count = get_npu_multi_processor_count()
290+
sm_count = get_npu_core_count()
291291

292292
# Allocate or reuse gradients
293293
if in_place is True:

src/liger_kernel/ops/rms_norm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@
2020
from liger_kernel.ops.utils import calculate_settings
2121
from liger_kernel.ops.utils import compare_version
2222
from liger_kernel.ops.utils import ensure_contiguous
23+
from liger_kernel.ops.utils import get_npu_core_count
2324
from liger_kernel.ops.utils import set_large_grf_mode
2425
from liger_kernel.ops.utils import torch_to_triton_dtype
25-
from liger_kernel.utils import get_npu_multi_processor_count
2626
from liger_kernel.utils import is_npu_available
2727

2828
if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
@@ -494,7 +494,7 @@ def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warp
494494
elif X.device.type == "xpu":
495495
sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
496496
elif X.device.type == "npu":
497-
sm_count = get_npu_multi_processor_count()
497+
sm_count = get_npu_core_count()
498498

499499
if W is not None:
500500
# fp32 for numerical stability especially.

src/liger_kernel/utils.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,17 +65,6 @@ def is_npu_available() -> bool:
6565
return False
6666

6767

68-
def get_npu_multi_processor_count() -> int:
69-
"""Return a heuristic multi-processor count for NPU."""
70-
if is_npu_available():
71-
NPU_MULTI_PROCESSOR_COUNT = 48
72-
dev_props = torch.npu.get_device_properties()
73-
# The vector_core_num attribute is supported in the torch.npu v7.2.0 release version.
74-
return dev_props.vector_core_num if hasattr(dev_props, "vector_core_num") else NPU_MULTI_PROCESSOR_COUNT
75-
# Reasonable default to avoid division by zero
76-
return 1
77-
78-
7968
def transformers_version_dispatch(
8069
required_version: str,
8170
before_fn,

0 commit comments

Comments
 (0)