
Commit 44c2c31

zheliuyu and lancerts authored
[NPU]: Adjust MAX_FUSED_SIZE when using fused_linear_cross_entropy (#985)
## Summary
Adjust MAX_FUSED_SIZE to avoid unified buffer (UB) overflow when using fused_linear_cross_entropy on NPU.

## Testing Done
- Hardware Type: Ascend NPU A2
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence

### Compare
`pytest test/transformers/test_fused_linear_cross_entropy.py`

```
Original code: 105 passed, 16 failed. All failures were due to UB overflow.
Adjusted: 121 passed
```

Co-authored-by: Shao Tang <[email protected]>
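For context on why the cap matters (not part of the diff): in a Triton cross-entropy kernel, the block size along the vocabulary dimension is typically derived from MAX_FUSED_SIZE. Below is a minimal sketch; the helper name `pick_block_size` and the example vocab size are illustrative, not taken from the repository, and only the 2048 and 65536 // 2 caps come from this patch:

```python
import triton

def pick_block_size(V: int, max_fused_size: int) -> int:
    # Triton block sizes must be powers of two: round the vocab
    # dimension V up, then cap it so one program's tile fits in the
    # device's fast on-chip memory (the unified buffer on Ascend NPUs).
    return min(max_fused_size, triton.next_power_of_2(V))

# Illustrative vocab size; the caps are the ones this patch selects.
assert pick_block_size(151936, max_fused_size=2048) == 2048         # npu
assert pick_block_size(151936, max_fused_size=65536 // 2) == 32768  # default
```

A 2048-wide tile loads far less data per program than a 32768-wide one, which is what avoids the UB overflow seen in the failing tests.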
1 parent 6c2565b commit 44c2c31

File tree

2 files changed: +9 −2 lines changed


src/liger_kernel/ops/cross_entropy.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -289,7 +289,13 @@ def liger_cross_entropy_kernel(
 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
 # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
 # The optimal maximum block size depends on your hardware, your kernel, and your dtype
-MAX_FUSED_SIZE = 4096 if infer_device() == "xpu" else 65536 // 2  # the best size we found by manually tuning
+# the best size we found by manually tuning on xpu and npu.
+if infer_device() == "xpu":
+    MAX_FUSED_SIZE = 4096
+elif infer_device() == "npu":
+    MAX_FUSED_SIZE = 2048
+else:
+    MAX_FUSED_SIZE = 65536 // 2


 def cross_entropy_forward(
```
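Both hunks dispatch on `infer_device()` from `liger_kernel.utils`. A hedged sketch of what such a helper typically looks like; the repository's actual implementation may differ:

```python
import torch

def infer_device() -> str:
    # Sketch only: probe accelerator backends in a fixed order.
    if torch.cuda.is_available():
        return "cuda"  # NVIDIA CUDA (ROCm builds also report as cuda)
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"   # Intel GPUs
    if hasattr(torch, "npu") and torch.npu.is_available():
        return "npu"   # Ascend NPUs via the torch_npu plugin
    return "cpu"
```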

src/liger_kernel/ops/fused_linear_cross_entropy.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -6,11 +6,12 @@
 from liger_kernel.ops.utils import amp_custom_fwd
 from liger_kernel.ops.utils import element_mul_kernel
 from liger_kernel.ops.utils import is_hip
+from liger_kernel.utils import infer_device

 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
 # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
 # The optimal maximum block size depends on your hardware, your kernel, and your dtype
-MAX_FUSED_SIZE = 65536 // 2
+MAX_FUSED_SIZE = 2048 if infer_device() == "npu" else 65536 // 2


 def fused_linear_cross_entropy_forward(
```
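One design note on the patched one-liner: MAX_FUSED_SIZE is a module-level constant, so the device is inferred once at import time rather than per call. A tiny illustrative check mirroring the patched line (the printed values are expectations, not captured output):

```python
from liger_kernel.utils import infer_device

# Mirrors the patched module-level constant: evaluated once on import.
MAX_FUSED_SIZE = 2048 if infer_device() == "npu" else 65536 // 2

print(infer_device(), MAX_FUSED_SIZE)
# expected: "npu 2048" on Ascend A2, "cuda 32768" on NVIDIA GPUs
```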
