Skip to content

Commit d0bb140

Browse files
Hexq0210 and McZyWu authored
[NPU] bugfix for model Qwen3-Coder-Next at weight shape transpose for npu. (sgl-project#18700)
Co-authored-by: McZyWu <zhuoyun.wu.23@ucl.ac.uk>
1 parent a1b39c1 commit d0bb140

File tree

2 files changed

+3
-3
lines changed

2 files changed

+3
-3
lines changed

python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def npu_fused_moe_without_routing_weights_bf16(
118118
# gmm1: gate_up_proj
119119
hidden_states = torch.ops.npu.npu_grouped_matmul(
120120
x=[hidden_states],
121-
weight=[layer.w13_weight.permute(0, 2, 1)],
121+
weight=[layer.w13_weight],
122122
split_item=2,
123123
group_list_type=group_list_type,
124124
group_type=0,
@@ -129,7 +129,7 @@ def npu_fused_moe_without_routing_weights_bf16(
129129
# gmm2: down_proj
130130
hidden_states = torch.ops.npu.npu_grouped_matmul(
131131
x=[hidden_states],
132-
weight=[layer.w2_weight.permute(0, 2, 1)],
132+
weight=[layer.w2_weight],
133133
split_item=2,
134134
group_list_type=group_list_type,
135135
group_type=0,

python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_npu
4747
from sglang.srt.utils.common import rank0_log
4848

49-
if not is_cpu() and not is_npu():
49+
if not is_cpu():
5050
# fix import error on CPU device, no impacts when non-CPU path
5151
try:
5252
from sglang.jit_kernel.cutedsl_gdn import (

0 commit comments

Comments (0)