@@ -99,20 +99,27 @@ def test_kernel(
     topk_values, topk_ids = torch.topk(rnd_logics, topk, dim=1)
     if num_fused_shared_experts > 0:
         # When fused shared experts are present, the shared experts' ids must be padded into topk_ids
-        pad_topk_ids = torch.arange(
-            start=expert_num - num_fused_shared_experts,
-            end=expert_num,
-            step=1,
-            dtype=topk_ids.dtype,
-            device="cuda").view(1, num_fused_shared_experts).repeat(topk_ids.shape[0], 1)
+        pad_topk_ids = (
+            torch.arange(
+                start=expert_num - num_fused_shared_experts, end=expert_num, step=1, dtype=topk_ids.dtype, device="cuda"
+            )
+            .view(1, num_fused_shared_experts)
+            .repeat(topk_ids.shape[0], 1)
+        )
         topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)
     topk_weights = torch.randn((m, topk + num_fused_shared_experts), device="cuda", dtype=dtype) / 10
 
-    expert_to_tokens = torch.empty((expert_num, (topk + num_fused_shared_experts) * m), dtype=torch.int32, device="cuda")
-    expert_to_weights = torch.empty((expert_num, (topk + num_fused_shared_experts) * m), dtype=torch.float32, device="cuda")
+    expert_to_tokens = torch.empty(
+        (expert_num, (topk + num_fused_shared_experts) * m), dtype=torch.int32, device="cuda"
+    )
+    expert_to_weights = torch.empty(
+        (expert_num, (topk + num_fused_shared_experts) * m), dtype=torch.float32, device="cuda"
+    )
     moe_align(topk_ids=topk_ids, out=expert_to_tokens)
     expert_to_token_num = torch.empty((expert_num,), dtype=torch.int32, device="cuda")
-    moe_align1(expert_to_tokens, topk_weights, expert_to_weights, expert_to_token_num, topk=topk + num_fused_shared_experts)
+    moe_align1(
+        expert_to_tokens, topk_weights, expert_to_weights, expert_to_token_num, topk=topk + num_fused_shared_experts
+    )
 
     out1 = torch.zeros((m * (topk + num_fused_shared_experts), 2 * n), dtype=torch.bfloat16, device="cuda")
     down_in = torch.zeros((m * (topk + num_fused_shared_experts), n), dtype=torch.bfloat16, device="cuda")
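For context, the restyled pad_topk_ids expression is behavior-preserving: the fused shared experts always occupy the last num_fused_shared_experts expert ids, and every token gets those ids appended after its routed top-k ids. Below is a minimal, CPU-only sketch of that padding pattern; the concrete values of expert_num, num_fused_shared_experts, m, and topk are illustrative assumptions, not taken from this diff.

import torch

expert_num = 8                 # assumed total expert count, shared experts included
num_fused_shared_experts = 2   # assumed number of fused shared experts
m, topk = 4, 3                 # assumed token count and routed top-k

# Routed ids only hit the non-shared experts in this sketch.
topk_ids = torch.randint(0, expert_num - num_fused_shared_experts, (m, topk))

# Shared experts live at the tail ids [expert_num - num_fused_shared_experts, expert_num);
# build one row of those ids and repeat it for every token.
pad_topk_ids = (
    torch.arange(expert_num - num_fused_shared_experts, expert_num, dtype=topk_ids.dtype)
    .view(1, num_fused_shared_experts)
    .repeat(topk_ids.shape[0], 1)
)
topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)
print(topk_ids.shape)  # torch.Size([4, 5]), i.e. (m, topk + num_fused_shared_experts)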