5 files changed (+8, −23)
models/qwen3_moe/layer_infer

@@ -516,8 +516,7 @@ def grouped_matmul(
     if block_size_k != 0:
         # If block-wise quantization is used, the tile size must not exceed the block size
         BLOCK_SIZE_K = min(BLOCK_SIZE_K, block_size_k)
-        BLOCK_SIZE_K = triton.next_power_of_2(BLOCK_SIZE_K // 2 + 1)
-        # assert BLOCK_SIZE_K == triton.next_power_of_2(BLOCK_SIZE_K)
+        assert BLOCK_SIZE_K == triton.next_power_of_2(BLOCK_SIZE_K)
 
     if use_fp8_w8a8:
         # When the weights use block-wise quantization, activations are also quantized per token with group-size granularity
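Aside on the replaced rounding line: a minimal sketch in plain Python, so it runs without Triton (next_power_of_2 here mirrors triton.next_power_of_2; the wrapper name clamp_block_size_k is hypothetical), showing what the old line did and what the new assert demands:

def next_power_of_2(x: int) -> int:
    # Smallest power of two >= x; matches triton.next_power_of_2 for x >= 1.
    return 1 << (x - 1).bit_length()

def clamp_block_size_k(BLOCK_SIZE_K: int, block_size_k: int) -> int:
    # Hypothetical standalone version of the snippet above.
    if block_size_k != 0:
        # A K tile must not straddle a quantization block, so clamp it.
        BLOCK_SIZE_K = min(BLOCK_SIZE_K, block_size_k)
        # Old line: BLOCK_SIZE_K = next_power_of_2(BLOCK_SIZE_K // 2 + 1).
        # That is a no-op for powers of two (128 -> 128) but silently shrinks
        # anything else (96 -> 64). The new assert instead requires the clamped
        # size to already be a power of two, which holds when block_size_k is
        # itself a power of two such as 128.
        assert BLOCK_SIZE_K == next_power_of_2(BLOCK_SIZE_K)
    return BLOCK_SIZE_K

assert clamp_block_size_k(256, 128) == 128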
@@ -42,12 +42,12 @@ def try_to_get_best_config(
     else:
         if M <= expert_num:
             config = {
-                "BLOCK_SIZE_M": 32,
-                "BLOCK_SIZE_N": 128,
-                "BLOCK_SIZE_K": 128,
-                "GROUP_SIZE_M": 32,
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
                 "num_warps": 4,
-                "num_stages": 3,
+                "num_stages": 1,
             }
         else:
             config = {
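Context for the retune: when M (total token rows) is at most expert_num, each expert sees only a handful of rows, so large tiles mostly multiply padding. A hedged sketch of the branch above as a standalone function (the name pick_moe_gemm_config is hypothetical; the values are the new ones from the diff):

def pick_moe_gemm_config(M: int, expert_num: int) -> dict:
    # Hypothetical standalone version of the fallback branch above.
    if M <= expert_num:
        # Few token rows per expert: small tiles, no tile grouping, and one
        # pipeline stage waste less work on padded rows and less shared memory.
        return {
            "BLOCK_SIZE_M": 16,
            "BLOCK_SIZE_N": 32,
            "BLOCK_SIZE_K": 64,
            "GROUP_SIZE_M": 1,
            "num_warps": 4,
            "num_stages": 1,
        }
    # Larger batches take the bigger-tile config, elided by the diff context.
    raise NotImplementedError("see the else branch in try_to_get_best_config")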
@@ -111,7 +111,7 @@ def _moe_ffn_edp(
         ep_output = layer_weight.experts.experts(
             hidden_states,
             router_logits=router_logits,
-            top_k=8,
+            top_k=self.num_experts_per_tok,
             renormalize=self.norm_topk_prob,
             use_grouped_topk=False,
             topk_group=None,
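Why this small change matters: top_k=8 only matches models that route exactly eight experts per token; any other Qwen3-MoE variant would silently compute the wrong mixture. A hypothetical sketch of where self.num_experts_per_tok would come from (field names follow the Hugging Face Qwen-MoE config convention, which the diff's attribute names mirror):

class Qwen3MoeFfnSketch:
    # Hypothetical class; only shows where the two attributes used above
    # would be populated from a Hugging Face style model config.
    def __init__(self, network_config: dict):
        self.num_experts_per_tok = network_config["num_experts_per_tok"]  # e.g. 8
        self.norm_topk_prob = network_config["norm_topk_prob"]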
@@ -26,9 +26,7 @@ def get_unique_server_name():
 def set_cuda_arch(args):
     if not torch.cuda.is_available():
         return
-    from lightllm.utils.sgl_utils import HAS_FLASHINFER
-
-    if HAS_FLASHINFER:
+    if args.enable_flashinfer_prefill or args.enable_flashinfer_decode:
         capability = torch.cuda.get_device_capability()
         arch = f"{capability[0]}.{capability[1]}"
         os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"
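The new gate, assembled as a self-contained sketch (the attribute names enable_flashinfer_prefill / enable_flashinfer_decode are taken from the diff; the argparse wiring that produces args is assumed):

import os

import torch

def set_cuda_arch(args):
    if not torch.cuda.is_available():
        return
    # Pin TORCH_CUDA_ARCH_LIST only when flashinfer is explicitly requested,
    # not merely because the package happens to be importable.
    if args.enable_flashinfer_prefill or args.enable_flashinfer_decode:
        capability = torch.cuda.get_device_capability()
        arch = f"{capability[0]}.{capability[1]}"
        # '+PTX' on 9.0 keeps the build forward compatible on Hopper.
        os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"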
@@ -30,15 +30,3 @@
         "sgl_kernel is not installed, or the installed version did not support fa3. \
         Try to upgrade it."
     )
-
-try:
-    import flashinfer
-    from flashinfer.norm import fused_add_rmsnorm, rmsnorm
-
-    HAS_FLASHINFER = True
-except:
-    HAS_FLASHINFER = False
-    logger.warning(
-        "flashinfer is not installed, you can't use the api of it. \
-        You can solve it by running `pip install flashinfer`."
-    )
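A hedged aside: if an availability flag like the deleted HAS_FLASHINFER were ever needed again elsewhere, importlib can probe the package without importing it at module load time and without a bare except swallowing unrelated errors:

import importlib.util

# True if flashinfer is installed, without importing it.
HAS_FLASHINFER = importlib.util.find_spec("flashinfer") is not None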