Commit dfeb62d

Merge branch 'ModelTC:main' into mineru-adapt
2 parents ccb9d62 + 9f2f0cf commit dfeb62d

37 files changed, +542 -115 lines changed

docker/Dockerfile

Lines changed: 2 additions & 1 deletion
@@ -39,7 +39,8 @@ RUN pip install -r /lightllm/requirements.txt --no-cache-dir
 
 RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 
-RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .
+# TODO: offline compile
+# RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .
 
 RUN apt-get update && apt-get install -y libnuma-dev # for sgl_kernel

docker/Dockerfile.deepep

Lines changed: 2 additions & 1 deletion
@@ -39,7 +39,8 @@ RUN pip install -r /lightllm/requirements.txt --no-cache-dir
 
 RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 
-RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .
+# TODO: offline compile
+# RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .
 
 RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms
 RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev

lightllm/common/basemodel/basemodel.py

Lines changed: 4 additions & 3 deletions
@@ -24,8 +24,9 @@
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.distributed.communication_op import dist_group_manager
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
+from lightllm.common.triton_utils.autotuner import AutotuneLevel
 from lightllm.utils.custom_kernel_utis import pad2dim_tensor_to_new_batch
-from lightllm.utils.envs_utils import set_model_init_status, is_triton_autotune_enabled, disable_triton_autotune
+from lightllm.utils.envs_utils import set_model_init_status, set_triton_autotune_level, get_triton_autotune_level
 from lightllm.utils.infer_utils import post_empty_cache
 
 logger = init_logger(__name__)
@@ -731,7 +732,7 @@ def autotune_layers(self):
     @torch.no_grad()
     @post_empty_cache
     def _autotune_warmup(self):
-        if not is_triton_autotune_enabled():
+        if get_triton_autotune_level() not in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
             return
 
         torch.distributed.barrier()
@@ -794,7 +795,7 @@ def _autotune_warmup(self):
         torch.cuda.empty_cache()
        self.layers_num = layer_num_bak
         torch.distributed.barrier()
-        disable_triton_autotune()
+        set_triton_autotune_level(AutotuneLevel.USE_AUTOTUNE_HIS_CONFIG)
 
     @final
     @torch.no_grad()
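
The warmup gate changes from a boolean (is_triton_autotune_enabled) to a level check, and after warmup the level is switched to USE_AUTOTUNE_HIS_CONFIG, presumably so steady-state inference only replays the configs recorded during warmup. Only the three member names appear in this commit; the sketch below is a hypothetical reading of how AutotuneLevel and the envs_utils helpers could fit together, not the actual lightllm.common.triton_utils.autotuner implementation.

# Hypothetical sketch only: the numeric values, the env variable name, and the
# helper bodies are assumptions; just the three member names come from this diff.
from enum import IntEnum
import os

class AutotuneLevel(IntEnum):
    USE_AUTOTUNE_HIS_CONFIG = 0  # replay previously recorded configs, no tuning
    ADAPTIVE_AUTOTUNE = 1        # tune shapes that have no recorded config during warmup
    FORCE_AUTOTUNE = 2           # re-tune everything during warmup

def get_triton_autotune_level() -> "AutotuneLevel":
    return AutotuneLevel(int(os.getenv("LIGHTLLM_TRITON_AUTOTUNE_LEVEL", "0")))

def set_triton_autotune_level(level: "AutotuneLevel") -> None:
    os.environ["LIGHTLLM_TRITON_AUTOTUNE_LEVEL"] = str(int(level))

def autotune_is_active() -> bool:
    # mirrors the membership test used by _autotune_warmup above
    return get_triton_autotune_level() in (AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE)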

lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py

Lines changed: 3 additions & 2 deletions
@@ -17,7 +17,8 @@
 )
 from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.common.basemodel.triton_kernel.redundancy_topk_ids_repair import redundancy_topk_ids_repair
-from lightllm.utils.envs_utils import is_triton_autotune_enabled
+from lightllm.utils.envs_utils import get_triton_autotune_level
+from lightllm.common.triton_utils.autotuner import AutotuneLevel
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
@@ -358,7 +359,7 @@ def prefilled_group_gemm(
     ######################################## warning ##################################################
     # here is used to match autotune feature, make moe model run same triton kernel in different rank.
     # in some special case, one rank will recv 0 token, so add a token to make it run triton kernel.
-    if is_triton_autotune_enabled():
+    if get_triton_autotune_level() in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
         _gemm_out_a = torch.zeros((1, N), device=device, dtype=hidden_dtype)
         _silu_out = torch.zeros((1, N // 2), device=device, dtype=hidden_dtype)
         silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)
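
Both expert-parallel paths guard against a rank that happens to receive zero tokens: while autotune is active, every rank has to reach the same Triton launch points (the tuner synchronizes across ranks), so a rank with no work still runs the kernels once on a one-token dummy. A standalone sketch of the idea, with an assumed wrapper name and signature:

# Hypothetical wrapper (not lightllm code) showing the zero-token fallback used above:
# run the same kernels on a 1-token dummy so this rank stays in lockstep with ranks
# that did receive tokens while autotune warmup is in progress.
import torch

def maybe_run_dummy_silu(recv_token_num, N, device, dtype, silu_and_mul_fwd, autotune_active):
    if recv_token_num > 0 or not autotune_active:
        return
    gemm_out = torch.zeros((1, N), device=device, dtype=dtype)       # fake up/gate projection output
    silu_out = torch.zeros((1, N // 2), device=device, dtype=dtype)  # fused silu(gate) * up result
    silu_and_mul_fwd(gemm_out.view(-1, N), silu_out)                  # launches the same Triton kernel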

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 36 additions & 22 deletions
@@ -332,6 +332,7 @@ def grouped_matmul_kernel(
     GROUP_SIZE_M: tl.constexpr,
     MUL_ROUTED_WEIGHT: tl.constexpr = False,
     NEED_K_MASK: tl.constexpr = True,
+    NEED_TRANS: tl.constexpr = False,
 ):
     pid = tl.program_id(0)
 
@@ -367,13 +368,6 @@
         mask=token_mask,
         other=0,
     )
-    if MUL_ROUTED_WEIGHT:
-        a_m_scale = tl.load(
-            expert_to_weights_ptr + expert_id * expert_to_weights_stride0 + offs_am,
-            mask=token_mask,
-            other=0.0,
-        )
-
     offs_bn = (tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % n
     offs_k = tl.arange(0, BLOCK_SIZE_K)
 
@@ -387,7 +381,7 @@
             b_scale = tl.load(weight_scale_ptr + expert_id, eviction_policy="evict_last")
             ab_scale = a_scale * b_scale
 
-    if use_fp8_w8a8:
+    if NEED_TRANS:
         a_ptrs = token_ptr + (a_m_index // topk_num)[None, :] * token_stride_0 + offs_k[:, None]
         b_ptrs = weights_ptr + weight_stride_0 * expert_id + offs_k[None, :] + offs_bn[:, None] * weight_stride_1
         accumulator = tl.zeros((BLOCK_SIZE_N, BLOCK_SIZE_M), dtype=tl.float32)
@@ -401,16 +395,20 @@
         # tl.multiple_of(a_ptrs, [16, 16])
         # tl.multiple_of(b_ptrs, [16, 16])
 
-        if use_fp8_w8a8:
+        if NEED_TRANS:
             if NEED_K_MASK:
-                a = tl.load(a_ptrs, mask=(token_mask[None, :]) & (offs_k[:, None] < k), other=0.0)
+                a = tl.load(
+                    a_ptrs, mask=(token_mask[None, :]) & (offs_k[:, None] < k - step_k * BLOCK_SIZE_K), other=0.0
+                )
                 b = tl.load(b_ptrs, mask=(offs_k[None, :] < k), other=0.0)
             else:
                 a = tl.load(a_ptrs, mask=(token_mask[None, :]), other=0.0)
                 b = tl.load(b_ptrs)
         else:
             if NEED_K_MASK:
-                a = tl.load(a_ptrs, mask=(token_mask[:, None]) & (offs_k[None, :] < k), other=0.0)
+                a = tl.load(
+                    a_ptrs, mask=(token_mask[:, None]) & (offs_k[None, :] < k - step_k * BLOCK_SIZE_K), other=0.0
+                )
                 b = tl.load(b_ptrs, mask=(offs_k[:, None] < k), other=0.0)
             else:
                 a = tl.load(a_ptrs, mask=(token_mask[:, None]), other=0.0)
@@ -421,24 +419,34 @@
                 offs_ks = step_k * BLOCK_SIZE_K // block_size_k
                 a_scale = tl.load(a_scale_ptrs + offs_ks, mask=token_mask, other=0.0)
                 b_scale = tl.load(b_scale_ptrs + offs_ks * weight_scale_stride2)
-                accumulator += tl.dot(b, a) * b_scale[:, None] * a_scale[None, :]
+                if NEED_TRANS:
+                    accumulator += tl.dot(b, a) * b_scale[:, None] * a_scale[None, :]
+                else:
+                    accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
             else:
-                accumulator = tl.dot(b, a, acc=accumulator)
+                if NEED_TRANS:
+                    accumulator = tl.dot(b, a, acc=accumulator)
+                else:
+                    accumulator = tl.dot(a, b, acc=accumulator)
         else:
             accumulator += tl.dot(a, b)
 
         a_ptrs += BLOCK_SIZE_K
         b_ptrs += BLOCK_SIZE_K
-        offs_k += BLOCK_SIZE_K
+
+    if NEED_TRANS:
+        accumulator = accumulator.T
 
     if use_fp8_w8a8:
-        if block_size_k > 0 and block_size_n > 0:
-            accumulator = accumulator.T
-        else:
-            accumulator = accumulator.T
+        if not (block_size_k > 0 and block_size_n > 0):
             accumulator *= ab_scale
 
     if MUL_ROUTED_WEIGHT:
+        a_m_scale = tl.load(
+            expert_to_weights_ptr + expert_id * expert_to_weights_stride0 + offs_am,
+            mask=token_mask,
+            other=0.0,
+        )
         accumulator *= a_m_scale[:, None]
 
     c = accumulator.to(compute_type)
@@ -478,13 +486,15 @@ def _get_grouped_matmul_configs():
             "GROUP_SIZE_M": gm,
             "num_warps": nw,
             "num_stages": ns,
+            "NEED_TRANS": need_trans,
         }
-        for ns in [1, 2, 3, 4, 5]
-        for gm in [1, 2, 4, 8]
-        for nw in [2, 4, 8]
+        for ns in [2, 3, 4, 5]
+        for gm in [1, 16, 32, 64]
+        for nw in [4, 8]
         for bm in [16, 32, 64, 128]
         for bn in [16, 32, 64, 128]
-        for bk in [16, 32, 64, 128]
+        for bk in [32, 64, 128]
+        for need_trans in [True, False]
     ]
 
 
@@ -559,6 +569,9 @@ def grouped_matmul(
     GROUP_SIZE_M = run_config["GROUP_SIZE_M"]
    num_warps = run_config["num_warps"]
     num_stages = run_config["num_stages"]
+    NEED_TRANS = run_config.get("NEED_TRANS", False)
+    if not use_fp8_w8a8:
+        assert NEED_TRANS is False, "only use_fp8_w8a8 mode can use NEED_TRANS to accelerate"
 
     if block_size_k != 0:
         # if block-wise quantization is used, the tile size must not exceed the block size
@@ -638,6 +651,7 @@ def grouped_matmul(
         GROUP_SIZE_M=GROUP_SIZE_M,
         MUL_ROUTED_WEIGHT=mul_routed_weight,
         NEED_K_MASK=NEED_K_MASK,
+        NEED_TRANS=NEED_TRANS,
         num_warps=num_warps,
         num_stages=num_stages,
     )
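
The new NEED_TRANS constexpr turns the transposed accumulation path (load the activation tile as K x M, the weight tile as N x K, accumulate dot(b, a) into an N x M accumulator, and transpose once after the K loop) into a choice the autotuner searches, instead of something implied by use_fp8_w8a8; the assert in grouped_matmul keeps it fp8-only. The rewrite rests on the identity A·B = (Bᵀ·Aᵀ)ᵀ. Below is a small standalone torch check of that equivalence with made-up tile shapes, not lightllm code:

# Standalone check (assumed shapes) that the NEED_TRANS=True accumulation order
# produces the same tile as the direct order used when NEED_TRANS=False.
import torch

M, K, N = 16, 64, 32
a = torch.randn(M, K, dtype=torch.float32)  # activation tile: tokens x K
b = torch.randn(K, N, dtype=torch.float32)  # weight tile: K x N

direct = a @ b          # NEED_TRANS=False: dot(a, b) -> (M, N)
trans = (b.T @ a.T).T   # NEED_TRANS=True: dot(b_nk, a_km) -> (N, M), transposed after the loop

assert torch.allclose(direct, trans, atol=1e-5)

Which operand order runs faster presumably depends on how the fp8 tiles and per-block scales land in registers for a given shape, which is why it is exposed as a tunable config entry rather than hard-coded.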

lightllm/common/fused_moe/grouped_fused_moe_ep.py

Lines changed: 3 additions & 2 deletions
@@ -14,7 +14,8 @@
 )
 from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
-from lightllm.utils.envs_utils import is_triton_autotune_enabled
+from lightllm.utils.envs_utils import get_triton_autotune_level
+from lightllm.common.triton_utils.autotuner import AutotuneLevel
 import numpy as np
 
 logger = init_logger(__name__)
@@ -191,7 +192,7 @@ def fused_experts_impl(
     ######################################## warning ##################################################
     # here is used to match autotune feature, make moe model run same triton kernel in different rank.
     # in some special case, one rank will recv 0 token, so add a token to make it run triton kernel.
-    if is_triton_autotune_enabled():
+    if get_triton_autotune_level() in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
         _gemm_out_a = torch.zeros((1, N), device=hidden_states.device, dtype=hidden_states.dtype)
         _silu_out = torch.zeros((1, N // 2), device=hidden_states.device, dtype=hidden_states.dtype)
         silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)

lightllm/common/fused_moe/grouped_topk.py

Lines changed: 8 additions & 2 deletions
@@ -140,6 +140,7 @@ def grouped_topk_kernel(
     offs_group = tl.arange(0, EXPERT_GROUP_NUM)
     offs_group_v = tl.arange(0, EXPERT_GROUP_SIZE)
     tl.store(scores_buffer_ptr + scores_stride_m * token_index + offs_n, scores, mask=offs_n < total_expert_num)
+    tl.debug_barrier()
     group_scores = tl.load(
         scores_buffer_ptr
         + scores_stride_token_m * token_index
@@ -174,7 +175,7 @@
         mask_group_scores,
         mask=((offs_group < group_num)[:, None]) & ((offs_group_v < group_expert_num)[None, :]),
     ) # [group, group_size]
-
+    tl.debug_barrier()
     mask_scores = tl.load(
         scores_buffer_ptr + scores_stride_m * token_index + offs_n, mask=offs_n < total_expert_num, other=-10000000.0
     )
@@ -227,6 +228,11 @@ def triton_grouped_topk(
 
     assert total_expert_num % num_expert_group == 0
 
+    if token_num <= 256:
+        num_warps = 4
+    else:
+        num_warps = 1
+
     grouped_topk_kernel[(token_num,)](
         gating_output,
         *gating_output.stride(),
@@ -250,7 +256,7 @@
         EXPERT_GROUP_SIZE=triton.next_power_of_2(total_expert_num // num_expert_group),
         RENORMALIZE=renormalize,
         GROUP_SCORE_USED_TOPK_NUM=group_score_used_topk_num,
-        num_warps=1,
+        num_warps=num_warps,
         num_stages=1,
     )
     return out_topk_weights, out_topk_ids
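
The two tl.debug_barrier() calls look like what makes the num_warps bump safe: with more than one warp, the warps that store slices of the scores buffer are not necessarily the ones that later reload the full row, so the stores have to become visible before the subsequent loads. A toy kernel (not lightllm code, made-up names and shapes) showing the store, barrier, reload pattern:

# Toy example of the store -> tl.debug_barrier() -> load pattern; names and shapes
# are invented, only the barrier usage mirrors the kernel above.
import torch
import triton
import triton.language as tl

@triton.jit
def _roundtrip_kernel(x_ptr, scratch_ptr, out_ptr, N: tl.constexpr):
    offs = tl.arange(0, N)
    x = tl.load(x_ptr + offs)
    tl.store(scratch_ptr + offs, x * 2.0)
    tl.debug_barrier()  # make the stores visible to every warp before re-reading
    y = tl.load(scratch_ptr + offs)
    tl.store(out_ptr + offs, y + 1.0)

x = torch.arange(128, dtype=torch.float32, device="cuda")
scratch = torch.empty_like(x)
out = torch.empty_like(x)
_roundtrip_kernel[(1,)](x, scratch, out, N=128, num_warps=4)
assert torch.allclose(out, x * 2.0 + 1.0)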

lightllm/common/fused_moe/moe_kernel_configs.py

Lines changed: 2 additions & 0 deletions
@@ -46,6 +46,7 @@ def try_to_get_best_config(
             "BLOCK_SIZE_N": 32,
             "BLOCK_SIZE_K": 64,
             "GROUP_SIZE_M": 1,
+            "NEED_TRANS": False,
             "num_warps": 4,
             "num_stages": 1,
         }
@@ -55,6 +56,7 @@
             "BLOCK_SIZE_N": 64,
             "BLOCK_SIZE_K": 32,
             "GROUP_SIZE_M": 8,
+            "NEED_TRANS": False,
             "num_warps": 4,
             "num_stages": 1,
         }

lightllm/common/fused_moe/moe_silu_and_mul.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def _get_silu_and_mul_configs():
         {"BLOCK_M": bm, "BLOCK_N": bn, "num_warps": nw, "NUM_STAGES": ns}
         for ns in [1, 2, 4]
         for nw in [1, 4, 8]
-        for bm in [32, 64, 128, 256]
+        for bm in [1, 8, 32, 64, 128, 256]
         for bn in [32, 64, 128, 256]
     ]

lightllm/common/fused_moe/topk_select.py

Lines changed: 10 additions & 0 deletions
@@ -23,6 +23,8 @@
 from lightllm.utils.light_utils import light_ops
 from typing import Callable, List, Optional, Tuple
 from lightllm.common.fused_moe.softmax_topk import softmax_topk
+from lightllm.common.triton_utils.autotuner import AutotuneLevel
+from lightllm.utils.envs_utils import get_triton_autotune_level
 
 use_cuda_grouped_topk = os.getenv("LIGHTLLM_CUDA_GROUPED_TOPK", "False").upper() in ["ON", "TRUE", "1"]
 
@@ -221,4 +223,12 @@ def select_experts(
             hidden_states=hidden_states, gating_output=router_logits, topk=top_k, renormalize=renormalize
         )
 
+    ######################################## warning ##################################################
+    # here is used to match autotune feature, make topk_ids more random
+    if get_triton_autotune_level() in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
+        rand_gen = torch.Generator(device="cuda")
+        rand_gen.manual_seed(router_logits.shape[0])
+        router_logits = torch.randn(size=router_logits.shape, generator=rand_gen, dtype=torch.float32, device="cuda")
+        _, topk_ids = torch.topk(router_logits, k=top_k, dim=1)
+
     return topk_weights, topk_ids
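
The synthetic logits above exist so that autotune warmup sees expert assignments that are well spread out and, because the generator is seeded with the token count alone, identical on every rank. A standalone sketch of that determinism property, with a hypothetical helper name:

# Hypothetical helper mirroring the seeding scheme above: two calls with the same
# token count draw identical logits, so every rank picks the same topk_ids
# during autotune warmup.
import torch

def synthetic_topk_ids(token_num: int, num_experts: int, top_k: int) -> torch.Tensor:
    gen = torch.Generator(device="cuda")
    gen.manual_seed(token_num)  # seed depends only on the batch's token count
    logits = torch.randn((token_num, num_experts), generator=gen, dtype=torch.float32, device="cuda")
    return torch.topk(logits, k=top_k, dim=1).indices

assert torch.equal(synthetic_topk_ids(64, 160, 6), synthetic_topk_ids(64, 160, 6))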