Tune the grouped-matmul autotuning search space

shihaobai · shihaobai · commit fc7f21f42b1a · 2025-09-02T14:03:36.000+08:00
diff --git a/lightllm/common/fused_moe/grouped_fused_moe.py b/lightllm/common/fused_moe/grouped_fused_moe.py
@@ -343,11 +343,13 @@ def grouped_matmul_kernel(
     group_id = pid // num_pid_in_group
     first_pid_m = group_id * GROUP_SIZE_M
     group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    in_group_index = pid % num_pid_in_group
-    back_mark = (in_group_index // group_size_m) % 2
-    back_mark1 = -1 * (2 * back_mark - 1)
-    pid_m = first_pid_m + back_mark * (group_size_m - 1) + back_mark1 * (in_group_index % group_size_m)
+    pid_m = first_pid_m + pid % num_pid_in_group % group_size_m
     pid_n = (pid % num_pid_in_group) // group_size_m
+    # in_group_index = pid % num_pid_in_group
+    # back_mark = (in_group_index // group_size_m) % 2
+    # back_mark1 = -1 * (2 * back_mark - 1)
+    # pid_m = first_pid_m + back_mark * (group_size_m - 1) + back_mark1 * (in_group_index % group_size_m)
+    # pid_n = (pid % num_pid_in_group) // group_size_m
 
     expert_id = tl.load(mblocks_to_expert_id + pid_m)
 
@@ -488,12 +490,12 @@ def _get_grouped_matmul_configs():
             "num_stages": ns,
             "NEED_TRANS": need_trans,
         }
-        for ns in [1, 2, 3, 4, 5]
-        for gm in [1, 2, 4, 8]
-        for nw in [2, 4, 8]
+        for ns in [2, 3, 4, 5]
+        for gm in [1, 16, 32, 64]
+        for nw in [4, 8]
         for bm in [16, 32, 64, 128]
         for bn in [16, 32, 64, 128]
-        for bk in [16, 32, 64, 128]
+        for bk in [32, 64, 128]
         for need_trans in [True, False]
     ]
 
diff --git a/lightllm/common/triton_utils/autotuner.py b/lightllm/common/triton_utils/autotuner.py
@@ -192,7 +192,7 @@ def _try_load_cache(self, static_key):
                 self.cached_configs[static_key] = orjson.loads(f.read())
         return
 
-    def _bench(self, *args, n_repeat=3, n_retries=1, **kwargs):
+    def _bench(self, *args, n_repeat=3, n_retries=5, **kwargs):
         from triton.compiler.errors import CompileTimeAssertionFailure
         from triton.runtime.errors import OutOfResources, PTXASError