
Commit f535684

wangzaijun committed: add kernel tuning setting config.

1 parent d9ddd46

File tree

5 files changed: 134 additions & 16 deletions

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 18 additions & 16 deletions
@@ -30,6 +30,7 @@
     get_device_sm_shared_mem_num,
     get_device_warp_size,
 )
+from .moe_kernel_configs import MoeGroupedGemmKernelConfig
 
 FFN_MOE_CHUNK_SIZE = 8 * 1024
 
@@ -365,16 +366,25 @@ def grouped_matmul(
     out is tensor shape [token_num * topk_num, out_dim]
     """
     compute_type = tl.bfloat16 if out.dtype == torch.bfloat16 else tl.float16
+    expert_num, n, k = expert_weights.shape
+    assert token_inputs.shape[1] == k
+    assert expert_to_token_index.shape == expert_to_weights.shape
+    assert token_inputs.is_contiguous()
+    assert expert_to_token_num.is_contiguous()
+    assert expert_to_weights.is_contiguous()
+    assert expert_weights.is_contiguous()
 
     if not run_config:
-        run_config = {
-            "BLOCK_SIZE_M": 64,
-            "BLOCK_SIZE_N": 64,
-            "BLOCK_SIZE_K": 32,
-            "GROUP_SIZE_M": 1,
-            "num_warps": 4,
-            "num_stages": 3,
-        }
+        run_config = MoeGroupedGemmKernelConfig.try_to_get_best_config(
+            M=token_inputs.shape[0],
+            N=n,
+            K=k,
+            topk_num=topk_num,
+            expert_num=expert_num,
+            mul_routed_weight=mul_routed_weight,
+            use_fp8_w8a8=use_fp8_w8a8,
+            out_dtype=str(out.dtype),
+        )
     BLOCK_SIZE_M = run_config["BLOCK_SIZE_M"]
     BLOCK_SIZE_N = run_config["BLOCK_SIZE_N"]
     BLOCK_SIZE_K = run_config["BLOCK_SIZE_K"]
@@ -385,14 +395,6 @@ def grouped_matmul(
     if use_fp8_w8a8:
         token_inputs, token_input_scale = ops.scaled_fp8_quant(token_inputs, token_input_scale)
 
-    expert_num, n, k = expert_weights.shape
-    assert token_inputs.shape[1] == k
-    assert expert_to_token_index.shape == expert_to_weights.shape
-    assert token_inputs.is_contiguous()
-    assert expert_to_token_num.is_contiguous()
-    assert expert_to_weights.is_contiguous()
-    assert expert_weights.is_contiguous()
-
     kernel = grouped_matmul_kernel.warmup(
         expert_token_limit,
         k,
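
With this change, grouped_matmul no longer hard-codes its Triton launch parameters: when the caller passes no run_config, it asks the new MoeGroupedGemmKernelConfig class for the best known config for the current problem shape. A minimal sketch of the lookup, with hypothetical MoE shapes (not values from any shipped config file):

from lightllm.common.fused_moe.moe_kernel_configs import MoeGroupedGemmKernelConfig

run_config = MoeGroupedGemmKernelConfig.try_to_get_best_config(
    M=4096,  # rows of token_inputs, i.e. token_num * topk_num
    N=768,
    K=2048,
    topk_num=2,
    expert_num=8,
    mul_routed_weight=True,
    use_fp8_w8a8=False,
    out_dtype="torch.bfloat16",
)
# With no tuned file on disk and M > expert_num, this falls back to:
# {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32,
#  "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 1}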
lightllm/common/fused_moe/moe_kernel_configs.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+import os
+from frozendict import frozendict
+from functools import lru_cache
+from lightllm.common.kernel_config import KernelConfigs
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+
+class MoeGroupedGemmKernelConfig(KernelConfigs):
+    @classmethod
+    @lru_cache(maxsize=200)
+    def try_to_get_best_config(
+        cls,
+        M: int,
+        N: int,
+        K: int,
+        topk_num: int,
+        expert_num: int,
+        mul_routed_weight: bool,
+        use_fp8_w8a8: bool,
+        out_dtype: str,
+    ) -> dict:
+        key_params = {
+            "N": N,
+            "K": K,
+            "topk_num": topk_num,
+            "expert_num": expert_num,
+            "mul_routed_weight": mul_routed_weight,
+            "use_fp8_w8a8": use_fp8_w8a8,
+            "out_dtype": out_dtype,
+        }
+        key_params = frozendict(key_params)
+
+        finded_config = KernelConfigs.get_the_config(key_params, os.path.dirname(os.path.realpath(__file__)))
+
+        if finded_config:
+            config = finded_config[min(finded_config.keys(), key=lambda x: abs(x - M))]
+            return config
+        else:
+            if M <= expert_num:
+                config = {
+                    "BLOCK_SIZE_M": 16,
+                    "BLOCK_SIZE_N": 32,
+                    "BLOCK_SIZE_K": 64,
+                    "GROUP_SIZE_M": 1,
+                    "num_warps": 4,
+                    "num_stages": 1,
+                }
+            else:
+                config = {
+                    "BLOCK_SIZE_M": 64,
+                    "BLOCK_SIZE_N": 64,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                    "num_warps": 4,
+                    "num_stages": 1,
+                }
+            return config
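
When a tuned file is found, the lookup keys it by the M each entry was tuned at and picks the stored M closest to the runtime one; otherwise it falls back to one of the two defaults above. A small sketch of the nearest-M selection, with a made-up table:

finded_config = {  # hypothetical tuned table: {tuned_M: kernel_config}
    1: {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 1},
    128: {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 1},
}
M = 100
best_m = min(finded_config.keys(), key=lambda x: abs(x - M))  # 128 is nearer to 100 than 1
config = finded_config[best_m]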

lightllm/common/kernel_config.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+import os
+import json
+import re
+from abc import ABC, abstractmethod
+from typing import Dict, Any, Optional
+from functools import lru_cache
+from lightllm.utils.log_utils import init_logger
+from lightllm.utils.device_utils import get_current_device_name
+
+logger = init_logger(__name__)
+
+
+class KernelConfigs(ABC):
+    @classmethod
+    def get_config_file_name(cls, params: Dict[str, Any]) -> str:
+        json_str = json.dumps(params, sort_keys=True)
+        json_str = json_str.replace(" ", "").replace("\n", "").replace('"', "")
+        filename = json_str
+        device_name = get_current_device_name().replace(" ", "_")
+        return f"{filename}_{device_name}.json"
+
+    @staticmethod
+    @lru_cache(maxsize=None)
+    def get_the_config(params: Dict[str, Any], config_dir_path) -> Optional[dict]:
+        json_file_name = KernelConfigs.get_config_file_name(params)
+        config_file_path = os.path.join(config_dir_path, "configs", json_file_name)
+
+        if os.path.exists(config_file_path):
+            # json.load takes a file object, and JSON object keys come back as
+            # strings; convert them to the int M values the nearest-M lookup expects.
+            with open(config_file_path) as file:
+                return {int(m): cfg for m, cfg in json.load(file).items()}
+        else:
+            logger.warning(f"can not find config file {config_file_path}")
+            return None
+
+    @classmethod
+    def store_config(cls, params: Dict[str, Any], config_dir_path: str, dest_json: dict):
+        json_file_name = KernelConfigs.get_config_file_name(params)
+        config_file_path = os.path.join(config_dir_path, "configs", json_file_name)
+        with open(config_file_path, mode="w") as file:
+            json.dump(dest_json, file)
+        return
+
+    @classmethod
+    @abstractmethod
+    def try_to_get_best_config(cls, *args, **kwargs) -> dict:
+        pass
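
KernelConfigs handles persistence: the file name encodes every shape-invariant key parameter plus the GPU model, so there is one tuning file per shape-and-device pair, and the file body maps tuned M values to kernel configs. A sketch of the naming, assuming hypothetical params and an NVIDIA H800 device (the exact string is the sorted json.dumps output with spaces and quotes stripped):

from frozendict import frozendict

params = frozendict({"N": 768, "K": 2048, "expert_num": 8, "topk_num": 2,
                     "mul_routed_weight": True, "use_fp8_w8a8": False,
                     "out_dtype": "torch.bfloat16"})
# KernelConfigs.get_config_file_name(params) would yield roughly:
# {K:2048,N:768,expert_num:8,mul_routed_weight:true,out_dtype:torch.bfloat16,topk_num:2,use_fp8_w8a8:false}_NVIDIA_H800.json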

lightllm/utils/device_utils.py

Lines changed: 12 additions & 0 deletions
@@ -35,3 +35,15 @@ def get_device_warp_size():
 
     properties = driver.active.utils.get_device_properties(0)
     return properties["warpSize"]
+
+
+@lru_cache(maxsize=None)
+def get_current_device_name():
+    import torch
+
+    if torch.cuda.is_available():
+        device = torch.cuda.current_device()
+        gpu_name = torch.cuda.get_device_name(device)
+        return gpu_name
+    else:
+        return None
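
get_current_device_name reports the CUDA device's marketing name, which KernelConfigs embeds (spaces replaced with underscores) in config file names so tuned settings never cross GPU models. An illustrative run, assuming a consumer card:

name = get_current_device_name()  # e.g. "NVIDIA GeForce RTX 4090"
suffix = name.replace(" ", "_")   # "NVIDIA_GeForce_RTX_4090"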

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -81,3 +81,4 @@ prometheus_client==0.20.0
 outlines==0.0.46
 cchardet==2.1.7
 ujson==5.10.0
+frozendict==2.4.6
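
The new frozendict pin exists because try_to_get_best_config and get_the_config sit behind functools.lru_cache, whose arguments must be hashable, and a plain dict of key params is not. A one-line illustration:

from frozendict import frozendict

hash(frozendict({"N": 768, "K": 2048}))  # fine: frozendict is hashable, so it can be a cache key
# hash({"N": 768, "K": 2048})            # would raise TypeError: unhashable type: 'dict'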
