
Commit b9cc80d

Revert "refactor"
This reverts commit 1030efb.
1 parent: 1030efb

16 files changed: +1365 −48 lines

lightllm/common/basemodel/basemodel.py

Lines changed: 4 additions & 8 deletions
@@ -81,10 +81,6 @@ def __init__(self, kvargs):
 
         self._init_datatype()
         self._init_config()
-
-        if os.environ.get("LIGHTLLM_TRITON_AUTOTUNE", "0") == "1":
-            self.layers_num = self.autotune_layers()
-
         self._verify_must()
         self._verify_params()
         self._init_quant()
@@ -748,6 +744,8 @@ def _autotune_warmup(self):
 
         warmup_lengths.sort(reverse=True)
 
+        layer_num_bak = self.layers_num
+        self.layers_num = self.autotune_layers()
         for input_len in warmup_lengths:
             try:
                 logger.info(f"autotune warmup for length {input_len}")
@@ -779,16 +777,14 @@ def _autotune_warmup(self):
                 del model_output
                 self.req_manager.free_all()
                 self.mem_manager.free_all()
-                torch.cuda.empty_cache()
                 logger.info(f"autotune warmup for length {input_len} ok")
             except Exception as e:
                 logger.warning(f"autotune warmup for length {input_len} failed: {str(e)}")
                 self.req_manager.free_all()
                 self.mem_manager.free_all()
-                torch.cuda.empty_cache()
+        self.layers_num = layer_num_bak
         torch.distributed.barrier()
-        logger.info("autotune warmup done, exit!")
-        exit(0)
+        os.environ["LIGHTLLM_TRITON_AUTOTUNE"] = "0"
 
     @final
     @torch.no_grad()
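Taken together, the three hunks change when autotuning touches the model: `layers_num` is no longer shrunk permanently at construction time. Instead the warmup backs it up, tunes on the reduced depth, restores it after the loop, and clears the env flag so the process keeps serving rather than calling `exit(0)`. A minimal sketch of the pattern, with a simplified stand-in class (only `layers_num`, `autotune_layers`, and the `LIGHTLLM_TRITON_AUTOTUNE` flag come from the diff; everything else is illustrative):

```python
import os

class TinyModel:
    """Illustrative stand-in for BaseModel; not lightllm's real class."""

    def __init__(self):
        self.layers_num = 32  # full depth, set from the model config

    def autotune_layers(self):
        # Assumption: a couple of layers suffice for tuning, since every
        # layer launches the same kernels with the same shapes.
        return 2

    def _autotune_warmup(self):
        layer_num_bak = self.layers_num           # back up the real depth
        self.layers_num = self.autotune_layers()  # tune on fewer layers
        # ... run warmup forward passes over the sorted lengths here,
        # catching and logging per-length failures as the diff does ...
        self.layers_num = layer_num_bak           # restore the full depth
        # Clear the flag instead of exit(0), so this same process can keep
        # serving with the freshly tuned configs already in memory.
        os.environ["LIGHTLLM_TRITON_AUTOTUNE"] = "0"
```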

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 3 additions & 3 deletions
@@ -478,7 +478,7 @@ def get_grouped_matmul_static_key(
                 "BLOCK_SIZE_N": bn,
                 "BLOCK_SIZE_K": bk,
                 "GROUP_SIZE_M": gm,
-                "num_warps": nw,
+                "NUM_WARPS": nw,
                 "NUM_STAGE": ns,
             }
             for ns in [1, 2, 3, 4, 5]
@@ -493,7 +493,7 @@ def get_grouped_matmul_static_key(
         "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 32,
         "GROUP_SIZE_M": 8,
-        "num_warps": 4,
+        "NUM_WARPS": 4,
         "NUM_STAGE": 1,
     },
     static_key_func=get_grouped_matmul_static_key,
@@ -550,7 +550,7 @@ def grouped_matmul(
     BLOCK_SIZE_N = run_config["BLOCK_SIZE_N"]
     BLOCK_SIZE_K = run_config["BLOCK_SIZE_K"]
     GROUP_SIZE_M = run_config["GROUP_SIZE_M"]
-    num_warps = run_config["num_warps"]
+    num_warps = run_config["NUM_WARPS"]
     num_stages = run_config["NUM_STAGE"]
 
     if block_size_k != 0:
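The only change in this file is the case of one config key, presumably so the in-memory run_config matches the saved autotune tables added below, which spell the key NUM_WARPS. A trivial illustration of why the spelling must agree end to end (values taken from the default config above):

```python
# The key travels from the tuning search space through the saved JSON to
# the consumer in grouped_matmul, so all three must agree on its case.
run_config = {
    "BLOCK_SIZE_N": 64,
    "BLOCK_SIZE_K": 32,
    "GROUP_SIZE_M": 8,
    "NUM_WARPS": 4,
    "NUM_STAGE": 1,
}
num_warps = run_config["NUM_WARPS"]  # works after this commit
# run_config["num_warps"]            # KeyError against the new tables
```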
New file, 122 added lines — a saved autotune table using the same NUM_WARPS / NUM_STAGE schema as above:
{
  "1024":   {"BLOCK_SIZE_K": 32, "BLOCK_SIZE_M": 64,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 2, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "128":    {"BLOCK_SIZE_K": 32, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 2, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "131072": {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 2, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "16":     {"BLOCK_SIZE_K": 32, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 2, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "16384":  {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 4, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "2048":   {"BLOCK_SIZE_K": 32, "BLOCK_SIZE_M": 64,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 2, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "256":    {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 4, "NUM_STAGE": 3, "NUM_WARPS": 2},
  "32":     {"BLOCK_SIZE_K": 32, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 2, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "32768":  {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 4, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "4096":   {"BLOCK_SIZE_K": 32, "BLOCK_SIZE_M": 64,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 1, "NUM_STAGE": 4, "NUM_WARPS": 4},
  "512":    {"BLOCK_SIZE_K": 32, "BLOCK_SIZE_M": 64,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 1, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "64":     {"BLOCK_SIZE_K": 32, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 2, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "65536":  {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 2, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "8":      {"BLOCK_SIZE_K": 32, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 2, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "8192":   {"BLOCK_SIZE_K": 32, "BLOCK_SIZE_M": 64,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 8, "NUM_STAGE": 4, "NUM_WARPS": 4}
}
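The top-level keys are power-of-two sizes (8 through 131072), each mapping to one Triton launch config. A hedged sketch of how such a table might be loaded and queried; the bucket rule here (smallest tuned key that covers the request, else the largest) is an assumption for illustration, not necessarily lightllm's exact lookup:

```python
import json

def load_table(path: str) -> dict:
    # JSON object keys are strings ("8", "512", ...); index by int so the
    # buckets compare numerically.
    with open(path) as f:
        return {int(k): v for k, v in json.load(f).items()}

def pick_config(table: dict, size: int) -> dict:
    # Smallest tuned bucket that covers `size`; fall back to the largest.
    for k in sorted(table):
        if size <= k:
            return table[k]
    return table[max(table)]

# e.g. pick_config(load_table("tuned.json"), 300) returns the "512" entry:
# {"BLOCK_SIZE_K": 32, "BLOCK_SIZE_M": 64, ..., "NUM_WARPS": 4}
```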
New file, 122 added lines — a second saved autotune table with the same schema but different tuned values:
{
  "1024":   {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 8, "NUM_STAGE": 2, "NUM_WARPS": 4},
  "128":    {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 1, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "131072": {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64,  "GROUP_SIZE_M": 4, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "16":     {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 1, "NUM_STAGE": 3, "NUM_WARPS": 2},
  "16384":  {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64,  "GROUP_SIZE_M": 4, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "2048":   {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 8, "NUM_STAGE": 2, "NUM_WARPS": 4},
  "256":    {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 32,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 4, "NUM_STAGE": 3, "NUM_WARPS": 2},
  "32":     {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 4, "NUM_STAGE": 3, "NUM_WARPS": 2},
  "32768":  {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64,  "GROUP_SIZE_M": 4, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "4096":   {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 4, "NUM_STAGE": 2, "NUM_WARPS": 4},
  "512":    {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 1, "NUM_STAGE": 2, "NUM_WARPS": 4},
  "64":     {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 1, "NUM_STAGE": 3, "NUM_WARPS": 2},
  "65536":  {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64,  "GROUP_SIZE_M": 4, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "8":      {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128, "GROUP_SIZE_M": 1, "NUM_STAGE": 3, "NUM_WARPS": 4},
  "8192":   {"BLOCK_SIZE_K": 64, "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64,  "GROUP_SIZE_M": 4, "NUM_STAGE": 3, "NUM_WARPS": 4}
}
