diff --git a/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=128,dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=128,dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..50f50b3732
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=128,dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"8192": {"BLOCK_SEQ": 1, "HEAD_PARALLEL_NUM": 4, "num_warps": 1, "num_stages": 5}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..802f8083b1
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"8192": {"BLOCK_SEQ": 1, "HEAD_PARALLEL_NUM": 2, "num_warps": 1, "num_stages": 1}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=32,dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=32,dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..802f8083b1
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=32,dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"8192": {"BLOCK_SEQ": 1, "HEAD_PARALLEL_NUM": 2, "num_warps": 1, "num_stages": 1}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=64,dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=64,dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..4b659e870f
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=64,dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"8192": {"BLOCK_SEQ": 1, "HEAD_PARALLEL_NUM": 4, "num_warps": 1, "num_stages": 1}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..afecaeac11
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/deepseek_v3_rotary_emb_kernel/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"8192": {"BLOCK_SEQ": 1, "HEAD_PARALLEL_NUM": 1, "num_warps": 1, "num_stages": 5}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=128,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=128,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000..bd2a5c76ee
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=128,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=128,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=128,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json
new file mode 100644
index 0000000000..ea69378f41
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=128,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 2, "num_warps": 2, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=192,N=4096,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=192,N=4096,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000..59b4578780
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=192,N=4096,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "16384": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "32768": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=192,N=4096,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=192,N=4096,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000..1a1bb48e41
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=192,N=4096,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 1}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000..b9a717ceee
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json
index 5c0dab42bf..37ba845fac 100644
--- a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json
@@ -1 +1 @@
-{"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}}
\ No newline at end of file
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=257,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=257,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json
new file mode 100644
index 0000000000..3774e2c2fc
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=257,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=192,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=192,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000..cd4b2b79e3
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=192,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 5}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000..aeecd11098
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000..591f301e72
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 3}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=512,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=512,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json
new file mode 100644
index 0000000000..fe56e1c44a
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=512,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=1024,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=1024,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
new file mode 100644
index 0000000000..25333e743d
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=1024,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=1024,expert_num=257,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=1024,expert_num=257,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
new file mode 100644
index 0000000000..ed56a6fc75
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=1024,expert_num=257,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=256,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=256,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000..5e2f44cb07
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=256,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=256,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=256,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
new file mode 100644
index 0000000000..bc763e8bc7
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=256,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000..457d72dc8d
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
index 394ce3193c..a4f26860b2 100644
--- a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
@@ -1 +1 @@
-{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 5}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}}
\ No newline at end of file
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=257,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=257,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
new file mode 100644
index 0000000000..e7751ab911
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=257,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "4096": null, "8192": null}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=96,N=4096,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=96,N=4096,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000..2175152647
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=96,N=4096,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
deleted file mode 100644
index 394c5569fd..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 16}, "8": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 8}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "1024": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "2048": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "4096": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..01debc2a0e
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "64": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "200": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8, "NUM_STAGES": 1}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8, "NUM_STAGES": 1}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 4}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json
deleted file mode 100644
index e209e80543..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
deleted file mode 100644
index 60cfe642a3..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 16}, "64": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "128": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "256": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 16}, "512": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "1024": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "2048": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "4096": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..ca953eefc5
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "64": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 2, "NUM_STAGES": 2}, "128": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "200": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "256": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "512": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "2048": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 4}, "4096": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 2, "NUM_STAGES": 8}, "8192": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 8}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
deleted file mode 100644
index 937cb81f50..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "512": {"BLOCK_M": 4, "BLOCK_N": 64, "num_warps": 2}, "1024": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "2048": {"BLOCK_M": 32, "BLOCK_N": 64, "num_warps": 16}, "4096": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
deleted file mode 100644
index ee86227113..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "8": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 4, "BLOCK_N": 512, "num_warps": 16}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "1024": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "2048": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "4096": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 1}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..972ea8e7d9
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "200": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2, "NUM_STAGES": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1, "NUM_STAGES": 1}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1, "NUM_STAGES": 1}, "8192": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1, "NUM_STAGES": 1}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json
deleted file mode 100644
index 2d34061d19..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8192": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
deleted file mode 100644
index b24fa36cfc..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 16}, "64": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "128": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "256": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "1024": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "2048": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 4}, "8192": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..ddf72d9078
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 4, "NUM_STAGES": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 4, "NUM_STAGES": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 4, "NUM_STAGES": 4}, "128": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "200": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "256": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4, "NUM_STAGES": 1}, "512": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4, "NUM_STAGES": 1}, "1024": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4, "NUM_STAGES": 1}, "2048": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 2, "NUM_STAGES": 4}, "4096": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 2, "NUM_STAGES": 4}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1, "NUM_STAGES": 8}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json
deleted file mode 100644
index 7df7062ab3..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 2}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "512": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "1024": {"BLOCK_M": 16, "BLOCK_N": 128, "num_warps": 16}, "2048": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "4096": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
deleted file mode 100644
index 4d0066f0f6..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "8": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 16}, "64": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "1024": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 32, "BLOCK_N": 64, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..55ec5a6c64
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 4, "NUM_STAGES": 2}, "8": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 4, "NUM_STAGES": 2}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8, "NUM_STAGES": 1}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8, "NUM_STAGES": 1}, "200": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "256": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 8}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2, "NUM_STAGES": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1, "NUM_STAGES": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 4}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json
deleted file mode 100644
index 6a6371ae5a..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2304,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2304,out_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..e2d155957d
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2304,out_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_M": 32, "BLOCK_N": 1024, "num_warps": 8, "NUM_STAGES": 1}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8, "NUM_STAGES": 1}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "200": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8, "NUM_STAGES": 1}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8, "NUM_STAGES": 1}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2, "NUM_STAGES": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "4096": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 1, "NUM_STAGES": 4}, "8192": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 1, "NUM_STAGES": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
deleted file mode 100644
index b0373ee97c..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 2}, "8": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 16}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "1024": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 2}, "2048": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 256, "num_warps": 2}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..102f6da04d
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 4, "NUM_STAGES": 2}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "64": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "128": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "200": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "256": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "512": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4, "NUM_STAGES": 1}, "1024": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 2, "NUM_STAGES": 4}, "2048": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 1, "NUM_STAGES": 4}, "4096": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 2, "NUM_STAGES": 4}, "8192": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 1, "NUM_STAGES": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json
deleted file mode 100644
index 72e06487b7..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "1024": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 8}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
deleted file mode 100644
index 13d95c2176..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 16}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "128": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "256": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 8, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "4096": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 1}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..be348b8801
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_M": 4, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "200": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "512": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 8, "NUM_STAGES": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 4}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 4}, "8192": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2, "NUM_STAGES": 4}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json
deleted file mode 100644
index 6cbc291917..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
deleted file mode 100644
index 434f223bce..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "128": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 2}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 0000000000..5938cebdc7
--- /dev/null
+++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1 @@
+{"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 8, "NUM_STAGES": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "64": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "200": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2, "NUM_STAGES": 1}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4, "NUM_STAGES": 1}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2, "NUM_STAGES": 1}, "8192": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2, "NUM_STAGES": 1}}
\ No newline at end of file
diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
deleted file mode 100644
index e63a933482..0000000000
--- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
+++ /dev/null
@@ -1 +0,0 @@
-{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1,
"BLOCK_N": 512, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 16}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json deleted file mode 100644 index 4ffcf5a7e2..0000000000 --- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json +++ /dev/null @@ -1 +0,0 @@ -{"1": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "128": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "512": {"BLOCK_M": 32, "BLOCK_N": 128, "num_warps": 4}, "1024": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "2048": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H200.json new file mode 100644 index 0000000000..31f204c2a4 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H200.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 4, "BLOCK_N": 512, "num_warps": 8, "NUM_STAGES": 1}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4, "NUM_STAGES": 1}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2, "NUM_STAGES": 4}, "200": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2, "NUM_STAGES": 4}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 2}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4, "NUM_STAGES": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json deleted file mode 100644 index 5badc266e9..0000000000 --- a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json +++ /dev/null @@ -1 +0,0 @@ -{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 16}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 
1024, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} \ No newline at end of file diff --git a/lightllm/common/basemodel/layer_infer/cache_tensor_manager.py b/lightllm/common/basemodel/layer_infer/cache_tensor_manager.py index fa87630d9f..8d5debcb45 100644 --- a/lightllm/common/basemodel/layer_infer/cache_tensor_manager.py +++ b/lightllm/common/basemodel/layer_infer/cache_tensor_manager.py @@ -123,6 +123,22 @@ def cache_env_out(self): self.cache_env_ok = False return + def empty( + self, + shape: Union[torch.Size, Iterable[int]], + dtype: torch.dtype, + device: str = "cuda", + is_graph_out: bool = False, + microbatch_index: int = 0, + ) -> torch.Tensor: + return self.alloc_tensor( + shape=shape, + data_type=dtype, + device=device, + is_graph_out=is_graph_out, + microbatch_index=microbatch_index, + ) + def alloc_tensor( self, shape: Union[torch.Size, Tuple[int, ...]], @@ -207,6 +223,16 @@ def cache_env_in( def cache_env_out(self): return + def empty( + self, + shape: Union[torch.Size, Iterable[int]], + dtype: torch.dtype, + device: str = "cuda", + is_graph_out: bool = False, + microbatch_index: int = 0, + ) -> torch.Tensor: + return torch.empty(shape, dtype=dtype, device=device, requires_grad=False) + def alloc_tensor( self, shape: Union[torch.Size, Iterable[int]], diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py index cc925525c1..70e05b2109 100644 --- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py +++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py @@ -4,7 +4,7 @@ from typing import Optional, Tuple, List, Dict, Any from lightllm.utils.dist_utils import get_global_world_size, get_global_rank, get_current_device_id from .base_weight import BaseWeight -from lightllm.common.fused_moe.grouped_fused_moe_ep import fused_experts_impl, masked_group_gemm, tma_aligned_quantize +from lightllm.common.fused_moe.grouped_fused_moe_ep import fused_experts_impl, masked_group_gemm from lightllm.common.fused_moe.moe_silu_and_mul import silu_and_mul_fwd from lightllm.distributed import dist_group_manager from lightllm.common.fused_moe.topk_select import select_experts @@ -95,7 +95,7 @@ def __init__( self.n_group = network_config["n_group"] network_config["topk_group"] = network_config.get("topk_group", 0) self.topk_group = network_config["topk_group"] - network_config["routed_scaling_factor"] = network_config.get("routed_scaling_factor", 0) + network_config["routed_scaling_factor"] = network_config.get("routed_scaling_factor", 1.0) self.routed_scaling_factor = network_config["routed_scaling_factor"] self.lock = threading.Lock() @@ -126,6 +126,7 @@ def experts( num_expert_group=num_expert_group, scoring_func=self.scoring_func, ) + topk_weights.mul_(self.routed_scaling_factor) if self.redundancy_expert_num > 0: redundancy_topk_ids_repair( @@ -173,6 +174,7 @@ def low_latency_dispatch( num_expert_group=self.n_group, scoring_func=self.scoring_func, ) + topk_weights.mul_(self.routed_scaling_factor) if self.redundancy_expert_num > 0: redundancy_topk_ids_repair( @@ -213,6 +215,7 @@ def select_experts_and_quant_input( num_expert_group=self.n_group, scoring_func=self.scoring_func, ) + topk_weights.mul_(self.routed_scaling_factor) if 
self.redundancy_expert_num > 0: redundancy_topk_ids_repair( topk_ids=topk_idx, @@ -228,9 +231,7 @@ def select_experts_and_quant_input( if w1.ndim == 3: block_size_k = w1.shape[2] // w1_scale.shape[2] assert block_size_k == 128, "block_size_k must be 128" - input_scale = torch.empty((M, K // block_size_k), dtype=torch.float32, device=hidden_states.device) - qinput_tensor = torch.empty((M, K), dtype=w1.dtype, device=hidden_states.device) - per_token_group_quant_fp8(hidden_states, block_size_k, qinput_tensor, input_scale) + qinput_tensor, input_scale = per_token_group_quant_fp8(hidden_states, block_size_k, dtype=w1.dtype) return topk_weights, topk_idx.to(torch.long), (qinput_tensor, input_scale) def dispatch( @@ -340,7 +341,9 @@ def prefilled_group_gemm( silu_out = torch.empty((all_tokens, N // 2), device=device, dtype=hidden_dtype) silu_and_mul_fwd(gemm_out_a.view(-1, N), silu_out) - qsilu_out, qsilu_out_scale = tma_aligned_quantize(silu_out) + qsilu_out, qsilu_out_scale = per_token_group_quant_fp8( + silu_out, self.block_size, dtype=w1.dtype, column_major_scales=True, scale_tma_aligned=True + ) # groupgemm (contiguous layout) gemm_out_b = torch.empty((all_tokens, K), device=device, dtype=hidden_dtype) diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py index 131e65f54e..3e61178f37 100644 --- a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py +++ b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py @@ -16,6 +16,7 @@ def __init__( e_score_correction_bias_name: str, weight_prefix: str, n_routed_experts: int, + num_fused_shared_experts: int, split_inter_size: int, data_type: torch.dtype, network_config: Dict[str, Any], @@ -34,7 +35,10 @@ def __init__( self.e_score_correction_bias_name = e_score_correction_bias_name self.weight_prefix = weight_prefix - self.n_routed_experts = n_routed_experts + assert num_fused_shared_experts in [0, 1], "num_fused_shared_experts can only support 0 or 1 now." 
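The `topk_weights.mul_(self.routed_scaling_factor)` calls added in the hunks above fold DeepSeek's routed scaling factor into the routing weights instead of scaling the FFN output afterwards (the `mul_` on the output is removed later in this patch). Because the expert-combine step is a weighted sum, the two placements are mathematically equivalent; a minimal eager-mode check of that equivalence (hypothetical shapes, independent of the classes in this patch):

import torch

topk_weights = torch.rand(4, 8)      # [tokens, topk], hypothetical sizes
expert_out = torch.rand(4, 8, 16)    # [tokens, topk, hidden], hypothetical
factor = 2.5                         # stands in for routed_scaling_factor
# scale the routing weights first ...
scaled_first = torch.einsum("te,teh->th", topk_weights * factor, expert_out)
# ... or scale the combined output afterwards: same result.
scaled_last = torch.einsum("te,teh->th", topk_weights, expert_out) * factor
assert torch.allclose(scaled_first, scaled_last)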
+ self.n_routed_experts = n_routed_experts + num_fused_shared_experts + self.num_fused_shared_experts = num_fused_shared_experts + self.routed_scaling_factor = network_config.get("routed_scaling_factor", 1.0) self.split_inter_size = split_inter_size self.data_type_ = data_type self.tp_rank_ = get_current_rank_in_dp() @@ -64,13 +68,36 @@ def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_t num_expert_group=num_expert_group, scoring_func=self.scoring_func, ) + topk_weights.mul_(self.routed_scaling_factor) + if self.num_fused_shared_experts > 0: + pad_topk_ids = ( + torch.arange( + start=self.n_routed_experts - self.num_fused_shared_experts, + end=self.n_routed_experts, + step=1, + dtype=topk_ids.dtype, + device="cuda", + ) + .view(1, self.num_fused_shared_experts) + .repeat(topk_ids.shape[0], 1) + ) + pad_topk_weights = torch.full( + (topk_weights.shape[0], self.num_fused_shared_experts), + fill_value=1.0, + device="cuda", + dtype=topk_weights.dtype, + ) + + topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1) + topk_weights = torch.cat([topk_weights, pad_topk_weights], dim=1) + w1, w1_scale = self.w1 w2, w2_scale = self.w2 use_fp8_w8a8 = self.quant_method is not None - from lightllm.common.fused_moe.grouped_fused_moe import fused_experts_impl + from lightllm.common.fused_moe.grouped_fused_moe import fused_experts - fused_experts_impl( + fused_experts( hidden_states=input_tensor, w1=w1, w2=w2, @@ -93,16 +120,18 @@ def _fuse(self): and None not in self.experts_gate_projs and None not in self.w2_list ): - w1_list = [] + gate_out_dim, gate_in_dim = self.experts_gate_projs[0].shape + up_out_dim, up_in_dim = self.experts_up_projs[0].shape + assert gate_in_dim == up_in_dim + dtype = self.experts_gate_projs[0].dtype + total_expert_num = self.n_routed_experts + + w1 = torch.empty((total_expert_num, gate_out_dim + up_out_dim, gate_in_dim), dtype=dtype, device="cpu") + for i_experts in range(self.n_routed_experts): - expert_gate_up_proj = torch.cat( - [self.experts_gate_projs[i_experts], self.experts_up_projs[i_experts]], dim=0 - ) - expert_gate_up_proj = expert_gate_up_proj - w1_list.append(expert_gate_up_proj) - - inter_shape, hidden_size = w1_list[0].shape[0], w1_list[0].shape[1] - w1 = torch._utils._flatten_dense_tensors(w1_list).view(len(w1_list), inter_shape, hidden_size) + w1[i_experts, 0:gate_out_dim:, :] = self.experts_gate_projs[i_experts] + w1[i_experts, gate_out_dim:, :] = self.experts_up_projs[i_experts] + inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1] w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size) if not self.quantized_weight and self.quant_method is not None: @@ -123,17 +152,19 @@ def _fuse_weight_scale(self): and None not in self.experts_gate_proj_scales and None not in self.w2_scale_list ): - w1_scale_list = [] - for i_experts in range(self.n_routed_experts): - expert_gate_up_proj_scale = torch.cat( - [self.experts_gate_proj_scales[i_experts], self.experts_up_proj_scales[i_experts]], dim=0 - ) - w1_scale_list.append(expert_gate_up_proj_scale) - - inter_shape, hidden_size = w1_scale_list[0].shape[0], w1_scale_list[0].shape[1] - w1_scale = torch._utils._flatten_dense_tensors(w1_scale_list).view( - len(w1_scale_list), inter_shape, hidden_size + gate_out_dim, gate_in_dim = self.experts_gate_proj_scales[0].shape + up_out_dim, up_in_dim = self.experts_up_proj_scales[0].shape + assert gate_in_dim == up_in_dim + dtype = self.experts_gate_proj_scales[0].dtype + total_expert_num 
= self.n_routed_experts + + w1_scale = torch.empty( + (total_expert_num, gate_out_dim + up_out_dim, gate_in_dim), dtype=dtype, device="cpu" ) + + for i_experts in range(self.n_routed_experts): + w1_scale[i_experts, 0:gate_out_dim:, :] = self.experts_gate_proj_scales[i_experts] + w1_scale[i_experts, gate_out_dim:, :] = self.experts_up_proj_scales[i_experts] inter_shape, hidden_size = self.w2_scale_list[0].shape[0], self.w2_scale_list[0].shape[1] w2_scale = torch._utils._flatten_dense_tensors(self.w2_scale_list).view( len(self.w2_scale_list), inter_shape, hidden_size diff --git a/lightllm/common/fused_moe/grouped_fused_moe.py b/lightllm/common/fused_moe/grouped_fused_moe.py index c1e239bef5..04c32c101a 100644 --- a/lightllm/common/fused_moe/grouped_fused_moe.py +++ b/lightllm/common/fused_moe/grouped_fused_moe.py @@ -34,8 +34,9 @@ from .moe_silu_and_mul import silu_and_mul_fwd from .moe_sum_reduce import moe_sum_reduce from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8 +from lightllm.utils.torch_ops_utils import direct_register_custom_op -FFN_MOE_CHUNK_SIZE = 8 * 1024 +FFN_MOE_CHUNK_SIZE = 32 * 1024 logger = init_logger(__name__) @@ -355,7 +356,7 @@ def grouped_matmul_kernel( tile_n_idx = pid_n # get the gemm size of the current problem - cur_m = tl.load(expert_to_token_num + expert_id, eviction_policy="evict_last") + cur_m = tl.load(expert_to_token_num + expert_id) # do regular gemm here offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) @@ -463,7 +464,7 @@ def grouped_matmul( use_fp8_w8a8: bool, alloc_tensor_func=torch.empty, reused_mblock_infos=None, - **run_config, + run_config: Optional[dict] = None, ): """ token_num_mul_topk_num is int equal token_num * topk_num, @@ -493,7 +494,8 @@ def grouped_matmul( if expert_to_weights_scale.ndim == 3: block_size_n = expert_weights.shape[1] // expert_to_weights_scale.shape[1] block_size_k = expert_weights.shape[2] // expert_to_weights_scale.shape[2] - if not run_config: + + if run_config is None: run_config = MoeGroupedGemmKernelConfig.try_to_get_best_config( M=token_inputs.shape[0], N=n, @@ -524,10 +526,9 @@ def grouped_matmul( else: _m, _k = token_inputs.shape assert _k % block_size_k == 0 - input_scale = alloc_tensor_func((_m, _k // block_size_k), dtype=torch.float32, device=token_inputs.device) - qinput_tensor = alloc_tensor_func((_m, _k), dtype=expert_weights.dtype, device=token_inputs.device) - per_token_group_quant_fp8(token_inputs, block_size_k, qinput_tensor, input_scale) - token_inputs, token_input_scale = qinput_tensor, input_scale + token_inputs, token_input_scale = per_token_group_quant_fp8( + token_inputs, block_size_k, dtype=expert_weights.dtype + ) if reused_mblock_infos is None: mblocks_to_expert_id, mblocks_to_m_index = moe_align2(token_num_mul_topk_num, expert_to_token_num, BLOCK_SIZE_M) @@ -611,7 +612,7 @@ def fused_experts_impl( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, alloc_tensor_func=torch.empty, - **run_config, + run_config: Optional[dict] = None, ): # Check constraints. 
assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" @@ -626,12 +627,15 @@ def fused_experts_impl( topk_num = topk_ids.shape[1] M = min(num_tokens, CHUNK_SIZE) - intermediate_cache1 = alloc_tensor_func((M, topk_num, N), device=hidden_states.device, dtype=hidden_states.dtype) + intermediate_cache13_shared = alloc_tensor_func( + (M, topk_num, max(N, w2.shape[1])), device=hidden_states.device, dtype=hidden_states.dtype + ) + intermediate_cache1 = intermediate_cache13_shared.view(-1)[: (M * topk_num * N)].view(M, topk_num, N) intermediate_cache2 = alloc_tensor_func( (M, topk_num, N // 2), device=hidden_states.device, dtype=hidden_states.dtype ) - intermediate_cache3 = alloc_tensor_func( - (M, topk_num, w2.shape[1]), device=hidden_states.device, dtype=hidden_states.dtype + intermediate_cache3 = intermediate_cache13_shared.view(-1)[: (M * topk_num * w2.shape[1])].view( + M, topk_num, w2.shape[1] ) if inplace: @@ -673,7 +677,7 @@ def fused_experts_impl( mul_routed_weight=False, use_fp8_w8a8=use_fp8_w8a8, alloc_tensor_func=alloc_tensor_func, - **run_config, + run_config=run_config, ) silu_and_mul_fwd(intermediate_cache1.view(-1, N), intermediate_cache2.view(-1, N // 2)) @@ -693,10 +697,161 @@ def fused_experts_impl( use_fp8_w8a8=use_fp8_w8a8, alloc_tensor_func=alloc_tensor_func, reused_mblock_infos=reused_mblock_infos, - **run_config, + run_config=run_config, ) moe_sum_reduce( intermediate_cache3.view(*intermediate_cache3.shape), out_hidden_states[begin_chunk_idx:end_chunk_idx] ) return out_hidden_states + + +def inplace_fused_experts_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, +) -> None: + fused_experts_impl( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + True, + use_fp8_w8a8, + use_int8_w8a16, + w1_scale, + w2_scale, + a1_scale, + a2_scale, + ) + + +def inplace_fused_experts_impl_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, +) -> None: + pass + + +direct_register_custom_op( + "inplace_fused_experts_impl", + inplace_fused_experts_impl, + ["hidden_states"], + inplace_fused_experts_impl_fake, +) + + +def outplace_fused_experts_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + return fused_experts_impl( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + False, + use_fp8_w8a8, + use_int8_w8a16, + w1_scale, + w2_scale, + a1_scale, + a2_scale, + ) + + +def outplace_fused_experts_impl_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor]
= None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +direct_register_custom_op( + "outplace_fused_experts_impl", + outplace_fused_experts_impl, + [], + outplace_fused_experts_impl_fake, +) + + +def fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, +): + if inplace: + torch.ops.lightllm.inplace_fused_experts_impl( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8, + use_int8_w8a16, + w1_scale, + w2_scale, + a1_scale, + a2_scale, + ) + return hidden_states + else: + return torch.ops.lightllm.outplace_fused_experts_impl( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8, + use_int8_w8a16, + w1_scale, + w2_scale, + a1_scale, + a2_scale, + ) diff --git a/lightllm/common/fused_moe/grouped_fused_moe_ep.py b/lightllm/common/fused_moe/grouped_fused_moe_ep.py index 3b5cc6b91d..cf298d6426 100644 --- a/lightllm/common/fused_moe/grouped_fused_moe_ep.py +++ b/lightllm/common/fused_moe/grouped_fused_moe_ep.py @@ -26,17 +26,6 @@ logger.warning("no deepep or deep_gemm") -def tma_aligned_quantize( - input_tensor: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn -) -> Tuple[torch.Tensor, torch.Tensor]: - m, k = input_tensor.shape - input_scale = torch.empty((m, k // block_size), dtype=torch.float32, device=input_tensor.device) - qinput_tensor = torch.empty((m, k), dtype=dtype, device=input_tensor.device) - per_token_group_quant_fp8(input_tensor, block_size, qinput_tensor, input_scale) - input_scale = tma_align_input_scale(input_scale) - return qinput_tensor, input_scale - - def masked_group_gemm( recv_x: Tuple[torch.Tensor], masked_m: torch.Tensor, @@ -106,9 +95,7 @@ def fused_experts_impl( combined_x = None if is_prefill: - input_scale = torch.empty((M, K // block_size_k), dtype=torch.float32, device=hidden_states.device) - qinput_tensor = torch.empty((M, K), dtype=w1.dtype, device=hidden_states.device) - per_token_group_quant_fp8(hidden_states, block_size_k, qinput_tensor, input_scale) + qinput_tensor, input_scale = per_token_group_quant_fp8(hidden_states, block_size_k, dtype=w1.dtype) # get_dispatch_layout ( @@ -186,7 +173,9 @@ def fused_experts_impl( silu_out = torch.empty((all_tokens, N // 2), device=hidden_states.device, dtype=hidden_states.dtype) silu_and_mul_fwd(gemm_out_a.view(-1, N), silu_out) - qsilu_out, qsilu_out_scale = tma_aligned_quantize(silu_out) + qsilu_out, qsilu_out_scale = per_token_group_quant_fp8( + silu_out, block_size_k, dtype=w1.dtype, column_major_scales=True, scale_tma_aligned=True + ) # groupgemm (contiguous layout) gemm_out_b = torch.empty((all_tokens, K), device=hidden_states.device, dtype=hidden_states.dtype) diff --git a/lightllm/common/fused_moe/moe_silu_and_mul.py b/lightllm/common/fused_moe/moe_silu_and_mul.py index 3f6bdb44f2..f930f3bd63 100644 --- a/lightllm/common/fused_moe/moe_silu_and_mul.py +++ b/lightllm/common/fused_moe/moe_silu_and_mul.py @@ -6,7 +6,7 @@ @triton.jit -def _silu_and_mul_kernel( +def _silu_and_mul_kernel_fast( input_ptr, output_ptr, stride_input_m, @@ -17,41 +17,49 @@
size_n, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, + NUM_STAGES: tl.constexpr, + NEED_MASK: tl.constexpr, ): stride_input_m = tl.cast(stride_input_m, dtype=tl.int64) stride_output_m = tl.cast(stride_output_m, dtype=tl.int64) - tid = tl.program_id(0) - input_m_offsets = tid * BLOCK_M + tl.arange(0, BLOCK_M) - output_m_offsets = tid * BLOCK_M + tl.arange(0, BLOCK_M) + n_block_index = tl.program_id(0) + m_block_index = tl.program_id(1) + n_offsets = n_block_index * BLOCK_N + tl.arange(0, BLOCK_N) + m_start_index = m_block_index * BLOCK_M + m_end_index = (m_block_index + 1) * BLOCK_M + m_end_index = tl.where(m_end_index < size_m, m_end_index, size_m) + if NEED_MASK: + mask = n_offsets[None, :] < size_n + other = 0.0 + else: + mask = None + other = None - pid = tl.program_id(1) - input_n_offsets = pid * BLOCK_N + tl.arange(0, BLOCK_N) - output_n_offsets = pid * BLOCK_N + tl.arange(0, BLOCK_N) + for m_index in tl.range(m_start_index, m_end_index, num_stages=NUM_STAGES): + gate_offsets = m_index * stride_input_m + n_offsets[None, :] + up_offsets = m_index * stride_input_m + (n_offsets[None, :] + size_n) + out_offsets = m_index * stride_output_m + n_offsets[None, :] - up_offsets = input_m_offsets[:, None] * stride_input_m + (input_n_offsets[None, :] + size_n) - gate_offsets = input_m_offsets[:, None] * stride_input_m + input_n_offsets[None, :] - res_offsets = output_m_offsets[:, None] * stride_output_m + output_n_offsets[None, :] + up = tl.load( + input_ptr + up_offsets, + mask=mask, + other=other, + ) + gate = tl.load( + input_ptr + gate_offsets, + mask=mask, + other=other, + ).to(tl.float32) - up = tl.load( - input_ptr + up_offsets, - mask=(input_n_offsets < size_n)[None, :] * (input_m_offsets < size_m)[:, None], - other=0.0, - ) - gate = tl.load( - input_ptr + gate_offsets, - mask=(input_n_offsets < size_n)[None, :] * (input_m_offsets < size_m)[:, None], - other=0.0, - ).to(tl.float32) - - gate = gate / (1 + tl.exp(-gate)) - gate = gate.to(input_ptr.dtype.element_ty) + gate = gate / (1 + tl.exp(-gate)) + gate = gate.to(input_ptr.dtype.element_ty) - tl.store( - output_ptr + res_offsets, - up * gate, - mask=(output_n_offsets < size_n)[None, :] * (output_m_offsets < size_m)[:, None], - ) + tl.store( + output_ptr + out_offsets, + up * gate, + mask=mask, + ) def silu_and_mul_fwd(input: torch.Tensor, output: torch.Tensor, **run_config): @@ -71,22 +79,29 @@ def silu_and_mul_fwd(input: torch.Tensor, output: torch.Tensor, **run_config): BLOCK_M = run_config["BLOCK_M"] BLOCK_N = run_config["BLOCK_N"] num_warps = run_config["num_warps"] + NUM_STAGES = run_config["NUM_STAGES"] + # limit the grid size to avoid the invalid argument error of triton + while triton.cdiv(size_m, BLOCK_M) > 8192: + BLOCK_M *= 2 grid = ( - triton.cdiv(size_m, BLOCK_M), triton.cdiv(size_n, BLOCK_N), + triton.cdiv(size_m, BLOCK_M), ) - _silu_and_mul_kernel[grid]( - input, - output, - stride_input_m, - stride_input_n, - stride_output_m, - stride_output_n, - size_m, - size_n, + NEED_MASK = (size_n % BLOCK_N) != 0 + _silu_and_mul_kernel_fast[grid]( + input_ptr=input, + output_ptr=output, + stride_input_m=stride_input_m, + stride_input_n=stride_input_n, + stride_output_m=stride_output_m, + stride_output_n=stride_output_n, + size_m=size_m, + size_n=size_n, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, + NUM_STAGES=NUM_STAGES, + NEED_MASK=NEED_MASK, num_warps=num_warps, ) return diff --git a/lightllm/common/fused_moe/moe_silu_and_mul_config.py b/lightllm/common/fused_moe/moe_silu_and_mul_config.py index e69680a1f3..173101b898 100644 --- 
a/lightllm/common/fused_moe/moe_silu_and_mul_config.py +++ b/lightllm/common/fused_moe/moe_silu_and_mul_config.py @@ -30,7 +30,10 @@ def try_to_get_best_config( config = finded_config[min(finded_config.keys(), key=lambda x: abs(int(x) - M))] return config else: - config = {"BLOCK_M": 128, "BLOCK_N": 128, "num_warps": 4} + if M < 256: + config = {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 1, "NUM_STAGES": 1} + else: + config = {"BLOCK_M": 16, "BLOCK_N": 128, "num_warps": 4, "NUM_STAGES": 5} return config diff --git a/lightllm/common/quantization/deepgemm_quant.py b/lightllm/common/quantization/deepgemm_quant.py index 8d14805ad5..66f1f00253 100644 --- a/lightllm/common/quantization/deepgemm_quant.py +++ b/lightllm/common/quantization/deepgemm_quant.py @@ -51,22 +51,20 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_ else: qweight, weight_scale = weights input_scale = None + alloc_func = torch.empty if not use_custom_tensor_mananger else self.cache_manager.empty m, k = input_tensor.shape n = weights[0].shape[1] if input_scale is None: - input_scale = torch.empty((m, k // self.block_size), dtype=torch.float32, device=input_tensor.device) - qinput_tensor = self.cache_manager.alloc_tensor( - (m, k), qweight.dtype, device=qweight.device, is_graph_out=False + qinput_tensor, input_scale = per_token_group_quant_fp8( + input_tensor, + self.block_size, + dtype=qweight.dtype, + column_major_scales=True, + scale_tma_aligned=True, + alloc_func=alloc_func, ) - per_token_group_quant_fp8(input_tensor, self.block_size, qinput_tensor, input_scale) - input_scale = tma_align_input_scale(input_scale) if out is None: - if use_custom_tensor_mananger: - out = self.cache_manager.alloc_tensor( - (m, n), input_tensor.dtype, device=input_tensor.device, is_graph_out=False - ) - else: - out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device) + out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device) deep_gemm.gemm_fp8_fp8_bf16_nt([qinput_tensor, input_scale], [qweight.t(), weight_scale.t()], out) return out diff --git a/lightllm/common/quantization/triton_quant/fp8/fp8act_quant_kernel.py b/lightllm/common/quantization/triton_quant/fp8/fp8act_quant_kernel.py index 120880143f..db56e3626b 100644 --- a/lightllm/common/quantization/triton_quant/fp8/fp8act_quant_kernel.py +++ b/lightllm/common/quantization/triton_quant/fp8/fp8act_quant_kernel.py @@ -6,7 +6,7 @@ from lightllm.utils.sgl_utils import HAS_SGL_KERNEL, sgl_ops from frozendict import frozendict from functools import lru_cache -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple try: from deep_gemm import ceil_div @@ -108,17 +108,56 @@ def lightllm_per_token_group_quant_fp8( def per_token_group_quant_fp8( x: torch.Tensor, group_size: int, - x_q: torch.Tensor, - x_s: torch.Tensor, eps: float = 1e-10, dtype: torch.dtype = torch.float8_e4m3fn, + column_major_scales: bool = False, + scale_tma_aligned: bool = False, + alloc_func: Callable = torch.empty, ): + x_q = alloc_func(x.shape, dtype=dtype, device=x.device) + x_s = None + # Adapted from + # https://github.com/sgl-project/sglang/blob/7e257cd666c0d639626487987ea8e590da1e9395/python/sglang/srt/layers/quantization/fp8_kernel.py#L290 if HAS_SGL_KERNEL: finfo = torch.finfo(dtype) fp8_max, fp8_min = finfo.max, finfo.min + + # create the scale tensor + if column_major_scales: + if scale_tma_aligned: + # align to 4 * sizeof(float) + aligned_size = (x.shape[-2] + 3) // 4 * 4 + x_s = alloc_func(
x.shape[:-2] + (x.shape[-1] // group_size, aligned_size), + device=x.device, + dtype=torch.float32, + ).permute(-1, -2)[: x.shape[-2], :] + else: + x_s = alloc_func( + (x.shape[-1] // group_size,) + x.shape[:-1], + device=x.device, + dtype=torch.float32, + ).permute(-1, -2) + else: + x_s = alloc_func( + x.shape[:-1] + (x.shape[-1] // group_size,), + device=x.device, + dtype=torch.float32, + ) + + # quantize using the SGL kernel sgl_ops.sgl_per_token_group_quant_fp8(x, x_q, x_s, group_size, 1e-10, fp8_min, fp8_max, False) else: + # quantize using the LightLLM kernel + x_s = alloc_func( + x.shape[:-1] + (x.shape[-1] // group_size,), + device=x.device, + dtype=torch.float32, + ) lightllm_per_token_group_quant_fp8(x, group_size, x_q, x_s, eps=1e-10, dtype=torch.float8_e4m3fn) + if column_major_scales and scale_tma_aligned: + x_s = tma_align_input_scale(x_s) + return x_q, x_s # copy from @@ -208,9 +247,9 @@ def test_tma_align(): m = 576 k = 8192 x = torch.randn((m, k // 128), dtype=torch.float32).cuda() + for _ in range(10): x_padded = tma_align_input_scale(x) - print(x_padded.shape) import time torch.cuda.synchronize() @@ -226,11 +265,9 @@ def test_per_token_group_quant_fp8(): group_size = 128 x = torch.randn((1024, 8192), dtype=torch.bfloat16).cuda() - - x_q = torch.randn((1024, 8192)).cuda().to(torch.float8_e4m3fn) # x_s = torch.randn((1024, 8192 // group_size), dtype=torch.float32).cuda() - x_s = torch.randn((8192 // group_size, 1024 + 10), dtype=torch.float32).cuda().t() - per_token_group_quant_fp8(x, group_size, x_q, x_s) + # x_s = torch.randn((8192 // group_size, 1024 + 10), dtype=torch.float32).cuda().t() + x_q, x_s = per_token_group_quant_fp8(x, group_size, column_major_scales=True, scale_tma_aligned=True) x_s = x_s[:1024] th_x_q, th_x_s = torch_quant(x, group_size) print("th_x_s - x_s", torch.abs(th_x_s - x_s.reshape(-1)).max()) @@ -238,4 +275,5 @@ if __name__ == "__main__": + test_per_token_group_quant_fp8() test_tma_align() diff --git a/lightllm/common/quantization/triton_quant/triton_quant.py b/lightllm/common/quantization/triton_quant/triton_quant.py index d005a950c5..a8d6a00559 100644 --- a/lightllm/common/quantization/triton_quant/triton_quant.py +++ b/lightllm/common/quantization/triton_quant/triton_quant.py @@ -38,26 +38,18 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_ qweight, weight_scale, input_scale = weights m, k = input_tensor.shape n = qweight.shape[1] + alloc_func = torch.empty if not use_custom_tensor_mananger else self.cache_manager.empty if input_scale is None: - input_scale = self.cache_manager.alloc_tensor( - (m, k // self.block_size), torch.float32, device=input_tensor.device, is_graph_out=False + input_tensor_q, input_scale = per_token_group_quant_fp8( + input_tensor, self.block_size, dtype=qweight.dtype, alloc_func=alloc_func ) - input_tensor_q = self.cache_manager.alloc_tensor( - (m, k), qweight.dtype, device=qweight.device, is_graph_out=False - ) - per_token_group_quant_fp8(input_tensor, self.block_size, input_tensor_q, input_scale) else: # TODO raise "static input scale is not supported by triton fp8 block gemm kernel."
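For reference, `per_token_group_quant_fp8` now allocates and returns `x_q` and `x_s` itself instead of writing into caller-provided buffers. The math it implements, one float32 scale per `group_size` consecutive elements of the last dim, can be sketched in eager PyTorch as follows (an illustrative approximation only; the kernels above remain the source of truth for clamping and scale layout, and the last dim is assumed divisible by `group_size`):

import torch

def per_token_group_quant_fp8_ref(x: torch.Tensor, group_size: int, eps: float = 1e-10):
    # One scale per group: scale = max|group| / fp8_max, then quantize into that range.
    finfo = torch.finfo(torch.float8_e4m3fn)
    g = x.view(*x.shape[:-1], x.shape[-1] // group_size, group_size).to(torch.float32)
    scale = g.abs().amax(dim=-1, keepdim=True).clamp(min=eps) / finfo.max
    q = (g / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return q.view(x.shape), scale.squeeze(-1)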
m = input_tensor.shape[0] n = qweight.shape[1] if out is None: - if use_custom_tensor_mananger: - out = self.cache_manager.alloc_tensor( - (m, n), input_tensor.dtype, device=input_tensor.device, is_graph_out=False - ) - else: - out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device) + out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device) w8a8_block_fp8_matmul( input_tensor_q, qweight, diff --git a/lightllm/common/quantization/w8a8_quant.py b/lightllm/common/quantization/w8a8_quant.py index d8cb10404a..d3b068dbe9 100644 --- a/lightllm/common/quantization/w8a8_quant.py +++ b/lightllm/common/quantization/w8a8_quant.py @@ -131,21 +131,13 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_ qweight, weight_scale, input_scale = weights m, k = input_tensor.shape n = weights[0].shape[1] + alloc_func = torch.empty if not use_custom_tensor_mananger else self.cache_manager.empty if input_scale is None: - input_scale = self.cache_manager.alloc_tensor( - (m, k // self.block_size), torch.float32, device=input_tensor.device, is_graph_out=False + qinput_tensor, input_scale = per_token_group_quant_fp8( + input_tensor, self.block_size, dtype=qweight.dtype, alloc_func=alloc_func ) - qinput_tensor = self.cache_manager.alloc_tensor( - (m, k), qweight.dtype, device=qweight.device, is_graph_out=False - ) - per_token_group_quant_fp8(input_tensor, self.block_size, qinput_tensor, input_scale) if out is None: - if use_custom_tensor_mananger: - out = self.cache_manager.alloc_tensor( - (m, n), input_tensor.dtype, device=input_tensor.device, is_graph_out=False - ) - else: - out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device) + out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device) if n % 128 != 0: w8a8_block_fp8_matmul( qinput_tensor, diff --git a/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py b/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py index ba752a4e84..985102b375 100644 --- a/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py +++ b/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py @@ -19,7 +19,6 @@ from lightllm.models.deepseek2.triton_kernel.gqa_flash_decoding_fp8 import gqa_token_decode_attention_flash_decoding_fp8 from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer from lightllm.models.llama.triton_kernel.rmsnorm import rmsnorm_forward -from lightllm.models.llama.triton_kernel.silu_and_mul import silu_and_mul_fwd from lightllm.models.deepseek2.triton_kernel.rotary_emb import rotary_emb_fwd from lightllm.models.deepseek2.infer_struct import Deepseek2InferStateInfo from lightllm.models.deepseek2.flashinfer_struct import Deepseek2FlashInferStateInfo @@ -57,7 +56,6 @@ def __init__(self, layer_num, network_config, mode=[]): self.norm_topk_prob = network_config["norm_topk_prob"] self.n_group = network_config["n_group"] self.topk_group = network_config["topk_group"] - self.routed_scaling_factor = network_config["routed_scaling_factor"] self.softmax_scale = (self.qk_nope_head_dim + self.qk_rope_head_dim) ** (-0.5) if network_config.get("rope_scaling", None) is not None: @@ -666,7 +664,8 @@ def _moe_ffn( hidden_states = input.view(-1, self.embed_dim_) num_tokens, hidden_dim = hidden_states.shape - if self.n_shared_experts is not None: + # if fused_shared_experts is not enabled, compute shared_output + if self.n_shared_experts is not None and layer_weight.num_fused_shared_experts 
== 0: shared_output = LlamaTransformerLayerInfer._ffn(self, hidden_states, infer_state, layer_weight) router_logits = layer_weight.moe_gate.mm(hidden_states) @@ -680,9 +679,7 @@ num_expert_group=self.n_group, ) - hidden_states.mul_(self.routed_scaling_factor) - - if self.n_shared_experts is not None: + if self.n_shared_experts is not None and layer_weight.num_fused_shared_experts == 0: hidden_states.add_(shared_output) return hidden_states.view(num_tokens, hidden_dim) @@ -707,7 +704,6 @@ num_expert_group=self.n_group, is_prefill=infer_state.is_prefill, ) - ep_output.mul_(self.routed_scaling_factor) if self.n_shared_experts is not None: ep_output.add_(shared_output) @@ -819,7 +815,6 @@ def overlap_tpsp_token_forward( # 0 hook if getattr(infer_state, "hook", None) is not None: infer_state.hook() - _0_ffn_out *= self.routed_scaling_factor if self.n_shared_experts is not None: _0_ffn_out.add_(_0_shared_output) input_embdings.add_(_0_ffn_out.view(-1, self.embed_dim_)) @@ -833,7 +828,6 @@ def overlap_tpsp_token_forward( def _1_hook_post(): _1_hook() nonlocal _1_ffn_out - _1_ffn_out *= self.routed_scaling_factor if self.n_shared_experts is not None: _1_ffn_out.add_(_1_shared_output) input_embdings1.add_(_1_ffn_out.view(-1, self.embed_dim_)) @@ -965,7 +959,6 @@ def overlap_tpsp_context_forward( _1_combine_event = Buffer.capture() - _0_ffn_out *= self.routed_scaling_factor if self.n_shared_experts is not None: _0_ffn_out.add_(_0_shared_output) input_embdings.add_(_0_ffn_out.view(-1, self.embed_dim_)) @@ -976,7 +969,6 @@ def overlap_tpsp_context_forward( def _1_hook_post(): _1_hook() nonlocal _1_ffn_out - _1_ffn_out *= self.routed_scaling_factor if self.n_shared_experts is not None: _1_ffn_out.add_(_1_shared_output) input_embdings1.add_(_1_ffn_out.view(-1, self.embed_dim_)) diff --git a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py index 7a9f3c1500..f78f7e8498 100644 --- a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py +++ b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py @@ -3,7 +3,7 @@ import math import numpy as np from lightllm.common.basemodel import TransformerLayerWeight -from lightllm.utils.envs_utils import enable_env_vars +from lightllm.utils.envs_utils import enable_env_vars, get_env_start_args from lightllm.common.basemodel.layer_weights.meta_weights import ( ROWMMWeight, MultiROWMMWeight, @@ -39,6 +39,12 @@ def _parse_config(self): self.v_head_dim = self.network_config_["v_head_dim"] self.num_attention_heads = self.network_config_["num_attention_heads"] self.kv_lora_rank = self.network_config_["kv_lora_rank"] + self.num_fused_shared_experts = 0 + if get_env_start_args().enable_fused_shared_experts and self.is_moe: + # enable_fused_shared_experts can only be enabled when MOE_MODE is TP + moe_mode = os.getenv("MOE_MODE", "TP") + assert moe_mode == "TP" + self.num_fused_shared_experts = self.network_config_.get("n_shared_experts", 0) def _init_weight_names(self): if self.q_lora_rank is None: @@ -55,21 +61,6 @@ def _init_weight(self): self._init_ffn() self._init_norm() - def _load_q_rope(self, q_weight_): - q_split_n_embed_with_rope = ( - (self.qk_nope_head_dim + self.qk_rope_head_dim) * self.num_attention_heads // self.tp_world_size_ - ) - q_weight_ = q_weight_[ - q_split_n_embed_with_rope * self.tp_rank_ : q_split_n_embed_with_rope * (self.tp_rank_ + 1), : - ] - q_weight_ = q_weight_.transpose(0, 1).contiguous() - q_nope_proj_, q_rope_proj_ =
torch.split( - q_weight_.view(-1, self.tp_q_head_num_, self.qk_nope_head_dim + self.qk_rope_head_dim), - [self.qk_nope_head_dim, self.qk_rope_head_dim], - dim=-1, - ) - return q_rope_proj_.reshape(-1, self.qk_rope_head_dim * self.tp_q_head_num_).transpose(0, 1).contiguous() - def _load_kb(self, kv_b_proj_): k_b_proj_ = kv_b_proj_.view(self.num_attention_heads, self.qk_nope_head_dim * 2, self.kv_lora_rank)[ :, : self.qk_nope_head_dim, : @@ -96,8 +87,28 @@ def _load_vb_scale(self, kv_b_proj_scale_, block_size): )[:, :, self.qk_nope_head_dim // block_size :].transpose(0, 1) return v_b_proj_scale_.contiguous().to(kv_b_proj_scale_.dtype) + def _rename_shared_experts(self, weights, weight_scale_suffix): + # Rename the shared experts' parameters to the weight names and mapping used by the routed experts. + old_prefix = f"model.layers.{self.layer_num_}.mlp.shared_experts" + new_prefix = f"model.layers.{self.layer_num_}.mlp.experts" + proj_names = ["gate_proj", "down_proj", "up_proj"] + for i in range(self.num_fused_shared_experts): + expert_id = self.n_routed_experts + i + for proj in proj_names: + weight_tensor = weights.get(f"{old_prefix}.{proj}.weight") + if weight_tensor is not None: + weights[f"{new_prefix}.{expert_id}.{proj}.weight"] = weight_tensor + if self.quant_cfg.quantized_weight: + assert weight_scale_suffix is not None + scale_tensor = weights.get(f"{old_prefix}.{proj}." + weight_scale_suffix) + if scale_tensor is not None: + weights[f"{new_prefix}.{expert_id}.{proj}." + weight_scale_suffix] = scale_tensor + def load_hf_weights(self, weights): kv_b_quant_method = self.quant_cfg.get_quant_method(self.layer_num_, "kv_b_proj") + weight_scale_suffix = None + if self.quant_cfg.quantized_weight: + weight_scale_suffix = kv_b_quant_method.weight_scale_suffix if f"model.layers.{self.layer_num_}.self_attn.kv_b_proj.weight" in weights: kv_b_proj_ = weights[f"model.layers.{self.layer_num_}.self_attn.kv_b_proj.weight"] @@ -105,29 +116,27 @@ def load_hf_weights(self, weights): if self.quant_cfg.quantized_weight: kv_b_proj_ = weight_dequant( kv_b_proj_.cuda(), - weights[ - f"model.layers.{self.layer_num_}.self_attn.kv_b_proj." + kv_b_quant_method.weight_scale_suffix - ].cuda(), + weights[f"model.layers.{self.layer_num_}.self_attn.kv_b_proj." + weight_scale_suffix].cuda(), ).cpu() weights[f"model.layers.{self.layer_num_}.self_attn.k_b_proj.weight"] = self._load_kb(kv_b_proj_) weights[f"model.layers.{self.layer_num_}.self_attn.v_b_proj.weight"] = self._load_vb(kv_b_proj_) if ( self.quant_cfg.quantized_weight - and f"model.layers.{self.layer_num_}.self_attn.kv_b_proj." + kv_b_quant_method.weight_scale_suffix - in weights + and f"model.layers.{self.layer_num_}.self_attn.kv_b_proj." + weight_scale_suffix in weights ): - kv_b_proj_scale_ = weights[ - f"model.layers.{self.layer_num_}.self_attn.kv_b_proj." + kv_b_quant_method.weight_scale_suffix - ] + kv_b_proj_scale_ = weights[f"model.layers.{self.layer_num_}.self_attn.kv_b_proj." + weight_scale_suffix] block_size = 128 - weights[ - f"model.layers.{self.layer_num_}.self_attn.k_b_proj." + kv_b_quant_method.weight_scale_suffix - ] = self._load_kb_scale(kv_b_proj_scale_, block_size) - weights[ - f"model.layers.{self.layer_num_}.self_attn.v_b_proj." + kv_b_quant_method.weight_scale_suffix - ] = self._load_vb_scale(kv_b_proj_scale_, block_size) + weights[f"model.layers.{self.layer_num_}.self_attn.k_b_proj." + weight_scale_suffix] = self._load_kb_scale( + kv_b_proj_scale_, block_size + ) + weights[f"model.layers.{self.layer_num_}.self_attn.v_b_proj."
+ weight_scale_suffix] = self._load_vb_scale( + kv_b_proj_scale_, block_size + ) + # rename the shared experts weights + if self.num_fused_shared_experts > 0: + self._rename_shared_experts(weights, weight_scale_suffix) return super().load_hf_weights(weights) def _init_qkvo(self): @@ -223,8 +232,14 @@ def _init_moe(self): tp_rank=0, tp_world_size=1, ) - - self._load_mlp(f"model.layers.{self.layer_num_}.mlp.shared_experts") + # The first few layers of the deepseekv3 model are dense; only the later layers are MoE. + # When the fused shared experts strategy is enabled, the shared experts are no longer loaded + # as a plain MLP; they are fused with the routed experts and run as one unit. So when the + # current layer is MoE and fused shared experts is enabled, we do not initialize the separate + # gate_up_proj and other weight parameters of the standalone shared experts. When + # num_fused_shared_experts == 0, there are no fused shared experts and the shared experts are loaded and run on their own. + if self.num_fused_shared_experts == 0: + self._load_mlp(f"model.layers.{self.layer_num_}.mlp.shared_experts") moe_mode = os.getenv("MOE_MODE", "TP") assert moe_mode in ["EP", "TP"] if moe_mode == "TP": @@ -235,6 +250,7 @@ def _init_moe(self): e_score_correction_bias_name=self.e_score_correction_bias_name, weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts", n_routed_experts=self.n_routed_experts, + num_fused_shared_experts=self.num_fused_shared_experts, split_inter_size=moe_intermediate_size // self.tp_world_size_, data_type=self.data_type_, network_config=self.network_config_, diff --git a/lightllm/models/deepseek2/triton_kernel/rotary_emb.py b/lightllm/models/deepseek2/triton_kernel/rotary_emb.py index 93ff323f34..111ea54aba 100644 --- a/lightllm/models/deepseek2/triton_kernel/rotary_emb.py +++ b/lightllm/models/deepseek2/triton_kernel/rotary_emb.py @@ -21,140 +21,129 @@ def _rotary_kernel( stride_sinbs, stride_sind, max_total_len, - HEAD_Q, - HEAD_K, # N_CTX is the length of the context to compute - BLOCK_HEAD: tl.constexpr, + HEAD_PARALLEL_NUM: tl.constexpr, + HEAD_Q: tl.constexpr, + HEAD_K: tl.constexpr, BLOCK_SEQ: tl.constexpr, BLOCK_DMODEL: tl.constexpr, + NUM_STAGE: tl.constexpr, ): - cur_head_index = tl.program_id(0) - cur_seq_index = tl.program_id(1) - - cur_head_range = cur_head_index * BLOCK_HEAD + tl.arange(0, BLOCK_HEAD) - cur_seq_range = cur_seq_index * BLOCK_SEQ + tl.arange(0, BLOCK_SEQ) + head_start_index = tl.program_id(0) + seq_block_index = tl.program_id(1) + seq_start_index = seq_block_index * BLOCK_SEQ + seq_end_index = (seq_block_index + 1) * BLOCK_SEQ + seq_end_index = tl.where(seq_end_index < max_total_len, seq_end_index, max_total_len) dim_range0 = tl.arange(0, BLOCK_DMODEL // 2) * 2 dim_range1 = dim_range0 + 1 - - off_q0 = ( - cur_seq_range[:, None, None] * stride_qbs - + cur_head_range[None, :, None] * stride_qh - + dim_range0[None, None, :] * stride_qd - ) - off_q1 = ( - cur_seq_range[:, None, None] * stride_qbs - + cur_head_range[None, :, None] * stride_qh - + dim_range1[None, None, :] * stride_qd - ) - cos_range = tl.arange(0, BLOCK_DMODEL // 2) - off_dimcos_sin = cur_seq_range[:, None, None] * stride_cosbs + cos_range[None, None, :] * stride_cosd - - q0 = tl.load( - Q + off_q0, - mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_Q), - other=0.0, - ) - q1 = tl.load( - Q + off_q1, - mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_Q), - other=0.0, - ) - - cos = tl.load(Cos + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0) - sin = tl.load(Sin + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0) - - out0 = q0 * cos - q1 * sin - out1 = q0 * sin + q1 * cos - - tl.store( - Q + off_q0, out0, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_Q) - )
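Both the old kernel being removed here and the new loop-based kernel compute the same interleaved rotation: pairs `(x[..., 2i], x[..., 2i+1])` are rotated by a per-position angle whose cosine and sine are read from `Cos`/`Sin`. An eager-mode reference of that math (a hypothetical helper for reading the kernel; the Triton code updates Q/K in place, while this returns a new tensor):

import torch

def rope_ref(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: (seq, heads, head_dim); cos/sin: (seq, head_dim // 2); head_dim assumed even.
    c, s = cos[:, None, :], sin[:, None, :]
    x0, x1 = x[..., 0::2], x[..., 1::2]  # even/odd dims, i.e. dim_range0 / dim_range1
    out = torch.empty_like(x)
    out[..., 0::2] = x0 * c - x1 * s     # matches out0 = q0 * cos - q1 * sin
    out[..., 1::2] = x0 * s + x1 * c     # matches out1 = q0 * sin + q1 * cos
    return out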
- tl.store( - Q + off_q1, out1, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_Q) - ) - - off_k0 = ( - cur_seq_range[:, None, None] * stride_kbs - + cur_head_range[None, :, None] * stride_kh - + dim_range0[None, None, :] * stride_kd - ) - off_k1 = ( - cur_seq_range[:, None, None] * stride_kbs - + cur_head_range[None, :, None] * stride_kh - + dim_range1[None, None, :] * stride_kd - ) - - off_dimcos_sin = cur_seq_range[:, None, None] * stride_cosbs + cos_range[None, None, :] * stride_cosd - - k0 = tl.load( - K + off_k0, - mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_K), - other=0.0, - ) - k1 = tl.load( - K + off_k1, - mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_K), - other=0.0, - ) - - cos = tl.load(Cos + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0) - sin = tl.load(Sin + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0) - - out_k0 = k0 * cos - k1 * sin - out_k1 = k0 * sin + k1 * cos - - tl.store( - K + off_k0, - out_k0, - mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_K), - ) - tl.store( - K + off_k1, - out_k1, - mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_K), - ) + for seq_index in tl.range(seq_start_index, seq_end_index): + + off_dimcos_sin = seq_index * stride_cosbs + cos_range * stride_cosd + cos = tl.load(Cos + off_dimcos_sin) + sin = tl.load(Sin + off_dimcos_sin) + + if HEAD_PARALLEL_NUM == 1: + for q_head_index in tl.static_range(0, HEAD_Q, step=1): + off_q0 = seq_index * stride_qbs + q_head_index * stride_qh + dim_range0 * stride_qd + off_q1 = seq_index * stride_qbs + q_head_index * stride_qh + dim_range1 * stride_qd + q0 = tl.load(Q + off_q0) + q1 = tl.load(Q + off_q1) + out_q0 = q0 * cos - q1 * sin + out_q1 = q0 * sin + q1 * cos + tl.store(Q + off_q0, out_q0) + tl.store(Q + off_q1, out_q1) + + for k_head_index in tl.static_range(0, HEAD_K, step=1): + off_k0 = seq_index * stride_kbs + k_head_index * stride_kh + dim_range0 * stride_kd + off_k1 = seq_index * stride_kbs + k_head_index * stride_kh + dim_range1 * stride_kd + + k0 = tl.load(K + off_k0) + k1 = tl.load(K + off_k1) + + out_k0 = k0 * cos - k1 * sin + out_k1 = k0 * sin + k1 * cos + + tl.store(K + off_k0, out_k0) + tl.store(K + off_k1, out_k1) + else: + for q_head_index in tl.range(head_start_index, HEAD_Q, step=HEAD_PARALLEL_NUM, num_stages=NUM_STAGE): + off_q0 = seq_index * stride_qbs + q_head_index * stride_qh + dim_range0 * stride_qd + off_q1 = seq_index * stride_qbs + q_head_index * stride_qh + dim_range1 * stride_qd + q0 = tl.load(Q + off_q0) + q1 = tl.load(Q + off_q1) + out_q0 = q0 * cos - q1 * sin + out_q1 = q0 * sin + q1 * cos + tl.store(Q + off_q0, out_q0) + tl.store(Q + off_q1, out_q1) + + for k_head_index in tl.range(head_start_index, HEAD_K, step=HEAD_PARALLEL_NUM, num_stages=NUM_STAGE): + off_k0 = seq_index * stride_kbs + k_head_index * stride_kh + dim_range0 * stride_kd + off_k1 = seq_index * stride_kbs + k_head_index * stride_kh + dim_range1 * stride_kd + + k0 = tl.load(K + off_k0) + k1 = tl.load(K + off_k1) + + out_k0 = k0 * cos - k1 * sin + out_k1 = k0 * sin + k1 * cos + + tl.store(K + off_k0, out_k0) + tl.store(K + off_k1, out_k1) return @torch.no_grad() -def rotary_emb_fwd(q, k, cos, sin): +def rotary_emb_fwd(q, k, cos, sin, **run_config): total_len = q.shape[0] head_num_q, head_num_k = q.shape[1], k.shape[1] head_dim = 
q.shape[2] assert q.shape[0] == cos.shape[0] and q.shape[0] == sin.shape[0], f"q shape {q.shape} cos shape {cos.shape}" assert k.shape[0] == cos.shape[0] and k.shape[0] == sin.shape[0], f"k shape {k.shape} cos shape {cos.shape}" - - BLOCK_SEQ = 16 - BLOCK_HEAD = 4 - if head_dim >= 128: - num_warps = 8 - else: - num_warps = 4 - - grid = (triton.cdiv(head_num_q, BLOCK_HEAD), triton.cdiv(total_len, BLOCK_SEQ)) + assert triton.next_power_of_2(head_dim) == head_dim + + from .rotary_emb_config import DeepseekV3RotaryKernelConfig + + if not run_config: + run_config = DeepseekV3RotaryKernelConfig.try_to_get_best_config( + M=total_len, + Q_HEAD_NUM=head_num_q, + K_HEAD_NUM=head_num_k, + HEAD_DIM=head_dim, + dtype=str(q.dtype), + ) + + BLOCK_SEQ = run_config["BLOCK_SEQ"] + HEAD_PARALLEL_NUM = run_config["HEAD_PARALLEL_NUM"] + num_warps = run_config["num_warps"] + num_stages = run_config["num_stages"] + + grid = ( + HEAD_PARALLEL_NUM, + triton.cdiv(total_len, BLOCK_SEQ), + ) _rotary_kernel[grid]( - q, - k, - cos, - sin, - q.stride(0), - q.stride(1), - q.stride(2), - k.stride(0), - k.stride(1), - k.stride(2), - cos.stride(0), - cos.stride(1), - sin.stride(0), - sin.stride(1), - total_len, - head_num_q, - head_num_k, - BLOCK_HEAD=BLOCK_HEAD, + Q=q, + K=k, + Cos=cos, + Sin=sin, + stride_qbs=q.stride(0), + stride_qh=q.stride(1), + stride_qd=q.stride(2), + stride_kbs=k.stride(0), + stride_kh=k.stride(1), + stride_kd=k.stride(2), + stride_cosbs=cos.stride(0), + stride_cosd=cos.stride(1), + stride_sinbs=sin.stride(0), + stride_sind=sin.stride(1), + max_total_len=total_len, + HEAD_Q=head_num_q, + HEAD_PARALLEL_NUM=HEAD_PARALLEL_NUM, + HEAD_K=head_num_k, BLOCK_SEQ=BLOCK_SEQ, BLOCK_DMODEL=head_dim, + NUM_STAGE=num_stages, num_warps=num_warps, - num_stages=1, + num_stages=num_stages, ) return diff --git a/lightllm/models/deepseek2/triton_kernel/rotary_emb_config.py b/lightllm/models/deepseek2/triton_kernel/rotary_emb_config.py new file mode 100644 index 0000000000..9ea5825957 --- /dev/null +++ b/lightllm/models/deepseek2/triton_kernel/rotary_emb_config.py @@ -0,0 +1,61 @@ +import os +from frozendict import frozendict +from functools import lru_cache +from lightllm.common.kernel_config import KernelConfigs +from lightllm.utils.log_utils import init_logger + +logger = init_logger(__name__) + + +class DeepseekV3RotaryKernelConfig(KernelConfigs): + kernel_name: str = "deepseek_v3_rotary_emb_kernel" + + @classmethod + @lru_cache(maxsize=200) + def try_to_get_best_config( + cls, + M: int, + Q_HEAD_NUM: int, + K_HEAD_NUM: int, + HEAD_DIM: int, + dtype: str, + ) -> dict: + key_params = { + "Q_HEAD_NUM": Q_HEAD_NUM, + "K_HEAD_NUM": K_HEAD_NUM, + "HEAD_DIM": HEAD_DIM, + "dtype": str(dtype), + } + key_params = frozendict(key_params) + + finded_config = cls.get_the_config(key_params) + + if finded_config: + config = finded_config[min(finded_config.keys(), key=lambda x: abs(int(x) - M))] + return config + else: + if M <= 256: + config = {"BLOCK_SEQ": 1, "NUM_STAGE": 1, "num_warps": 1, "num_stages": 1, "HEAD_PARALLEL_NUM": 1} + else: + config = {"BLOCK_SEQ": 16, "NUM_STAGE": 1, "num_warps": 1, "num_stages": 1, "HEAD_PARALLEL_NUM": 1} + + return config + + @classmethod + def save_config( + cls, + Q_HEAD_NUM: int, + K_HEAD_NUM: int, + HEAD_DIM: int, + dtype: str, + config_json: dict, + ): + key_params = { + "Q_HEAD_NUM": Q_HEAD_NUM, + "K_HEAD_NUM": K_HEAD_NUM, + "HEAD_DIM": HEAD_DIM, + "dtype": str(dtype), + } + key_params = frozendict(key_params) + + return cls.store_config(key_params, config_json) diff --git 
a/lightllm/models/llama/layer_infer/transformer_layer_infer.py b/lightllm/models/llama/layer_infer/transformer_layer_infer.py index 4b06a75c35..b00215cff8 100755 --- a/lightllm/models/llama/layer_infer/transformer_layer_infer.py +++ b/lightllm/models/llama/layer_infer/transformer_layer_infer.py @@ -16,7 +16,7 @@ from lightllm.models.llama.triton_kernel.token_attention_nopad_reduceV import token_att_fwd2, token_att_fwd2_int8v from lightllm.models.llama.triton_kernel.rmsnorm import rmsnorm_forward from lightllm.models.llama.triton_kernel.rotary_emb import rotary_emb_fwd -from lightllm.models.llama.triton_kernel.silu_and_mul import silu_and_mul_fwd +from lightllm.common.fused_moe.moe_silu_and_mul import silu_and_mul_fwd from lightllm.models.llama.infer_struct import LlamaInferStateInfo from lightllm.models.llama.flashattention_infer_struct import FlashAttentionStateInfo diff --git a/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py index 35eee10b71..a918c6aec0 100644 --- a/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py +++ b/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py @@ -89,6 +89,7 @@ def _init_moe(self): network_config=self.network_config_, layer_num=self.layer_num_, quant_cfg=self.quant_cfg, + num_fused_shared_experts=0, ) elif moe_mode == "EP": self.experts = FusedMoeWeightEP( diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 465f9cc920..0ae258f20e 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -442,6 +442,11 @@ def make_argument_parser() -> argparse.ArgumentParser: action="store_true", help="""Whether to update the redundant expert for deepseekv3 model by online expert used counter.""", ) + parser.add_argument( + "--enable_fused_shared_experts", + action="store_true", + help="""Whether to enable fused shared experts for the deepseekv3 model. Only works when MOE_MODE=TP.""", + ) parser.add_argument( "--mtp_mode", choices=["deepseekv3", None], diff --git a/lightllm/utils/torch_ops_utils.py b/lightllm/utils/torch_ops_utils.py new file mode 100644 index 0000000000..d88402ad25 --- /dev/null +++ b/lightllm/utils/torch_ops_utils.py @@ -0,0 +1,56 @@ +# Adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/utils.py +from torch.library import Library + +from typing import ( + Any, + Callable, + Dict, + Generic, + List, + Optional, + Protocol, + Set, + Tuple, + TypeVar, + Union, +) +import torch + +lightllm_lib = Library("lightllm", "FRAGMENT") # noqa + + +# Some backends use pytorch version < 2.4.0 which doesn't +# support `torch.library.custom_op`. +def supports_custom_op() -> bool: + return hasattr(torch.library, "custom_op") + + +def direct_register_custom_op( + op_name: str, + op_func: Callable, + mutates_args: List[str], + fake_impl: Optional[Callable] = None, + target_lib: Optional[Library] = None, +): + """ + `torch.library.custom_op` can have significant overhead because it + needs to consider complicated dispatching logic. This function + directly registers a custom op and dispatches it to the CUDA backend. + See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5 + for more details. 
+ """ + import torch.library + + if hasattr(torch.library, "infer_schema"): + schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args) + else: + # for pytorch 2.4 + import torch._custom_op.impl + + schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args) + + my_lib = target_lib or lightllm_lib + my_lib.define(op_name + schema_str) + my_lib.impl(op_name, op_func, "CUDA") + if fake_impl is not None: + my_lib._register_fake(op_name, fake_impl) diff --git a/lightllm/utils/tuning_utils.py b/lightllm/utils/tuning_utils.py index f934bc90b0..93f482d976 100644 --- a/lightllm/utils/tuning_utils.py +++ b/lightllm/utils/tuning_utils.py @@ -54,7 +54,7 @@ def mp_tuning(func, args: Dict[str, Any]): best_cost_time = _cost_time best_config = _config - logger.info(f"best config {best_config} best cost time {best_cost_time}") + logger.info(f"args: {args} best config {best_config} best cost time {best_cost_time}") return best_config diff --git a/test/kernel/deepseekv3_rotary_emb_tuning.py b/test/kernel/deepseekv3_rotary_emb_tuning.py new file mode 100644 index 0000000000..fa9d169db7 --- /dev/null +++ b/test/kernel/deepseekv3_rotary_emb_tuning.py @@ -0,0 +1,243 @@ +import os +import torch +import time +import torch.multiprocessing as mp +import itertools +from lightllm.models.deepseek2.triton_kernel.rotary_emb import rotary_emb_fwd +from lightllm.models.deepseek2.triton_kernel.rotary_emb_config import DeepseekV3RotaryKernelConfig +from lightllm.utils.watchdog_utils import Watchdog +from typing import List +from lightllm.utils.log_utils import init_logger + +logger = init_logger(__name__) + + +def set_seed(): + import torch + import random + import numpy as np + + seed = 42 + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + return + + +@torch.no_grad() +def test_kernel( + M: int, + Q_HEAD_NUM: int, + K_HEAD_NUM: int, + HEAD_DIM: int, + dtype: torch.dtype, + test_count: int, + **config, +): + set_seed() + input_tuples = [] + + q = torch.randn((M, Q_HEAD_NUM, HEAD_DIM), device="cuda", dtype=dtype) / 10 + k = torch.randn((M, K_HEAD_NUM, HEAD_DIM), device="cuda", dtype=dtype) / 10 + cos = torch.randn((M, HEAD_DIM // 2), device="cuda", dtype=dtype) + sin = torch.randn((M, HEAD_DIM // 2), device="cuda", dtype=dtype) + + for _ in range(test_count): + input_tuples.append((q.clone(), k.clone(), cos.clone(), sin.clone())) + + # warm_up + rotary_emb_fwd(q=q, k=k, cos=cos, sin=sin, **config) + + graph = torch.cuda.CUDAGraph() + + with torch.cuda.graph(graph): + for index in range(test_count): + q, k, cos, sin = input_tuples[index] + rotary_emb_fwd(q=q, k=k, cos=cos, sin=sin, **config) + + graph.replay() + + torch.cuda.synchronize() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + + cost_time = start_event.elapsed_time(end_event) + + logger.info(str(config)) + logger.info(f"bf16 {M} cost time: {cost_time} ms") + return cost_time + + +def worker( + M: int, + Q_HEAD_NUM: int, + K_HEAD_NUM: int, + HEAD_DIM: int, + dtype: torch.dtype, + test_count: int, + test_configs, + queue, +): + dog = Watchdog(timeout=10) + dog.start() + try: + for index in range(len(test_configs)): + cost_time = test_kernel( + M=M, + Q_HEAD_NUM=Q_HEAD_NUM, + K_HEAD_NUM=K_HEAD_NUM, + HEAD_DIM=HEAD_DIM, + dtype=dtype, + test_count=test_count, + **test_configs[index], 
+ ) + dog.heartbeat() + queue.put(cost_time) # Put result in queue + + except Exception as ex: + logger.error(str(ex)) + logger.exception(str(ex)) + import sys + + sys.exit(-1) + pass + + +def get_test_configs(split_id, split_count): + index = 0 + result = itertools.product([1, 2, 4, 8, 16, 32], [1, 2, 4, 8], [1, 2, 3, 4, 5], [1, 2, 4, 8, 16]) + for BLOCK_SEQ, num_warps, num_stages, HEAD_PARALLEL_NUM in result: + t_config = { + "BLOCK_SEQ": BLOCK_SEQ, + "HEAD_PARALLEL_NUM": HEAD_PARALLEL_NUM, + "num_warps": num_warps, + "num_stages": num_stages, + } + if index % split_count == split_id: + yield t_config + index += 1 + else: + index += 1 + + +def tuning_configs( + device_id: int, # used for multi-GPU mp tuning + device_count: int, + M: int, + Q_HEAD_NUM: int, + K_HEAD_NUM: int, + HEAD_DIM: int, + dtype: torch.dtype, + test_count: int, +): + os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id) + best_config, best_cost_time = None, 10000000 + queue = mp.Queue() + test_configs = [] + for t_config in get_test_configs(device_id, device_count): + test_configs.append(t_config) + if len(test_configs) < 256: + continue + + p = mp.Process( + target=worker, + args=( + M, + Q_HEAD_NUM, + K_HEAD_NUM, + HEAD_DIM, + dtype, + test_count, + test_configs, + queue, + ), + ) + p.start() + p.join() + while len(test_configs) != 0: + try: + cost_time = queue.get_nowait() + logger.info(f"get {test_configs[0]} cost_time: {cost_time}") + if cost_time < best_cost_time: + best_config = test_configs[0] + best_cost_time = cost_time + logger.info(f"cur best : {best_config} {best_cost_time}") + del test_configs[0:1] + except Exception: + del test_configs[0:16] + logger.info(f"cur best : {best_config} {best_cost_time}") + break + + while len(test_configs) != 0: + p = mp.Process( + target=worker, + args=( + M, + Q_HEAD_NUM, + K_HEAD_NUM, + HEAD_DIM, + dtype, + test_count, + test_configs, + queue, + ), + ) + p.start() + p.join() + + while len(test_configs) != 0: + try: + cost_time = queue.get_nowait() + logger.info(f"get {test_configs[0]} cost_time: {cost_time}") + if cost_time < best_cost_time: + best_config = test_configs[0] + best_cost_time = cost_time + logger.info(f"cur best : {best_config} {best_cost_time}") + del test_configs[0:1] + except Exception: + del test_configs[0:16] + logger.info(f"cur best : {best_config} {best_cost_time}") + break + + logger.info(f"M {M} {best_config} best cost: {best_cost_time}") + return best_config, best_cost_time + + +if __name__ == "__main__": + torch.multiprocessing.set_start_method("spawn") + from lightllm.utils.tuning_utils import mp_tuning + + # for deepseekv3 671B + + for q_head_num in [128, 64, 32, 16, 8]: + k_head_num = 1 + head_dim = 64 + dtype = torch.bfloat16 + for m in [1, 128, 256, 1024, 2048, 4096, 8192]: + json_dict = {} + ans = mp_tuning( + tuning_configs, + { + "M": m, + "Q_HEAD_NUM": q_head_num, + "K_HEAD_NUM": k_head_num, + "HEAD_DIM": head_dim, + "dtype": dtype, + "test_count": 20, + }, + ) + json_dict[m] = ans + DeepseekV3RotaryKernelConfig.save_config( + Q_HEAD_NUM=q_head_num, + K_HEAD_NUM=k_head_num, + HEAD_DIM=head_dim, + dtype=str(dtype), + config_json=json_dict, + ) diff --git a/test/kernel/fuse_moe_tuning.py b/test/kernel/fuse_moe_tuning.py index 6e971573ad..69fc4e0c80 100644 --- a/test/kernel/fuse_moe_tuning.py +++ b/test/kernel/fuse_moe_tuning.py @@ -7,6 +7,7 @@ from typing import List from lightllm.utils.log_utils import init_logger from transformers import AutoConfig +import torch.nn.functional as F logger = init_logger(__name__) @@ -61,6 +62,7 @@ def test_kernel( use_fp8_w8a8: 
bool, is_up: bool, block_shape, + num_fused_shared_experts: int, **config, ): set_seed() @@ -68,6 +70,8 @@ def test_kernel( a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1_scale = w2_scale = None + if num_fused_shared_experts > 0: + expert_num += num_fused_shared_experts if use_fp8_w8a8: init_dtype = dtype @@ -91,19 +95,35 @@ def test_kernel( w1 = torch.randn(expert_num, 2 * n, k, dtype=dtype).cuda() w2 = torch.randn(expert_num, k, 2 * n // 2, dtype=dtype).cuda() - rnd_logics = torch.randn(m, expert_num, device="cuda") + rnd_logics = torch.randn(m, expert_num - num_fused_shared_experts, device="cuda") topk_values, topk_ids = torch.topk(rnd_logics, topk, dim=1) - topk_weights = torch.randn((m, topk), device="cuda", dtype=dtype) / 10 - - expert_to_tokens = torch.empty((expert_num, topk * m), dtype=torch.int32, device="cuda") - expert_to_weights = torch.empty((expert_num, topk * m), dtype=torch.float32, device="cuda") + if num_fused_shared_experts > 0: + # When fused shared experts are present, the ids of the shared experts must be padded into topk_ids + pad_topk_ids = ( + torch.arange( + start=expert_num - num_fused_shared_experts, end=expert_num, step=1, dtype=topk_ids.dtype, device="cuda" + ) + .view(1, num_fused_shared_experts) + .repeat(topk_ids.shape[0], 1) + ) + topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1) + topk_weights = torch.randn((m, topk + num_fused_shared_experts), device="cuda", dtype=dtype) / 10 + + expert_to_tokens = torch.empty( + (expert_num, (topk + num_fused_shared_experts) * m), dtype=torch.int32, device="cuda" + ) + expert_to_weights = torch.empty( + (expert_num, (topk + num_fused_shared_experts) * m), dtype=torch.float32, device="cuda" + ) moe_align(topk_ids=topk_ids, out=expert_to_tokens) expert_to_token_num = torch.empty((expert_num,), dtype=torch.int32, device="cuda") - moe_align1(expert_to_tokens, topk_weights, expert_to_weights, expert_to_token_num, topk=topk) + moe_align1( + expert_to_tokens, topk_weights, expert_to_weights, expert_to_token_num, topk=topk + num_fused_shared_experts + ) - out1 = torch.zeros((m * topk, 2 * n), dtype=torch.bfloat16, device="cuda") - down_in = torch.zeros((m * topk, n), dtype=torch.bfloat16, device="cuda") - out2 = torch.zeros((m * topk, k), dtype=torch.bfloat16, device="cuda") + out1 = torch.zeros((m * (topk + num_fused_shared_experts), 2 * n), dtype=torch.bfloat16, device="cuda") + down_in = torch.zeros((m * (topk + num_fused_shared_experts), n), dtype=torch.bfloat16, device="cuda") + out2 = torch.zeros((m * (topk + num_fused_shared_experts), k), dtype=torch.bfloat16, device="cuda") for _ in range(test_count): input_tuples.append( @@ -111,8 +131,8 @@ def test_kernel( a.clone(), w1.clone(), w2.clone(), - w1_scale.clone(), - w2_scale.clone(), + w1_scale.clone() if w1_scale is not None else None, + w2_scale.clone() if w2_scale is not None else None, topk_ids.clone(), topk_weights.clone(), out1.clone(), @@ -135,7 +155,7 @@ def test_kernel( out=out1, mul_routed_weight=False, use_fp8_w8a8=use_fp8_w8a8, - **config, + run_config=config, ) else: grouped_matmul( @@ -151,7 +171,7 @@ def test_kernel( out=out2, mul_routed_weight=True, use_fp8_w8a8=use_fp8_w8a8, - **config, + run_config=config, ) graph = torch.cuda.CUDAGraph() @@ -171,10 +191,9 @@ def test_kernel( expert_to_weights_scale=w1_scale, topk_num=topk, out=out1, - expert_token_limit=2 ** 31 - 1, mul_routed_weight=False, use_fp8_w8a8=use_fp8_w8a8, - **config, + run_config=config, ) else: grouped_matmul( @@ -188,10 +207,9 @@ def test_kernel( expert_to_weights_scale=w2_scale, topk_num=1, out=out2, - expert_token_limit=2 ** 
31 - 1, mul_routed_weight=True, use_fp8_w8a8=use_fp8_w8a8, - **config, + run_config=config, ) graph.replay() @@ -219,6 +237,7 @@ def worker( use_fp8_w8a8: bool, is_up: bool, block_shape, + num_fused_shared_experts: int, test_configs, queue, ): @@ -235,6 +254,7 @@ def worker( use_fp8_w8a8=use_fp8_w8a8, is_up=is_up, block_shape=block_shape, + num_fused_shared_experts=num_fused_shared_experts, **test_configs[index], ) queue.put(cost_time) # Put result in queue @@ -267,12 +287,8 @@ def get_test_configs(split_id, split_count): 4, 8, ]: - for BLOCK_SIZE_M in [ - 16, - 32, - 64, - ]: - for BLOCK_SIZE_N in [64, 128]: + for BLOCK_SIZE_M in [16, 32, 64, 128]: + for BLOCK_SIZE_N in [32, 64, 128]: for BLOCK_SIZE_K in [32, 64, 128]: t_config = { "BLOCK_SIZE_M": BLOCK_SIZE_M, @@ -302,6 +318,7 @@ def tuning_configs( use_fp8_w8a8: bool, is_up: bool, block_shape, + num_fused_shared_experts: int, ): os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id) best_config, best_cost_time = None, 10000000 @@ -325,6 +342,7 @@ def tuning_configs( use_fp8_w8a8, is_up, block_shape, + num_fused_shared_experts, test_configs, queue, ), @@ -359,6 +377,7 @@ def tuning_configs( use_fp8_w8a8, is_up, block_shape, + num_fused_shared_experts, test_configs, queue, ), @@ -393,15 +412,16 @@ def main(args): if config.architectures[0] == "Qwen3MoeForCausalLM": expert_num = config.num_experts topk_num = config.num_experts_per_tok - n = 2 * config.moe_intermediate_size // args.tp + n = config.moe_intermediate_size // args.tp elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]: expert_num = config.n_routed_experts topk_num = config.num_experts_per_tok - n = 2 * config.moe_intermediate_size // args.tp + n = config.moe_intermediate_size // args.tp else: pass hidden_dim = getattr(config, "hidden_size", None) or config.text_config.hidden_size + logger.info(f"n: {n}, hidden_dim: {hidden_dim}") use_fp8_w8a8 = args.use_fp8_w8a8 block_shape = None if hasattr(config, "quantization_config") and "weight_block_size" in config.quantization_config: @@ -424,6 +444,7 @@ def main(args): "use_fp8_w8a8": use_fp8_w8a8, "is_up": True, "block_shape": block_shape, + "num_fused_shared_experts": args.num_fused_shared_experts, }, ) up_dict[m] = ans @@ -431,7 +452,7 @@ N=n * 2, K=hidden_dim, topk_num=topk_num, - expert_num=expert_num, + expert_num=expert_num + args.num_fused_shared_experts, mul_routed_weight=False, use_fp8_w8a8=use_fp8_w8a8, out_dtype=str(torch.bfloat16), ) down_dict = {} - for m in [1, 8, 64, 128, 256, 512, 1024, 4096, 8192]: + for m in [1, 8, 64, 128, 256, 512, 1024, 4096, 8192, 16384, 32768]: ans = mp_tuning( tuning_configs, { @@ -453,6 +474,7 @@ def main(args): "use_fp8_w8a8": use_fp8_w8a8, "is_up": False, "block_shape": block_shape, + "num_fused_shared_experts": args.num_fused_shared_experts, }, ) down_dict[m] = ans @@ -461,7 +483,7 @@ N=hidden_dim, K=n, topk_num=1, - expert_num=expert_num, + expert_num=expert_num + args.num_fused_shared_experts, mul_routed_weight=True, use_fp8_w8a8=use_fp8_w8a8, out_dtype=str(torch.bfloat16), @@ -474,5 +496,6 @@ def main(args): parser.add_argument("--model_dir", type=str, default="deepseek-ai/DeepSeek-R1") parser.add_argument("--tp", type=int, default=8) parser.add_argument("--use_fp8_w8a8", action="store_true") + parser.add_argument("--num_fused_shared_experts", type=int, default=0) args = parser.parse_args() main(args) diff --git a/test/kernel/moe_silu_and_mul_tuning_bf16.py index 
950d30d249..94897e70ee 100644 --- a/test/kernel/moe_silu_and_mul_tuning_bf16.py +++ b/test/kernel/moe_silu_and_mul_tuning_bf16.py @@ -100,12 +100,13 @@ def worker( def get_test_configs(split_id, split_count): index = 0 - result = itertools.product([1, 2, 4, 8, 16, 32], [64, 128, 256, 512, 1024], [1, 2, 4, 8, 16]) - for BLOCK_M, BLOCK_N, num_warps in result: + result = itertools.product([1, 2, 4, 8, 16, 32], [64, 128, 256, 512, 1024], [1, 2, 4, 8, 16], [1, 2, 4, 8, 16]) + for BLOCK_M, BLOCK_N, num_warps, NUM_STAGES in result: t_config = { "BLOCK_M": BLOCK_M, "BLOCK_N": BLOCK_N, "num_warps": num_warps, + "NUM_STAGES": NUM_STAGES, } if index % split_count == split_id: yield t_config @@ -196,7 +197,7 @@ def tuning_configs( from lightllm.utils.tuning_utils import mp_tuning # tuning to get silu and mul - for n in [128, 192, 256, 512, 1024, 1408, 2048, 4096, 8192]: + for n in [128, 192, 256, 512, 1024, 1408, 2048, 2304, 4096, 8192]: json_dict = {} for m in [1, 8, 64, 128, 200, 256, 512, 1024, 2048, 4096, 8192]: ans = mp_tuning( diff --git a/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py b/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py index def7b82c74..3d786e008b 100644 --- a/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py +++ b/unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py @@ -32,20 +32,15 @@ def test_silu_and_mul_masked(expert_num, token_num, hidden_dim): ) true_out_tensor_mid = torch.randn((expert_num, token_num, hidden_dim // 2), dtype=torch.float16, device="cuda") - true_out_tensor = torch.empty((expert_num, token_num, hidden_dim // 2), dtype=torch.float8_e4m3fn, device="cuda") - true_out_scale_tensor = torch.randn( - (expert_num, token_num, hidden_dim // 2 // quant_group_size), dtype=torch.float32, device="cuda" - ) masked_m = [random.randint(0, token_num) for _ in range(expert_num)] masked_m = torch.tensor(masked_m, dtype=torch.int32, device="cuda") silu_and_mul_fwd(in_tensor.view(-1, hidden_dim), true_out_tensor_mid.view(-1, hidden_dim // 2)) - per_token_group_quant_fp8( + true_out_tensor, true_out_scale_tensor = per_token_group_quant_fp8( true_out_tensor_mid.view(-1, hidden_dim // 2), quant_group_size, - true_out_tensor.view(-1, hidden_dim // 2), - true_out_scale_tensor.view(-1, hidden_dim // 2 // quant_group_size), + alloc_func=torch.empty, ) silu_and_mul_masked_post_quant_fwd(in_tensor, out_tensor, out_scale_tensor, quant_group_size, masked_m)
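
For reference, the reworked `_rotary_kernel` applies the usual half-split rotary embedding: assuming `dim_range0`/`dim_range1` cover the first and second halves of `head_dim` (their definition sits above the hunk shown here, as elsewhere in LightLLM's rotary kernels), each pair `(x0, x1)` is rotated to `(x0*cos - x1*sin, x0*sin + x1*cos)`. A minimal PyTorch oracle under that assumption, useful when sanity-checking tuned configs; `rotary_ref` is an illustrative name, not a function from this patch:

import torch

def rotary_ref(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: (total_len, head_num, head_dim); cos/sin: (total_len, head_dim // 2)
    half = x.shape[-1] // 2
    x0, x1 = x[..., :half], x[..., half:]
    c, s = cos[:, None, :], sin[:, None, :]  # broadcast over the head dimension
    return torch.cat([x0 * c - x1 * s, x0 * s + x1 * c], dim=-1)

Since `rotary_emb_fwd` rotates `q` and `k` in place, compare against clones taken before the call, with a tolerance appropriate for bfloat16.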
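
`DeepseekV3RotaryKernelConfig.try_to_get_best_config` selects, among the stored JSON entries (keyed by the token count `M` as a string, as in the H200 config files added by this patch), the one whose key is numerically closest to the runtime `M`. The selection rule in isolation, with a made-up two-entry table:

# Hypothetical table in the on-disk format: {str(M): kernel config}
table = {
    "256": {"BLOCK_SEQ": 1, "HEAD_PARALLEL_NUM": 1, "num_warps": 1, "num_stages": 1},
    "8192": {"BLOCK_SEQ": 1, "HEAD_PARALLEL_NUM": 4, "num_warps": 1, "num_stages": 5},
}

def nearest_config(table: dict, m: int) -> dict:
    # Same rule as try_to_get_best_config: minimize |int(key) - m|.
    return table[min(table.keys(), key=lambda k: abs(int(k) - m))]

assert nearest_config(table, 100)["HEAD_PARALLEL_NUM"] == 1   # closest key: "256"
assert nearest_config(table, 6000)["HEAD_PARALLEL_NUM"] == 4  # closest key: "8192"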
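
A usage sketch for the new `direct_register_custom_op` helper: it infers the op schema from the Python signature, defines the op in the shared `lightllm` library fragment, binds the real implementation to the CUDA dispatch key, and optionally registers a fake (meta) implementation for tracing. The op below, `scale_add`, is a toy example invented for illustration; it is not part of the patch:

import torch
from lightllm.utils.torch_ops_utils import direct_register_custom_op

def _scale_add(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    # Eager CUDA implementation.
    return x + alpha * y

def _scale_add_fake(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    # Shape/dtype-only implementation used by torch.compile tracing.
    return torch.empty_like(x)

direct_register_custom_op(
    op_name="scale_add",
    op_func=_scale_add,
    mutates_args=[],
    fake_impl=_scale_add_fake,
)

# The op is now reachable through the dispatcher (CUDA tensors only,
# since the implementation is bound to the CUDA key):
out = torch.ops.lightllm.scale_add(torch.ones(4, device="cuda"), torch.ones(4, device="cuda"), 0.5)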
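
Both tuning scripts shard their Cartesian search space across tuning processes the same way: candidate `i` is evaluated by the process whose `split_id` equals `i % split_count`. The pattern in isolation (toy search space, hypothetical helper name):

import itertools

def shard_configs(candidates, split_id: int, split_count: int):
    # Round-robin split: each process evaluates every split_count-th candidate.
    for index, cfg in enumerate(candidates):
        if index % split_count == split_id:
            yield cfg

space = itertools.product([1, 2, 4, 8], [1, 2])  # e.g. (BLOCK_SEQ, num_warps)
print(list(shard_configs(space, split_id=0, split_count=2)))  # every other candidate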
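
The shared-expert change in `fuse_moe_tuning.py` models fused shared experts as extra experts appended after the routed ones, whose fixed ids are concatenated onto every token's `topk_ids` so they are always selected. The padding step on its own, with small illustrative sizes rather than the script's real dimensions:

import torch

m, topk = 4, 2
expert_num, num_fused_shared_experts = 8, 2  # routed ids 0..5, shared ids 6..7

topk_ids = torch.randint(0, expert_num - num_fused_shared_experts, (m, topk))
pad_topk_ids = (
    torch.arange(expert_num - num_fused_shared_experts, expert_num, dtype=topk_ids.dtype)
    .view(1, num_fused_shared_experts)
    .repeat(m, 1)
)
topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)  # (m, topk + num_fused_shared_experts)
# Every row now ends with [6, 7]: each token always visits both shared experts.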