Skip to content

Commit c0a1110

Browse files
committed
moe config
1 parent 32651c7 commit c0a1110

4 files changed

+313
-37
lines changed

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.3.1/NVIDIA_H200/grouped_matmul:v1/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,80 +3,108 @@
33
"BLOCK_SIZE_K": 128,
44
"BLOCK_SIZE_M": 16,
55
"BLOCK_SIZE_N": 128,
6-
"GROUP_SIZE_M": 8,
6+
"GROUP_SIZE_M": 32,
7+
"NEED_TRANS": true,
78
"num_stages": 2,
89
"num_warps": 4
910
},
1011
"128": {
1112
"BLOCK_SIZE_K": 128,
1213
"BLOCK_SIZE_M": 16,
1314
"BLOCK_SIZE_N": 128,
14-
"GROUP_SIZE_M": 8,
15+
"GROUP_SIZE_M": 64,
16+
"NEED_TRANS": true,
1517
"num_stages": 2,
1618
"num_warps": 4
1719
},
1820
"131072": {
1921
"BLOCK_SIZE_K": 128,
20-
"BLOCK_SIZE_M": 128,
21-
"BLOCK_SIZE_N": 64,
22-
"GROUP_SIZE_M": 2,
22+
"BLOCK_SIZE_M": 64,
23+
"BLOCK_SIZE_N": 128,
24+
"GROUP_SIZE_M": 16,
25+
"NEED_TRANS": false,
2326
"num_stages": 3,
2427
"num_warps": 4
2528
},
2629
"16384": {
2730
"BLOCK_SIZE_K": 128,
2831
"BLOCK_SIZE_M": 64,
2932
"BLOCK_SIZE_N": 128,
30-
"GROUP_SIZE_M": 8,
33+
"GROUP_SIZE_M": 32,
34+
"NEED_TRANS": false,
3135
"num_stages": 3,
3236
"num_warps": 4
3337
},
3438
"2048": {
3539
"BLOCK_SIZE_K": 128,
3640
"BLOCK_SIZE_M": 16,
3741
"BLOCK_SIZE_N": 128,
38-
"GROUP_SIZE_M": 8,
42+
"GROUP_SIZE_M": 64,
43+
"NEED_TRANS": true,
44+
"num_stages": 2,
45+
"num_warps": 4
46+
},
47+
"256": {
48+
"BLOCK_SIZE_K": 128,
49+
"BLOCK_SIZE_M": 16,
50+
"BLOCK_SIZE_N": 128,
51+
"GROUP_SIZE_M": 64,
52+
"NEED_TRANS": true,
3953
"num_stages": 2,
4054
"num_warps": 4
4155
},
4256
"32768": {
4357
"BLOCK_SIZE_K": 128,
4458
"BLOCK_SIZE_M": 64,
4559
"BLOCK_SIZE_N": 128,
46-
"GROUP_SIZE_M": 8,
60+
"GROUP_SIZE_M": 32,
61+
"NEED_TRANS": false,
4762
"num_stages": 3,
4863
"num_warps": 4
4964
},
5065
"512": {
5166
"BLOCK_SIZE_K": 128,
5267
"BLOCK_SIZE_M": 16,
5368
"BLOCK_SIZE_N": 128,
54-
"GROUP_SIZE_M": 8,
69+
"GROUP_SIZE_M": 64,
70+
"NEED_TRANS": true,
5571
"num_stages": 2,
5672
"num_warps": 4
5773
},
5874
"64": {
5975
"BLOCK_SIZE_K": 128,
6076
"BLOCK_SIZE_M": 16,
6177
"BLOCK_SIZE_N": 128,
62-
"GROUP_SIZE_M": 2,
78+
"GROUP_SIZE_M": 64,
79+
"NEED_TRANS": true,
6380
"num_stages": 2,
6481
"num_warps": 4
6582
},
6683
"8": {
67-
"BLOCK_SIZE_K": 32,
84+
"BLOCK_SIZE_K": 64,
6885
"BLOCK_SIZE_M": 16,
6986
"BLOCK_SIZE_N": 128,
70-
"GROUP_SIZE_M": 1,
87+
"GROUP_SIZE_M": 64,
88+
"NEED_TRANS": true,
7189
"num_stages": 3,
72-
"num_warps": 2
90+
"num_warps": 4
7391
},
74-
"8192": {
92+
"800": {
7593
"BLOCK_SIZE_K": 128,
7694
"BLOCK_SIZE_M": 16,
7795
"BLOCK_SIZE_N": 128,
78-
"GROUP_SIZE_M": 8,
96+
"GROUP_SIZE_M": 32,
97+
"NEED_TRANS": true,
7998
"num_stages": 2,
8099
"num_warps": 4
100+
},
101+
"8192": {
102+
"BLOCK_SIZE_K": 128,
103+
"BLOCK_SIZE_M": 64,
104+
"BLOCK_SIZE_N": 128,
105+
"GROUP_SIZE_M": 64,
106+
"NEED_TRANS": false,
107+
"num_stages": 3,
108+
"num_warps": 4
81109
}
82110
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
{
2+
"1152": {
3+
"BLOCK_SIZE_K": 128,
4+
"BLOCK_SIZE_M": 16,
5+
"BLOCK_SIZE_N": 128,
6+
"GROUP_SIZE_M": 32,
7+
"NEED_TRANS": true,
8+
"num_stages": 2,
9+
"num_warps": 4
10+
},
11+
"144": {
12+
"BLOCK_SIZE_K": 128,
13+
"BLOCK_SIZE_M": 16,
14+
"BLOCK_SIZE_N": 128,
15+
"GROUP_SIZE_M": 64,
16+
"NEED_TRANS": true,
17+
"num_stages": 2,
18+
"num_warps": 4
19+
},
20+
"147456": {
21+
"BLOCK_SIZE_K": 128,
22+
"BLOCK_SIZE_M": 64,
23+
"BLOCK_SIZE_N": 128,
24+
"GROUP_SIZE_M": 16,
25+
"NEED_TRANS": false,
26+
"num_stages": 3,
27+
"num_warps": 4
28+
},
29+
"18432": {
30+
"BLOCK_SIZE_K": 128,
31+
"BLOCK_SIZE_M": 64,
32+
"BLOCK_SIZE_N": 128,
33+
"GROUP_SIZE_M": 32,
34+
"NEED_TRANS": false,
35+
"num_stages": 3,
36+
"num_warps": 4
37+
},
38+
"2304": {
39+
"BLOCK_SIZE_K": 128,
40+
"BLOCK_SIZE_M": 16,
41+
"BLOCK_SIZE_N": 128,
42+
"GROUP_SIZE_M": 64,
43+
"NEED_TRANS": true,
44+
"num_stages": 2,
45+
"num_warps": 4
46+
},
47+
"288": {
48+
"BLOCK_SIZE_K": 128,
49+
"BLOCK_SIZE_M": 16,
50+
"BLOCK_SIZE_N": 128,
51+
"GROUP_SIZE_M": 64,
52+
"NEED_TRANS": true,
53+
"num_stages": 2,
54+
"num_warps": 4
55+
},
56+
"36864": {
57+
"BLOCK_SIZE_K": 128,
58+
"BLOCK_SIZE_M": 64,
59+
"BLOCK_SIZE_N": 128,
60+
"GROUP_SIZE_M": 32,
61+
"NEED_TRANS": false,
62+
"num_stages": 3,
63+
"num_warps": 4
64+
},
65+
"576": {
66+
"BLOCK_SIZE_K": 128,
67+
"BLOCK_SIZE_M": 16,
68+
"BLOCK_SIZE_N": 128,
69+
"GROUP_SIZE_M": 64,
70+
"NEED_TRANS": true,
71+
"num_stages": 2,
72+
"num_warps": 4
73+
},
74+
"72": {
75+
"BLOCK_SIZE_K": 128,
76+
"BLOCK_SIZE_M": 16,
77+
"BLOCK_SIZE_N": 128,
78+
"GROUP_SIZE_M": 64,
79+
"NEED_TRANS": true,
80+
"num_stages": 2,
81+
"num_warps": 4
82+
},
83+
"9": {
84+
"BLOCK_SIZE_K": 64,
85+
"BLOCK_SIZE_M": 16,
86+
"BLOCK_SIZE_N": 128,
87+
"GROUP_SIZE_M": 64,
88+
"NEED_TRANS": true,
89+
"num_stages": 3,
90+
"num_warps": 4
91+
},
92+
"900": {
93+
"BLOCK_SIZE_K": 128,
94+
"BLOCK_SIZE_M": 16,
95+
"BLOCK_SIZE_N": 128,
96+
"GROUP_SIZE_M": 64,
97+
"NEED_TRANS": true,
98+
"num_stages": 2,
99+
"num_warps": 4
100+
},
101+
"9216": {
102+
"BLOCK_SIZE_K": 128,
103+
"BLOCK_SIZE_M": 64,
104+
"BLOCK_SIZE_N": 128,
105+
"GROUP_SIZE_M": 64,
106+
"NEED_TRANS": false,
107+
"num_stages": 3,
108+
"num_warps": 4
109+
}
110+
}

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.3.1/NVIDIA_H200/grouped_matmul:v1/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,38 +4,52 @@
44
"BLOCK_SIZE_M": 16,
55
"BLOCK_SIZE_N": 64,
66
"GROUP_SIZE_M": 1,
7+
"NEED_TRANS": true,
8+
"num_stages": 5,
9+
"num_warps": 4
10+
},
11+
"100": {
12+
"BLOCK_SIZE_K": 128,
13+
"BLOCK_SIZE_M": 16,
14+
"BLOCK_SIZE_N": 128,
15+
"GROUP_SIZE_M": 64,
16+
"NEED_TRANS": true,
717
"num_stages": 5,
818
"num_warps": 4
919
},
1020
"1024": {
1121
"BLOCK_SIZE_K": 128,
12-
"BLOCK_SIZE_M": 32,
22+
"BLOCK_SIZE_M": 64,
1323
"BLOCK_SIZE_N": 128,
14-
"GROUP_SIZE_M": 4,
15-
"num_stages": 3,
24+
"GROUP_SIZE_M": 1,
25+
"NEED_TRANS": false,
26+
"num_stages": 4,
1627
"num_warps": 4
1728
},
1829
"128": {
1930
"BLOCK_SIZE_K": 128,
2031
"BLOCK_SIZE_M": 16,
21-
"BLOCK_SIZE_N": 64,
22-
"GROUP_SIZE_M": 2,
32+
"BLOCK_SIZE_N": 128,
33+
"GROUP_SIZE_M": 32,
34+
"NEED_TRANS": true,
2335
"num_stages": 3,
2436
"num_warps": 4
2537
},
2638
"16": {
2739
"BLOCK_SIZE_K": 128,
2840
"BLOCK_SIZE_M": 16,
29-
"BLOCK_SIZE_N": 64,
30-
"GROUP_SIZE_M": 4,
31-
"num_stages": 3,
41+
"BLOCK_SIZE_N": 128,
42+
"GROUP_SIZE_M": 64,
43+
"NEED_TRANS": true,
44+
"num_stages": 4,
3245
"num_warps": 4
3346
},
3447
"16384": {
3548
"BLOCK_SIZE_K": 128,
36-
"BLOCK_SIZE_M": 128,
37-
"BLOCK_SIZE_N": 64,
38-
"GROUP_SIZE_M": 1,
49+
"BLOCK_SIZE_M": 64,
50+
"BLOCK_SIZE_N": 128,
51+
"GROUP_SIZE_M": 16,
52+
"NEED_TRANS": false,
3953
"num_stages": 4,
4054
"num_warps": 4
4155
},
@@ -44,39 +58,53 @@
4458
"BLOCK_SIZE_M": 64,
4559
"BLOCK_SIZE_N": 128,
4660
"GROUP_SIZE_M": 1,
61+
"NEED_TRANS": false,
4762
"num_stages": 4,
4863
"num_warps": 4
4964
},
5065
"256": {
5166
"BLOCK_SIZE_K": 128,
52-
"BLOCK_SIZE_M": 32,
67+
"BLOCK_SIZE_M": 16,
5368
"BLOCK_SIZE_N": 128,
54-
"GROUP_SIZE_M": 4,
55-
"num_stages": 5,
69+
"GROUP_SIZE_M": 1,
70+
"NEED_TRANS": true,
71+
"num_stages": 3,
72+
"num_warps": 4
73+
},
74+
"32": {
75+
"BLOCK_SIZE_K": 128,
76+
"BLOCK_SIZE_M": 16,
77+
"BLOCK_SIZE_N": 64,
78+
"GROUP_SIZE_M": 32,
79+
"NEED_TRANS": true,
80+
"num_stages": 4,
5681
"num_warps": 4
5782
},
5883
"4096": {
5984
"BLOCK_SIZE_K": 128,
6085
"BLOCK_SIZE_M": 64,
6186
"BLOCK_SIZE_N": 128,
62-
"GROUP_SIZE_M": 1,
63-
"num_stages": 3,
87+
"GROUP_SIZE_M": 16,
88+
"NEED_TRANS": false,
89+
"num_stages": 4,
6490
"num_warps": 4
6591
},
6692
"64": {
6793
"BLOCK_SIZE_K": 128,
6894
"BLOCK_SIZE_M": 16,
69-
"BLOCK_SIZE_N": 64,
70-
"GROUP_SIZE_M": 2,
71-
"num_stages": 5,
95+
"BLOCK_SIZE_N": 128,
96+
"GROUP_SIZE_M": 32,
97+
"NEED_TRANS": true,
98+
"num_stages": 3,
7299
"num_warps": 4
73100
},
74101
"8": {
75102
"BLOCK_SIZE_K": 128,
76-
"BLOCK_SIZE_M": 32,
103+
"BLOCK_SIZE_M": 16,
77104
"BLOCK_SIZE_N": 64,
78-
"GROUP_SIZE_M": 4,
79-
"num_stages": 4,
105+
"GROUP_SIZE_M": 32,
106+
"NEED_TRANS": true,
107+
"num_stages": 5,
80108
"num_warps": 4
81109
}
82110
}

0 commit comments

Comments
 (0)