Skip to content

Commit fde213e

Browse files
committed
[Kernel][Model] Tune fused_moe configs for Qwen3-30B A3/A3B on H100 (BF16 & FP8); per-(E,dtype) distinct tables
Signed-off-by: Shivam <[email protected]>
1 parent 14699ba commit fde213e

4 files changed

+172
-428
lines changed
Lines changed: 40 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -1,146 +1,82 @@
11
{
22
"1": {
33
"BLOCK_SIZE_M": 64,
4-
"BLOCK_SIZE_N": 128,
5-
"BLOCK_SIZE_K": 256,
6-
"GROUP_SIZE_M": 64,
4+
"BLOCK_SIZE_N": 256,
5+
"BLOCK_SIZE_K": 64,
6+
"GROUP_SIZE_M": 1,
77
"num_warps": 4,
8-
"num_stages": 4
8+
"num_stages": 3
99
},
1010
"2": {
11-
"BLOCK_SIZE_M": 64,
12-
"BLOCK_SIZE_N": 64,
11+
"BLOCK_SIZE_M": 32,
12+
"BLOCK_SIZE_N": 256,
1313
"BLOCK_SIZE_K": 256,
14-
"GROUP_SIZE_M": 1,
14+
"GROUP_SIZE_M": 16,
1515
"num_warps": 4,
16-
"num_stages": 5
16+
"num_stages": 4
1717
},
1818
"4": {
19-
"BLOCK_SIZE_M": 64,
20-
"BLOCK_SIZE_N": 64,
21-
"BLOCK_SIZE_K": 256,
22-
"GROUP_SIZE_M": 1,
23-
"num_warps": 4,
24-
"num_stages": 5
25-
},
26-
"8": {
27-
"BLOCK_SIZE_M": 64,
19+
"BLOCK_SIZE_M": 32,
2820
"BLOCK_SIZE_N": 256,
29-
"BLOCK_SIZE_K": 128,
30-
"GROUP_SIZE_M": 32,
21+
"BLOCK_SIZE_K": 64,
22+
"GROUP_SIZE_M": 16,
3123
"num_warps": 4,
3224
"num_stages": 4
3325
},
34-
"16": {
35-
"BLOCK_SIZE_M": 64,
26+
"8": {
27+
"BLOCK_SIZE_M": 128,
3628
"BLOCK_SIZE_N": 128,
37-
"BLOCK_SIZE_K": 128,
38-
"GROUP_SIZE_M": 32,
29+
"BLOCK_SIZE_K": 256,
30+
"GROUP_SIZE_M": 8,
3931
"num_warps": 4,
40-
"num_stages": 5
32+
"num_stages": 3
4133
},
42-
"24": {
43-
"BLOCK_SIZE_M": 64,
34+
"16": {
35+
"BLOCK_SIZE_M": 128,
4436
"BLOCK_SIZE_N": 64,
45-
"BLOCK_SIZE_K": 256,
46-
"GROUP_SIZE_M": 1,
47-
"num_warps": 4,
37+
"BLOCK_SIZE_K": 64,
38+
"GROUP_SIZE_M": 8,
39+
"num_warps": 8,
4840
"num_stages": 3
4941
},
5042
"32": {
5143
"BLOCK_SIZE_M": 64,
52-
"BLOCK_SIZE_N": 128,
53-
"BLOCK_SIZE_K": 256,
54-
"GROUP_SIZE_M": 1,
55-
"num_warps": 4,
56-
"num_stages": 4
57-
},
58-
"48": {
59-
"BLOCK_SIZE_M": 64,
60-
"BLOCK_SIZE_N": 128,
44+
"BLOCK_SIZE_N": 256,
6145
"BLOCK_SIZE_K": 256,
6246
"GROUP_SIZE_M": 1,
63-
"num_warps": 4,
64-
"num_stages": 4
47+
"num_warps": 8,
48+
"num_stages": 3
6549
},
6650
"64": {
67-
"BLOCK_SIZE_M": 64,
68-
"BLOCK_SIZE_N": 128,
69-
"BLOCK_SIZE_K": 256,
70-
"GROUP_SIZE_M": 1,
71-
"num_warps": 4,
72-
"num_stages": 4
73-
},
74-
"96": {
75-
"BLOCK_SIZE_M": 64,
51+
"BLOCK_SIZE_M": 128,
7652
"BLOCK_SIZE_N": 128,
7753
"BLOCK_SIZE_K": 256,
78-
"GROUP_SIZE_M": 1,
79-
"num_warps": 4,
54+
"GROUP_SIZE_M": 16,
55+
"num_warps": 8,
8056
"num_stages": 4
8157
},
8258
"128": {
83-
"BLOCK_SIZE_M": 64,
84-
"BLOCK_SIZE_N": 128,
85-
"BLOCK_SIZE_K": 256,
86-
"GROUP_SIZE_M": 1,
87-
"num_warps": 4,
88-
"num_stages": 4
89-
},
90-
"256": {
91-
"BLOCK_SIZE_M": 64,
59+
"BLOCK_SIZE_M": 16,
9260
"BLOCK_SIZE_N": 128,
9361
"BLOCK_SIZE_K": 128,
94-
"GROUP_SIZE_M": 64,
95-
"num_warps": 4,
96-
"num_stages": 3
97-
},
98-
"512": {
99-
"BLOCK_SIZE_M": 128,
100-
"BLOCK_SIZE_N": 256,
101-
"BLOCK_SIZE_K": 128,
102-
"GROUP_SIZE_M": 64,
103-
"num_warps": 8,
104-
"num_stages": 4
105-
},
106-
"1024": {
107-
"BLOCK_SIZE_M": 128,
108-
"BLOCK_SIZE_N": 256,
109-
"BLOCK_SIZE_K": 128,
110-
"GROUP_SIZE_M": 32,
111-
"num_warps": 8,
112-
"num_stages": 4
113-
},
114-
"1536": {
115-
"BLOCK_SIZE_M": 128,
116-
"BLOCK_SIZE_N": 256,
117-
"BLOCK_SIZE_K": 128,
118-
"GROUP_SIZE_M": 64,
62+
"GROUP_SIZE_M": 16,
11963
"num_warps": 8,
120-
"num_stages": 4
64+
"num_stages": 2
12165
},
122-
"2048": {
123-
"BLOCK_SIZE_M": 128,
124-
"BLOCK_SIZE_N": 256,
125-
"BLOCK_SIZE_K": 128,
126-
"GROUP_SIZE_M": 64,
127-
"num_warps": 8,
128-
"num_stages": 4
129-
},
130-
"3072": {
131-
"BLOCK_SIZE_M": 128,
132-
"BLOCK_SIZE_N": 256,
66+
"256": {
67+
"BLOCK_SIZE_M": 32,
68+
"BLOCK_SIZE_N": 64,
13369
"BLOCK_SIZE_K": 128,
134-
"GROUP_SIZE_M": 32,
70+
"GROUP_SIZE_M": 8,
13571
"num_warps": 8,
136-
"num_stages": 4
72+
"num_stages": 2
13773
},
138-
"4096": {
74+
"512": {
13975
"BLOCK_SIZE_M": 128,
140-
"BLOCK_SIZE_N": 256,
141-
"BLOCK_SIZE_K": 128,
76+
"BLOCK_SIZE_N": 64,
77+
"BLOCK_SIZE_K": 64,
14278
"GROUP_SIZE_M": 16,
143-
"num_warps": 8,
144-
"num_stages": 4
79+
"num_warps": 4,
80+
"num_stages": 2
14581
}
14682
}
Lines changed: 44 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -1,146 +1,82 @@
11
{
22
"1": {
33
"BLOCK_SIZE_M": 64,
4-
"BLOCK_SIZE_N": 128,
5-
"BLOCK_SIZE_K": 256,
6-
"GROUP_SIZE_M": 64,
7-
"num_warps": 4,
8-
"num_stages": 4
4+
"BLOCK_SIZE_N": 256,
5+
"BLOCK_SIZE_K": 64,
6+
"GROUP_SIZE_M": 1,
7+
"num_warps": 8,
8+
"num_stages": 3
99
},
1010
"2": {
11-
"BLOCK_SIZE_M": 64,
11+
"BLOCK_SIZE_M": 16,
1212
"BLOCK_SIZE_N": 64,
13-
"BLOCK_SIZE_K": 256,
14-
"GROUP_SIZE_M": 1,
15-
"num_warps": 4,
16-
"num_stages": 5
13+
"BLOCK_SIZE_K": 64,
14+
"GROUP_SIZE_M": 16,
15+
"num_warps": 8,
16+
"num_stages": 3
1717
},
1818
"4": {
19-
"BLOCK_SIZE_M": 64,
19+
"BLOCK_SIZE_M": 32,
2020
"BLOCK_SIZE_N": 64,
21-
"BLOCK_SIZE_K": 256,
22-
"GROUP_SIZE_M": 1,
23-
"num_warps": 4,
24-
"num_stages": 5
21+
"BLOCK_SIZE_K": 64,
22+
"GROUP_SIZE_M": 16,
23+
"num_warps": 8,
24+
"num_stages": 4
2525
},
2626
"8": {
27-
"BLOCK_SIZE_M": 64,
27+
"BLOCK_SIZE_M": 16,
2828
"BLOCK_SIZE_N": 256,
2929
"BLOCK_SIZE_K": 128,
30-
"GROUP_SIZE_M": 32,
31-
"num_warps": 4,
32-
"num_stages": 4
33-
},
34-
"16": {
35-
"BLOCK_SIZE_M": 64,
36-
"BLOCK_SIZE_N": 128,
37-
"BLOCK_SIZE_K": 128,
38-
"GROUP_SIZE_M": 32,
39-
"num_warps": 4,
40-
"num_stages": 5
41-
},
42-
"24": {
43-
"BLOCK_SIZE_M": 64,
44-
"BLOCK_SIZE_N": 64,
45-
"BLOCK_SIZE_K": 256,
46-
"GROUP_SIZE_M": 1,
47-
"num_warps": 4,
48-
"num_stages": 3
49-
},
50-
"32": {
51-
"BLOCK_SIZE_M": 64,
52-
"BLOCK_SIZE_N": 128,
53-
"BLOCK_SIZE_K": 256,
54-
"GROUP_SIZE_M": 1,
55-
"num_warps": 4,
56-
"num_stages": 4
57-
},
58-
"48": {
59-
"BLOCK_SIZE_M": 64,
60-
"BLOCK_SIZE_N": 128,
61-
"BLOCK_SIZE_K": 256,
62-
"GROUP_SIZE_M": 1,
63-
"num_warps": 4,
64-
"num_stages": 4
65-
},
66-
"64": {
67-
"BLOCK_SIZE_M": 64,
68-
"BLOCK_SIZE_N": 128,
69-
"BLOCK_SIZE_K": 256,
70-
"GROUP_SIZE_M": 1,
71-
"num_warps": 4,
72-
"num_stages": 4
73-
},
74-
"96": {
75-
"BLOCK_SIZE_M": 64,
76-
"BLOCK_SIZE_N": 128,
77-
"BLOCK_SIZE_K": 256,
78-
"GROUP_SIZE_M": 1,
79-
"num_warps": 4,
80-
"num_stages": 4
81-
},
82-
"128": {
83-
"BLOCK_SIZE_M": 64,
84-
"BLOCK_SIZE_N": 128,
85-
"BLOCK_SIZE_K": 256,
8630
"GROUP_SIZE_M": 1,
87-
"num_warps": 4,
31+
"num_warps": 8,
8832
"num_stages": 4
8933
},
90-
"256": {
91-
"BLOCK_SIZE_M": 64,
34+
"16": {
35+
"BLOCK_SIZE_M": 16,
9236
"BLOCK_SIZE_N": 128,
9337
"BLOCK_SIZE_K": 128,
94-
"GROUP_SIZE_M": 64,
95-
"num_warps": 4,
96-
"num_stages": 3
97-
},
98-
"512": {
99-
"BLOCK_SIZE_M": 128,
100-
"BLOCK_SIZE_N": 256,
101-
"BLOCK_SIZE_K": 128,
102-
"GROUP_SIZE_M": 64,
38+
"GROUP_SIZE_M": 8,
10339
"num_warps": 8,
104-
"num_stages": 4
40+
"num_stages": 2
10541
},
106-
"1024": {
107-
"BLOCK_SIZE_M": 128,
108-
"BLOCK_SIZE_N": 256,
42+
"32": {
43+
"BLOCK_SIZE_M": 16,
44+
"BLOCK_SIZE_N": 64,
10945
"BLOCK_SIZE_K": 128,
110-
"GROUP_SIZE_M": 32,
46+
"GROUP_SIZE_M": 16,
11147
"num_warps": 8,
112-
"num_stages": 4
48+
"num_stages": 2
11349
},
114-
"1536": {
115-
"BLOCK_SIZE_M": 128,
50+
"64": {
51+
"BLOCK_SIZE_M": 32,
11652
"BLOCK_SIZE_N": 256,
11753
"BLOCK_SIZE_K": 128,
118-
"GROUP_SIZE_M": 64,
54+
"GROUP_SIZE_M": 16,
11955
"num_warps": 8,
12056
"num_stages": 4
12157
},
122-
"2048": {
123-
"BLOCK_SIZE_M": 128,
58+
"128": {
59+
"BLOCK_SIZE_M": 16,
12460
"BLOCK_SIZE_N": 256,
125-
"BLOCK_SIZE_K": 128,
126-
"GROUP_SIZE_M": 64,
61+
"BLOCK_SIZE_K": 64,
62+
"GROUP_SIZE_M": 8,
12763
"num_warps": 8,
128-
"num_stages": 4
64+
"num_stages": 3
12965
},
130-
"3072": {
131-
"BLOCK_SIZE_M": 128,
132-
"BLOCK_SIZE_N": 256,
66+
"256": {
67+
"BLOCK_SIZE_M": 16,
68+
"BLOCK_SIZE_N": 64,
13369
"BLOCK_SIZE_K": 128,
134-
"GROUP_SIZE_M": 32,
135-
"num_warps": 8,
136-
"num_stages": 4
70+
"GROUP_SIZE_M": 8,
71+
"num_warps": 4,
72+
"num_stages": 2
13773
},
138-
"4096": {
139-
"BLOCK_SIZE_M": 128,
74+
"512": {
75+
"BLOCK_SIZE_M": 32,
14076
"BLOCK_SIZE_N": 256,
141-
"BLOCK_SIZE_K": 128,
77+
"BLOCK_SIZE_K": 64,
14278
"GROUP_SIZE_M": 16,
14379
"num_warps": 8,
144-
"num_stages": 4
80+
"num_stages": 2
14581
}
14682
}

0 commit comments

Comments
 (0)