Skip to content

Commit 66ee774

Browse files
rasmith and qli88 authored
[Model] DeepSeek Tunings (ROCm#423)
* fused_moe config for DSv3 on MI300X updated * Add tuning script and post processing script Signed-off-by: Randall Smith <[email protected]> * Add modification to fp8_utils for tuning Signed-off-by: Randall Smith <[email protected]> * update tuning script and add the configs Signed-off-by: Randall Smith <[email protected]> * slightly better tunings Signed-off-by: Randall Smith <[email protected]> * benchmark_moe.py is updated to generate more accurate MoE configs and a specific MoE config for DSv3 is added * Bug in sgl_moe_align_block_size() is fixed by Greg * Generate fp8_w8a8 config for MI300XHF * tunings that don't give garbage output Signed-off-by: Randall Smith <[email protected]> * More accurate tunings Signed-off-by: Randall Smith <[email protected]> * More accurate tunings and reject inaccurate configs Signed-off-by: Randall Smith <[email protected]> * add new tunings Signed-off-by: Randall Smith <[email protected]> * rename tuning script and add benchmark script to use for optimizing blockwise quant Signed-off-by: Randall Smith <[email protected]> * remove white space from file names Signed-off-by: Randall Smith <[email protected]> * remove white space from file names Signed-off-by: Randall Smith <[email protected]> * Remove some unnecessary changes Signed-off-by: Randall Smith <[email protected]> * don't use space in file names Signed-off-by: Randall Smith <[email protected]> * remove XHF tunings Signed-off-by: Randall Smith <[email protected]> * remove OAM from file name Signed-off-by: Randall Smith <[email protected]> * remove OAM from file names Signed-off-by: Randall Smith <[email protected]> * yapf Signed-off-by: Randall Smith <[email protected]> * update config name Signed-off-by: Randall Smith <[email protected]> * remove benchmark_moe.py changes Signed-off-by: Randall Smith <[email protected]> * remove is_contiguous Signed-off-by: Randall Smith <[email protected]> * use more recent fp8_utils.py Signed-off-by: Randall Smith <[email protected]> * 
remove is_contiguous Signed-off-by: Randall Smith <[email protected]> --------- Signed-off-by: Randall Smith <[email protected]> Co-authored-by: qli88 <[email protected]>
1 parent aa63571 commit 66ee774

File tree

26 files changed

+4088
-24
lines changed

26 files changed

+4088
-24
lines changed
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,128 @@
1+
{
2+
"1": {
3+
"BLOCK_SIZE_M": 16,
4+
"BLOCK_SIZE_N": 32,
5+
"BLOCK_SIZE_K": 256,
6+
"GROUP_SIZE_M": 1,
7+
"num_warps": 4,
8+
"num_stages": 2,
9+
"waves_per_eu": 0
10+
},
11+
"2": {
12+
"BLOCK_SIZE_M": 32,
13+
"BLOCK_SIZE_N": 16,
14+
"BLOCK_SIZE_K": 256,
15+
"GROUP_SIZE_M": 1,
16+
"num_warps": 2,
17+
"num_stages": 2,
18+
"waves_per_eu": 0
19+
},
20+
"4": {
21+
"BLOCK_SIZE_M": 16,
22+
"BLOCK_SIZE_N": 64,
23+
"BLOCK_SIZE_K": 256,
24+
"GROUP_SIZE_M": 1,
25+
"num_warps": 4,
26+
"num_stages": 2,
27+
"waves_per_eu": 0
28+
},
29+
"8": {
30+
"BLOCK_SIZE_M": 16,
31+
"BLOCK_SIZE_N": 128,
32+
"BLOCK_SIZE_K": 128,
33+
"GROUP_SIZE_M": 1,
34+
"num_warps": 4,
35+
"num_stages": 2,
36+
"waves_per_eu": 0
37+
},
38+
"16": {
39+
"BLOCK_SIZE_M": 16,
40+
"BLOCK_SIZE_N": 64,
41+
"BLOCK_SIZE_K": 128,
42+
"GROUP_SIZE_M": 4,
43+
"num_warps": 2,
44+
"num_stages": 2,
45+
"waves_per_eu": 0
46+
},
47+
"24": {
48+
"BLOCK_SIZE_M": 16,
49+
"BLOCK_SIZE_N": 128,
50+
"BLOCK_SIZE_K": 128,
51+
"GROUP_SIZE_M": 1,
52+
"num_warps": 4,
53+
"num_stages": 2,
54+
"waves_per_eu": 0
55+
},
56+
"32": {
57+
"BLOCK_SIZE_M": 16,
58+
"BLOCK_SIZE_N": 64,
59+
"BLOCK_SIZE_K": 128,
60+
"GROUP_SIZE_M": 4,
61+
"num_warps": 2,
62+
"num_stages": 2,
63+
"waves_per_eu": 0
64+
},
65+
"48": {
66+
"BLOCK_SIZE_M": 16,
67+
"BLOCK_SIZE_N": 64,
68+
"BLOCK_SIZE_K": 128,
69+
"GROUP_SIZE_M": 4,
70+
"num_warps": 2,
71+
"num_stages": 2,
72+
"waves_per_eu": 0
73+
},
74+
"64": {
75+
"BLOCK_SIZE_M": 16,
76+
"BLOCK_SIZE_N": 64,
77+
"BLOCK_SIZE_K": 128,
78+
"GROUP_SIZE_M": 1,
79+
"num_warps": 2,
80+
"num_stages": 2,
81+
"waves_per_eu": 0
82+
},
83+
"96": {
84+
"BLOCK_SIZE_M": 16,
85+
"BLOCK_SIZE_N": 64,
86+
"BLOCK_SIZE_K": 128,
87+
"GROUP_SIZE_M": 4,
88+
"num_warps": 4,
89+
"num_stages": 2,
90+
"waves_per_eu": 0
91+
},
92+
"128": {
93+
"BLOCK_SIZE_M": 16,
94+
"BLOCK_SIZE_N": 64,
95+
"BLOCK_SIZE_K": 256,
96+
"GROUP_SIZE_M": 1,
97+
"num_warps": 2,
98+
"num_stages": 2,
99+
"waves_per_eu": 0
100+
},
101+
"256": {
102+
"BLOCK_SIZE_M": 16,
103+
"BLOCK_SIZE_N": 64,
104+
"BLOCK_SIZE_K": 128,
105+
"GROUP_SIZE_M": 4,
106+
"num_warps": 4,
107+
"num_stages": 2,
108+
"waves_per_eu": 0
109+
},
110+
"512": {
111+
"BLOCK_SIZE_M": 32,
112+
"BLOCK_SIZE_N": 256,
113+
"BLOCK_SIZE_K": 128,
114+
"GROUP_SIZE_M": 8,
115+
"num_warps": 8,
116+
"num_stages": 2,
117+
"waves_per_eu": 0
118+
},
119+
"1024": {
120+
"BLOCK_SIZE_M": 64,
121+
"BLOCK_SIZE_N": 256,
122+
"BLOCK_SIZE_K": 128,
123+
"GROUP_SIZE_M": 8,
124+
"num_warps": 8,
125+
"num_stages": 2,
126+
"waves_per_eu": 0
127+
}
128+
}
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,164 @@
1+
{
2+
"1": {
3+
"BLOCK_SIZE_K": 128,
4+
"BLOCK_SIZE_M": 32,
5+
"BLOCK_SIZE_N": 32,
6+
"GROUP_SIZE_M": 1,
7+
"kpack": 1,
8+
"matrix_instr_nonkdim": 16,
9+
"num_warps": 4
10+
},
11+
"2": {
12+
"BLOCK_SIZE_K": 128,
13+
"BLOCK_SIZE_M": 32,
14+
"BLOCK_SIZE_N": 32,
15+
"GROUP_SIZE_M": 8,
16+
"kpack": 1,
17+
"matrix_instr_nonkdim": 16,
18+
"num_warps": 4
19+
},
20+
"4": {
21+
"BLOCK_SIZE_K": 128,
22+
"BLOCK_SIZE_M": 32,
23+
"BLOCK_SIZE_N": 32,
24+
"GROUP_SIZE_M": 16,
25+
"kpack": 1,
26+
"matrix_instr_nonkdim": 16,
27+
"num_warps": 4
28+
},
29+
"8": {
30+
"BLOCK_SIZE_K": 128,
31+
"BLOCK_SIZE_M": 32,
32+
"BLOCK_SIZE_N": 32,
33+
"GROUP_SIZE_M": 16,
34+
"kpack": 1,
35+
"matrix_instr_nonkdim": 16,
36+
"num_warps": 4
37+
},
38+
"16": {
39+
"BLOCK_SIZE_K": 128,
40+
"BLOCK_SIZE_M": 32,
41+
"BLOCK_SIZE_N": 32,
42+
"GROUP_SIZE_M": 8,
43+
"kpack": 1,
44+
"matrix_instr_nonkdim": 16,
45+
"num_warps": 4
46+
},
47+
"24": {
48+
"BLOCK_SIZE_K": 128,
49+
"BLOCK_SIZE_M": 32,
50+
"BLOCK_SIZE_N": 32,
51+
"GROUP_SIZE_M": 8,
52+
"kpack": 1,
53+
"matrix_instr_nonkdim": 16,
54+
"num_warps": 4
55+
},
56+
"32": {
57+
"BLOCK_SIZE_K": 128,
58+
"BLOCK_SIZE_M": 32,
59+
"BLOCK_SIZE_N": 32,
60+
"GROUP_SIZE_M": 16,
61+
"kpack": 1,
62+
"matrix_instr_nonkdim": 16,
63+
"num_warps": 4
64+
},
65+
"48": {
66+
"BLOCK_SIZE_K": 128,
67+
"BLOCK_SIZE_M": 32,
68+
"BLOCK_SIZE_N": 32,
69+
"GROUP_SIZE_M": 8,
70+
"kpack": 1,
71+
"matrix_instr_nonkdim": 16,
72+
"num_warps": 4
73+
},
74+
"64": {
75+
"BLOCK_SIZE_K": 128,
76+
"BLOCK_SIZE_M": 32,
77+
"BLOCK_SIZE_N": 32,
78+
"GROUP_SIZE_M": 8,
79+
"kpack": 1,
80+
"matrix_instr_nonkdim": 16,
81+
"num_warps": 4
82+
},
83+
"96": {
84+
"BLOCK_SIZE_K": 128,
85+
"BLOCK_SIZE_M": 32,
86+
"BLOCK_SIZE_N": 32,
87+
"GROUP_SIZE_M": 16,
88+
"kpack": 1,
89+
"matrix_instr_nonkdim": 16,
90+
"num_warps": 4
91+
},
92+
"128": {
93+
"BLOCK_SIZE_K": 128,
94+
"BLOCK_SIZE_M": 32,
95+
"BLOCK_SIZE_N": 32,
96+
"GROUP_SIZE_M": 16,
97+
"kpack": 1,
98+
"matrix_instr_nonkdim": 16,
99+
"num_warps": 4
100+
},
101+
"256": {
102+
"BLOCK_SIZE_K": 128,
103+
"BLOCK_SIZE_M": 32,
104+
"BLOCK_SIZE_N": 32,
105+
"GROUP_SIZE_M": 8,
106+
"kpack": 1,
107+
"matrix_instr_nonkdim": 16,
108+
"num_warps": 4
109+
},
110+
"512": {
111+
"BLOCK_SIZE_K": 128,
112+
"BLOCK_SIZE_M": 64,
113+
"BLOCK_SIZE_N": 32,
114+
"GROUP_SIZE_M": 1,
115+
"kpack": 1,
116+
"matrix_instr_nonkdim": 16,
117+
"num_warps": 4
118+
},
119+
"1024": {
120+
"BLOCK_SIZE_K": 128,
121+
"BLOCK_SIZE_M": 64,
122+
"BLOCK_SIZE_N": 64,
123+
"GROUP_SIZE_M": 8,
124+
"kpack": 1,
125+
"matrix_instr_nonkdim": 16,
126+
"num_warps": 4
127+
},
128+
"1536": {
129+
"BLOCK_SIZE_K": 128,
130+
"BLOCK_SIZE_M": 64,
131+
"BLOCK_SIZE_N": 64,
132+
"GROUP_SIZE_M": 1,
133+
"kpack": 1,
134+
"matrix_instr_nonkdim": 16,
135+
"num_warps": 4
136+
},
137+
"2048": {
138+
"BLOCK_SIZE_K": 128,
139+
"BLOCK_SIZE_M": 64,
140+
"BLOCK_SIZE_N": 64,
141+
"GROUP_SIZE_M": 8,
142+
"kpack": 1,
143+
"matrix_instr_nonkdim": 16,
144+
"num_warps": 4
145+
},
146+
"3072": {
147+
"BLOCK_SIZE_K": 128,
148+
"BLOCK_SIZE_M": 64,
149+
"BLOCK_SIZE_N": 128,
150+
"GROUP_SIZE_M": 32,
151+
"kpack": 1,
152+
"matrix_instr_nonkdim": 16,
153+
"num_warps": 4
154+
},
155+
"4096": {
156+
"BLOCK_SIZE_K": 128,
157+
"BLOCK_SIZE_M": 64,
158+
"BLOCK_SIZE_N": 64,
159+
"GROUP_SIZE_M": 32,
160+
"kpack": 1,
161+
"matrix_instr_nonkdim": 16,
162+
"num_warps": 4
163+
}
164+
}

0 commit comments

Comments
 (0)