File tree Expand file tree Collapse file tree 4 files changed +313
-37
lines changed
lightllm/common/triton_utils/autotune_kernel_configs/triton_3.3.1/NVIDIA_H200/grouped_matmul:v1 Expand file tree Collapse file tree 4 files changed +313
-37
lines changed Original file line number Diff line number Diff line change 33 "BLOCK_SIZE_K" : 128 ,
44 "BLOCK_SIZE_M" : 16 ,
55 "BLOCK_SIZE_N" : 128 ,
6- "GROUP_SIZE_M" : 8 ,
6+ "GROUP_SIZE_M" : 32 ,
7+ "NEED_TRANS" : true ,
78 "num_stages" : 2 ,
89 "num_warps" : 4
910 },
1011 "128" : {
1112 "BLOCK_SIZE_K" : 128 ,
1213 "BLOCK_SIZE_M" : 16 ,
1314 "BLOCK_SIZE_N" : 128 ,
14- "GROUP_SIZE_M" : 8 ,
15+ "GROUP_SIZE_M" : 64 ,
16+ "NEED_TRANS" : true ,
1517 "num_stages" : 2 ,
1618 "num_warps" : 4
1719 },
1820 "131072" : {
1921 "BLOCK_SIZE_K" : 128 ,
20- "BLOCK_SIZE_M" : 128 ,
21- "BLOCK_SIZE_N" : 64 ,
22- "GROUP_SIZE_M" : 2 ,
22+ "BLOCK_SIZE_M" : 64 ,
23+ "BLOCK_SIZE_N" : 128 ,
24+ "GROUP_SIZE_M" : 16 ,
25+ "NEED_TRANS" : false ,
2326 "num_stages" : 3 ,
2427 "num_warps" : 4
2528 },
2629 "16384" : {
2730 "BLOCK_SIZE_K" : 128 ,
2831 "BLOCK_SIZE_M" : 64 ,
2932 "BLOCK_SIZE_N" : 128 ,
30- "GROUP_SIZE_M" : 8 ,
33+ "GROUP_SIZE_M" : 32 ,
34+ "NEED_TRANS" : false ,
3135 "num_stages" : 3 ,
3236 "num_warps" : 4
3337 },
3438 "2048" : {
3539 "BLOCK_SIZE_K" : 128 ,
3640 "BLOCK_SIZE_M" : 16 ,
3741 "BLOCK_SIZE_N" : 128 ,
38- "GROUP_SIZE_M" : 8 ,
42+ "GROUP_SIZE_M" : 64 ,
43+ "NEED_TRANS" : true ,
44+ "num_stages" : 2 ,
45+ "num_warps" : 4
46+ },
47+ "256" : {
48+ "BLOCK_SIZE_K" : 128 ,
49+ "BLOCK_SIZE_M" : 16 ,
50+ "BLOCK_SIZE_N" : 128 ,
51+ "GROUP_SIZE_M" : 64 ,
52+ "NEED_TRANS" : true ,
3953 "num_stages" : 2 ,
4054 "num_warps" : 4
4155 },
4256 "32768" : {
4357 "BLOCK_SIZE_K" : 128 ,
4458 "BLOCK_SIZE_M" : 64 ,
4559 "BLOCK_SIZE_N" : 128 ,
46- "GROUP_SIZE_M" : 8 ,
60+ "GROUP_SIZE_M" : 32 ,
61+ "NEED_TRANS" : false ,
4762 "num_stages" : 3 ,
4863 "num_warps" : 4
4964 },
5065 "512" : {
5166 "BLOCK_SIZE_K" : 128 ,
5267 "BLOCK_SIZE_M" : 16 ,
5368 "BLOCK_SIZE_N" : 128 ,
54- "GROUP_SIZE_M" : 8 ,
69+ "GROUP_SIZE_M" : 64 ,
70+ "NEED_TRANS" : true ,
5571 "num_stages" : 2 ,
5672 "num_warps" : 4
5773 },
5874 "64" : {
5975 "BLOCK_SIZE_K" : 128 ,
6076 "BLOCK_SIZE_M" : 16 ,
6177 "BLOCK_SIZE_N" : 128 ,
62- "GROUP_SIZE_M" : 2 ,
78+ "GROUP_SIZE_M" : 64 ,
79+ "NEED_TRANS" : true ,
6380 "num_stages" : 2 ,
6481 "num_warps" : 4
6582 },
6683 "8" : {
67- "BLOCK_SIZE_K" : 32 ,
84+ "BLOCK_SIZE_K" : 64 ,
6885 "BLOCK_SIZE_M" : 16 ,
6986 "BLOCK_SIZE_N" : 128 ,
70- "GROUP_SIZE_M" : 1 ,
87+ "GROUP_SIZE_M" : 64 ,
88+ "NEED_TRANS" : true ,
7189 "num_stages" : 3 ,
72- "num_warps" : 2
90+ "num_warps" : 4
7391 },
74- "8192 " : {
92+ "800 " : {
7593 "BLOCK_SIZE_K" : 128 ,
7694 "BLOCK_SIZE_M" : 16 ,
7795 "BLOCK_SIZE_N" : 128 ,
78- "GROUP_SIZE_M" : 8 ,
96+ "GROUP_SIZE_M" : 32 ,
97+ "NEED_TRANS" : true ,
7998 "num_stages" : 2 ,
8099 "num_warps" : 4
100+ },
101+ "8192" : {
102+ "BLOCK_SIZE_K" : 128 ,
103+ "BLOCK_SIZE_M" : 64 ,
104+ "BLOCK_SIZE_N" : 128 ,
105+ "GROUP_SIZE_M" : 64 ,
106+ "NEED_TRANS" : false ,
107+ "num_stages" : 3 ,
108+ "num_warps" : 4
81109 }
82110}
Original file line number Diff line number Diff line change 1+ {
2+ "1152" : {
3+ "BLOCK_SIZE_K" : 128 ,
4+ "BLOCK_SIZE_M" : 16 ,
5+ "BLOCK_SIZE_N" : 128 ,
6+ "GROUP_SIZE_M" : 32 ,
7+ "NEED_TRANS" : true ,
8+ "num_stages" : 2 ,
9+ "num_warps" : 4
10+ },
11+ "144" : {
12+ "BLOCK_SIZE_K" : 128 ,
13+ "BLOCK_SIZE_M" : 16 ,
14+ "BLOCK_SIZE_N" : 128 ,
15+ "GROUP_SIZE_M" : 64 ,
16+ "NEED_TRANS" : true ,
17+ "num_stages" : 2 ,
18+ "num_warps" : 4
19+ },
20+ "147456" : {
21+ "BLOCK_SIZE_K" : 128 ,
22+ "BLOCK_SIZE_M" : 64 ,
23+ "BLOCK_SIZE_N" : 128 ,
24+ "GROUP_SIZE_M" : 16 ,
25+ "NEED_TRANS" : false ,
26+ "num_stages" : 3 ,
27+ "num_warps" : 4
28+ },
29+ "18432" : {
30+ "BLOCK_SIZE_K" : 128 ,
31+ "BLOCK_SIZE_M" : 64 ,
32+ "BLOCK_SIZE_N" : 128 ,
33+ "GROUP_SIZE_M" : 32 ,
34+ "NEED_TRANS" : false ,
35+ "num_stages" : 3 ,
36+ "num_warps" : 4
37+ },
38+ "2304" : {
39+ "BLOCK_SIZE_K" : 128 ,
40+ "BLOCK_SIZE_M" : 16 ,
41+ "BLOCK_SIZE_N" : 128 ,
42+ "GROUP_SIZE_M" : 64 ,
43+ "NEED_TRANS" : true ,
44+ "num_stages" : 2 ,
45+ "num_warps" : 4
46+ },
47+ "288" : {
48+ "BLOCK_SIZE_K" : 128 ,
49+ "BLOCK_SIZE_M" : 16 ,
50+ "BLOCK_SIZE_N" : 128 ,
51+ "GROUP_SIZE_M" : 64 ,
52+ "NEED_TRANS" : true ,
53+ "num_stages" : 2 ,
54+ "num_warps" : 4
55+ },
56+ "36864" : {
57+ "BLOCK_SIZE_K" : 128 ,
58+ "BLOCK_SIZE_M" : 64 ,
59+ "BLOCK_SIZE_N" : 128 ,
60+ "GROUP_SIZE_M" : 32 ,
61+ "NEED_TRANS" : false ,
62+ "num_stages" : 3 ,
63+ "num_warps" : 4
64+ },
65+ "576" : {
66+ "BLOCK_SIZE_K" : 128 ,
67+ "BLOCK_SIZE_M" : 16 ,
68+ "BLOCK_SIZE_N" : 128 ,
69+ "GROUP_SIZE_M" : 64 ,
70+ "NEED_TRANS" : true ,
71+ "num_stages" : 2 ,
72+ "num_warps" : 4
73+ },
74+ "72" : {
75+ "BLOCK_SIZE_K" : 128 ,
76+ "BLOCK_SIZE_M" : 16 ,
77+ "BLOCK_SIZE_N" : 128 ,
78+ "GROUP_SIZE_M" : 64 ,
79+ "NEED_TRANS" : true ,
80+ "num_stages" : 2 ,
81+ "num_warps" : 4
82+ },
83+ "9" : {
84+ "BLOCK_SIZE_K" : 64 ,
85+ "BLOCK_SIZE_M" : 16 ,
86+ "BLOCK_SIZE_N" : 128 ,
87+ "GROUP_SIZE_M" : 64 ,
88+ "NEED_TRANS" : true ,
89+ "num_stages" : 3 ,
90+ "num_warps" : 4
91+ },
92+ "900" : {
93+ "BLOCK_SIZE_K" : 128 ,
94+ "BLOCK_SIZE_M" : 16 ,
95+ "BLOCK_SIZE_N" : 128 ,
96+ "GROUP_SIZE_M" : 64 ,
97+ "NEED_TRANS" : true ,
98+ "num_stages" : 2 ,
99+ "num_warps" : 4
100+ },
101+ "9216" : {
102+ "BLOCK_SIZE_K" : 128 ,
103+ "BLOCK_SIZE_M" : 64 ,
104+ "BLOCK_SIZE_N" : 128 ,
105+ "GROUP_SIZE_M" : 64 ,
106+ "NEED_TRANS" : false ,
107+ "num_stages" : 3 ,
108+ "num_warps" : 4
109+ }
110+ }
Original file line number Diff line number Diff line change 44 "BLOCK_SIZE_M" : 16 ,
55 "BLOCK_SIZE_N" : 64 ,
66 "GROUP_SIZE_M" : 1 ,
7+ "NEED_TRANS" : true ,
8+ "num_stages" : 5 ,
9+ "num_warps" : 4
10+ },
11+ "100" : {
12+ "BLOCK_SIZE_K" : 128 ,
13+ "BLOCK_SIZE_M" : 16 ,
14+ "BLOCK_SIZE_N" : 128 ,
15+ "GROUP_SIZE_M" : 64 ,
16+ "NEED_TRANS" : true ,
717 "num_stages" : 5 ,
818 "num_warps" : 4
919 },
1020 "1024" : {
1121 "BLOCK_SIZE_K" : 128 ,
12- "BLOCK_SIZE_M" : 32 ,
22+ "BLOCK_SIZE_M" : 64 ,
1323 "BLOCK_SIZE_N" : 128 ,
14- "GROUP_SIZE_M" : 4 ,
15- "num_stages" : 3 ,
24+ "GROUP_SIZE_M" : 1 ,
25+ "NEED_TRANS" : false ,
26+ "num_stages" : 4 ,
1627 "num_warps" : 4
1728 },
1829 "128" : {
1930 "BLOCK_SIZE_K" : 128 ,
2031 "BLOCK_SIZE_M" : 16 ,
21- "BLOCK_SIZE_N" : 64 ,
22- "GROUP_SIZE_M" : 2 ,
32+ "BLOCK_SIZE_N" : 128 ,
33+ "GROUP_SIZE_M" : 32 ,
34+ "NEED_TRANS" : true ,
2335 "num_stages" : 3 ,
2436 "num_warps" : 4
2537 },
2638 "16" : {
2739 "BLOCK_SIZE_K" : 128 ,
2840 "BLOCK_SIZE_M" : 16 ,
29- "BLOCK_SIZE_N" : 64 ,
30- "GROUP_SIZE_M" : 4 ,
31- "num_stages" : 3 ,
41+ "BLOCK_SIZE_N" : 128 ,
42+ "GROUP_SIZE_M" : 64 ,
43+ "NEED_TRANS" : true ,
44+ "num_stages" : 4 ,
3245 "num_warps" : 4
3346 },
3447 "16384" : {
3548 "BLOCK_SIZE_K" : 128 ,
36- "BLOCK_SIZE_M" : 128 ,
37- "BLOCK_SIZE_N" : 64 ,
38- "GROUP_SIZE_M" : 1 ,
49+ "BLOCK_SIZE_M" : 64 ,
50+ "BLOCK_SIZE_N" : 128 ,
51+ "GROUP_SIZE_M" : 16 ,
52+ "NEED_TRANS" : false ,
3953 "num_stages" : 4 ,
4054 "num_warps" : 4
4155 },
4458 "BLOCK_SIZE_M" : 64 ,
4559 "BLOCK_SIZE_N" : 128 ,
4660 "GROUP_SIZE_M" : 1 ,
61+ "NEED_TRANS" : false ,
4762 "num_stages" : 4 ,
4863 "num_warps" : 4
4964 },
5065 "256" : {
5166 "BLOCK_SIZE_K" : 128 ,
52- "BLOCK_SIZE_M" : 32 ,
67+ "BLOCK_SIZE_M" : 16 ,
5368 "BLOCK_SIZE_N" : 128 ,
54- "GROUP_SIZE_M" : 4 ,
55- "num_stages" : 5 ,
69+ "GROUP_SIZE_M" : 1 ,
70+ "NEED_TRANS" : true ,
71+ "num_stages" : 3 ,
72+ "num_warps" : 4
73+ },
74+ "32" : {
75+ "BLOCK_SIZE_K" : 128 ,
76+ "BLOCK_SIZE_M" : 16 ,
77+ "BLOCK_SIZE_N" : 64 ,
78+ "GROUP_SIZE_M" : 32 ,
79+ "NEED_TRANS" : true ,
80+ "num_stages" : 4 ,
5681 "num_warps" : 4
5782 },
5883 "4096" : {
5984 "BLOCK_SIZE_K" : 128 ,
6085 "BLOCK_SIZE_M" : 64 ,
6186 "BLOCK_SIZE_N" : 128 ,
62- "GROUP_SIZE_M" : 1 ,
63- "num_stages" : 3 ,
87+ "GROUP_SIZE_M" : 16 ,
88+ "NEED_TRANS" : false ,
89+ "num_stages" : 4 ,
6490 "num_warps" : 4
6591 },
6692 "64" : {
6793 "BLOCK_SIZE_K" : 128 ,
6894 "BLOCK_SIZE_M" : 16 ,
69- "BLOCK_SIZE_N" : 64 ,
70- "GROUP_SIZE_M" : 2 ,
71- "num_stages" : 5 ,
95+ "BLOCK_SIZE_N" : 128 ,
96+ "GROUP_SIZE_M" : 32 ,
97+ "NEED_TRANS" : true ,
98+ "num_stages" : 3 ,
7299 "num_warps" : 4
73100 },
74101 "8" : {
75102 "BLOCK_SIZE_K" : 128 ,
76- "BLOCK_SIZE_M" : 32 ,
103+ "BLOCK_SIZE_M" : 16 ,
77104 "BLOCK_SIZE_N" : 64 ,
78- "GROUP_SIZE_M" : 4 ,
79- "num_stages" : 4 ,
105+ "GROUP_SIZE_M" : 32 ,
106+ "NEED_TRANS" : true ,
107+ "num_stages" : 5 ,
80108 "num_warps" : 4
81109 }
82110}
You can’t perform that action at this time.
0 commit comments