Skip to content

Commit e0c5114

Browse files
added tuned gemms for r9700
1 parent f174268 commit e0c5114

31 files changed

+2760
-0
lines changed
Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,24 @@
1+
{
2+
"small": {
3+
"BLOCK_SIZE_M": 256,
4+
"BLOCK_SIZE_N": 64,
5+
"BLOCK_SIZE_K": 128,
6+
"GROUP_SIZE_M": 16,
7+
"num_warps": 8,
8+
"num_stages": 2,
9+
"waves_per_eu": 2,
10+
"kpack": 2,
11+
"matrix_instr_nonkdim": 16
12+
},
13+
"large": {
14+
"BLOCK_SIZE_M": 256,
15+
"BLOCK_SIZE_N": 64,
16+
"BLOCK_SIZE_K": 128,
17+
"GROUP_SIZE_M": 1,
18+
"num_warps": 8,
19+
"num_stages": 1,
20+
"waves_per_eu": 2,
21+
"kpack": 2,
22+
"matrix_instr_nonkdim": 16
23+
}
24+
}
Lines changed: 93 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,93 @@
1+
{
2+
"small": {
3+
"BLOCK_SIZE_M": 16,
4+
"BLOCK_SIZE_N": 32,
5+
"BLOCK_SIZE_K": 128,
6+
"GROUP_SIZE_M": 64,
7+
"num_warps": 4,
8+
"num_stages": 2,
9+
"waves_per_eu": 4,
10+
"matrix_instr_nonkdim": 16,
11+
"cache_modifier": "",
12+
"NUM_KSPLIT": 1,
13+
"SPLITK_BLOCK_SIZE": 1024
14+
},
15+
"medium_M32": {
16+
"BLOCK_SIZE_M": 64,
17+
"BLOCK_SIZE_N": 32,
18+
"BLOCK_SIZE_K": 128,
19+
"GROUP_SIZE_M": 1,
20+
"num_warps": 8,
21+
"num_stages": 3,
22+
"waves_per_eu": 8,
23+
"matrix_instr_nonkdim": 16,
24+
"cache_modifier": "",
25+
"NUM_KSPLIT": 1,
26+
"SPLITK_BLOCK_SIZE": 1024
27+
},
28+
"medium_M64": {
29+
"BLOCK_SIZE_M": 64,
30+
"BLOCK_SIZE_N": 32,
31+
"BLOCK_SIZE_K": 128,
32+
"GROUP_SIZE_M": 1,
33+
"num_warps": 8,
34+
"num_stages": 3,
35+
"waves_per_eu": 4,
36+
"matrix_instr_nonkdim": 16,
37+
"cache_modifier": "",
38+
"NUM_KSPLIT": 1,
39+
"SPLITK_BLOCK_SIZE": 1024
40+
},
41+
"medium_M128": {
42+
"BLOCK_SIZE_M": 128,
43+
"BLOCK_SIZE_N": 32,
44+
"BLOCK_SIZE_K": 64,
45+
"GROUP_SIZE_M": 8,
46+
"num_warps": 8,
47+
"num_stages": 2,
48+
"waves_per_eu": 2,
49+
"matrix_instr_nonkdim": 16,
50+
"cache_modifier": "",
51+
"NUM_KSPLIT": 1,
52+
"SPLITK_BLOCK_SIZE": 1024
53+
},
54+
"large": {
55+
"BLOCK_SIZE_M": 64,
56+
"BLOCK_SIZE_N": 64,
57+
"BLOCK_SIZE_K": 128,
58+
"GROUP_SIZE_M": 64,
59+
"num_warps": 8,
60+
"num_stages": 2,
61+
"waves_per_eu": 4,
62+
"matrix_instr_nonkdim": 16,
63+
"cache_modifier": "",
64+
"NUM_KSPLIT": 1,
65+
"SPLITK_BLOCK_SIZE": 1024
66+
},
67+
"xlarge": {
68+
"BLOCK_SIZE_M": 128,
69+
"BLOCK_SIZE_N": 256,
70+
"BLOCK_SIZE_K": 64,
71+
"GROUP_SIZE_M": 64,
72+
"num_warps": 8,
73+
"num_stages": 2,
74+
"waves_per_eu": 4,
75+
"matrix_instr_nonkdim": 16,
76+
"cache_modifier": "",
77+
"NUM_KSPLIT": 1,
78+
"SPLITK_BLOCK_SIZE": 1024
79+
},
80+
"any": {
81+
"BLOCK_SIZE_M": 128,
82+
"BLOCK_SIZE_N": 128,
83+
"BLOCK_SIZE_K": 64,
84+
"GROUP_SIZE_M": 32,
85+
"num_warps": 8,
86+
"num_stages": 2,
87+
"waves_per_eu": 4,
88+
"matrix_instr_nonkdim": 16,
89+
"cache_modifier": "",
90+
"NUM_KSPLIT": 1,
91+
"SPLITK_BLOCK_SIZE": 1024
92+
}
93+
}
Lines changed: 93 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,93 @@
1+
{
2+
"small": {
3+
"BLOCK_SIZE_M": 32,
4+
"BLOCK_SIZE_N": 32,
5+
"BLOCK_SIZE_K": 128,
6+
"GROUP_SIZE_M": 16,
7+
"num_warps": 4,
8+
"num_stages": 5,
9+
"waves_per_eu": 4,
10+
"matrix_instr_nonkdim": 16,
11+
"cache_modifier": "",
12+
"NUM_KSPLIT": 1,
13+
"SPLITK_BLOCK_SIZE": 2048
14+
},
15+
"medium_M32": {
16+
"BLOCK_SIZE_M": 32,
17+
"BLOCK_SIZE_N": 32,
18+
"BLOCK_SIZE_K": 128,
19+
"GROUP_SIZE_M": 32,
20+
"num_warps": 4,
21+
"num_stages": 3,
22+
"waves_per_eu": 8,
23+
"matrix_instr_nonkdim": 16,
24+
"cache_modifier": "",
25+
"NUM_KSPLIT": 1,
26+
"SPLITK_BLOCK_SIZE": 2048
27+
},
28+
"medium_M64": {
29+
"BLOCK_SIZE_M": 64,
30+
"BLOCK_SIZE_N": 32,
31+
"BLOCK_SIZE_K": 128,
32+
"GROUP_SIZE_M": 32,
33+
"num_warps": 8,
34+
"num_stages": 2,
35+
"waves_per_eu": 2,
36+
"matrix_instr_nonkdim": 16,
37+
"cache_modifier": "",
38+
"NUM_KSPLIT": 1,
39+
"SPLITK_BLOCK_SIZE": 2048
40+
},
41+
"medium_M128": {
42+
"BLOCK_SIZE_M": 32,
43+
"BLOCK_SIZE_N": 64,
44+
"BLOCK_SIZE_K": 128,
45+
"GROUP_SIZE_M": 8,
46+
"num_warps": 8,
47+
"num_stages": 2,
48+
"waves_per_eu": 2,
49+
"matrix_instr_nonkdim": 16,
50+
"cache_modifier": "",
51+
"NUM_KSPLIT": 1,
52+
"SPLITK_BLOCK_SIZE": 2048
53+
},
54+
"large": {
55+
"BLOCK_SIZE_M": 64,
56+
"BLOCK_SIZE_N": 64,
57+
"BLOCK_SIZE_K": 128,
58+
"GROUP_SIZE_M": 1,
59+
"num_warps": 8,
60+
"num_stages": 2,
61+
"waves_per_eu": 2,
62+
"matrix_instr_nonkdim": 16,
63+
"cache_modifier": "",
64+
"NUM_KSPLIT": 1,
65+
"SPLITK_BLOCK_SIZE": 2048
66+
},
67+
"xlarge": {
68+
"BLOCK_SIZE_M": 128,
69+
"BLOCK_SIZE_N": 256,
70+
"BLOCK_SIZE_K": 64,
71+
"GROUP_SIZE_M": 1,
72+
"num_warps": 8,
73+
"num_stages": 2,
74+
"waves_per_eu": 4,
75+
"matrix_instr_nonkdim": 16,
76+
"cache_modifier": "",
77+
"NUM_KSPLIT": 1,
78+
"SPLITK_BLOCK_SIZE": 2048
79+
},
80+
"any": {
81+
"BLOCK_SIZE_M": 128,
82+
"BLOCK_SIZE_N": 128,
83+
"BLOCK_SIZE_K": 64,
84+
"GROUP_SIZE_M": 1,
85+
"num_warps": 8,
86+
"num_stages": 2,
87+
"waves_per_eu": 4,
88+
"matrix_instr_nonkdim": 16,
89+
"cache_modifier": "",
90+
"NUM_KSPLIT": 1,
91+
"SPLITK_BLOCK_SIZE": 2048
92+
}
93+
}
Lines changed: 93 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,93 @@
1+
{
2+
"small": {
3+
"BLOCK_SIZE_M": 32,
4+
"BLOCK_SIZE_N": 32,
5+
"BLOCK_SIZE_K": 128,
6+
"GROUP_SIZE_M": 64,
7+
"num_warps": 8,
8+
"num_stages": 2,
9+
"waves_per_eu": 4,
10+
"matrix_instr_nonkdim": 16,
11+
"cache_modifier": "",
12+
"NUM_KSPLIT": 1,
13+
"SPLITK_BLOCK_SIZE": 3072
14+
},
15+
"medium_M32": {
16+
"BLOCK_SIZE_M": 32,
17+
"BLOCK_SIZE_N": 32,
18+
"BLOCK_SIZE_K": 128,
19+
"GROUP_SIZE_M": 1,
20+
"num_warps": 4,
21+
"num_stages": 2,
22+
"waves_per_eu": 2,
23+
"matrix_instr_nonkdim": 16,
24+
"cache_modifier": "",
25+
"NUM_KSPLIT": 1,
26+
"SPLITK_BLOCK_SIZE": 3072
27+
},
28+
"medium_M64": {
29+
"BLOCK_SIZE_M": 64,
30+
"BLOCK_SIZE_N": 32,
31+
"BLOCK_SIZE_K": 128,
32+
"GROUP_SIZE_M": 16,
33+
"num_warps": 8,
34+
"num_stages": 2,
35+
"waves_per_eu": 8,
36+
"matrix_instr_nonkdim": 16,
37+
"cache_modifier": "",
38+
"NUM_KSPLIT": 1,
39+
"SPLITK_BLOCK_SIZE": 3072
40+
},
41+
"medium_M128": {
42+
"BLOCK_SIZE_M": 32,
43+
"BLOCK_SIZE_N": 32,
44+
"BLOCK_SIZE_K": 128,
45+
"GROUP_SIZE_M": 16,
46+
"num_warps": 4,
47+
"num_stages": 2,
48+
"waves_per_eu": 2,
49+
"matrix_instr_nonkdim": 16,
50+
"cache_modifier": "",
51+
"NUM_KSPLIT": 1,
52+
"SPLITK_BLOCK_SIZE": 3072
53+
},
54+
"large": {
55+
"BLOCK_SIZE_M": 64,
56+
"BLOCK_SIZE_N": 64,
57+
"BLOCK_SIZE_K": 128,
58+
"GROUP_SIZE_M": 16,
59+
"num_warps": 8,
60+
"num_stages": 2,
61+
"waves_per_eu": 2,
62+
"matrix_instr_nonkdim": 16,
63+
"cache_modifier": "",
64+
"NUM_KSPLIT": 1,
65+
"SPLITK_BLOCK_SIZE": 3072
66+
},
67+
"xlarge": {
68+
"BLOCK_SIZE_M": 128,
69+
"BLOCK_SIZE_N": 256,
70+
"BLOCK_SIZE_K": 64,
71+
"GROUP_SIZE_M": 8,
72+
"num_warps": 8,
73+
"num_stages": 2,
74+
"waves_per_eu": 4,
75+
"matrix_instr_nonkdim": 16,
76+
"cache_modifier": "",
77+
"NUM_KSPLIT": 1,
78+
"SPLITK_BLOCK_SIZE": 3072
79+
},
80+
"any": {
81+
"BLOCK_SIZE_M": 128,
82+
"BLOCK_SIZE_N": 256,
83+
"BLOCK_SIZE_K": 64,
84+
"GROUP_SIZE_M": 64,
85+
"num_warps": 8,
86+
"num_stages": 2,
87+
"waves_per_eu": 2,
88+
"matrix_instr_nonkdim": 16,
89+
"cache_modifier": "",
90+
"NUM_KSPLIT": 1,
91+
"SPLITK_BLOCK_SIZE": 3072
92+
}
93+
}

0 commit comments

Comments
 (0)