Skip to content

Commit 4a7afd2

Browse files
Add gemm tuning configs for weekly tuning CI (#662)
* Add tune_gemm configs for perf CI * Add tuning configs for weekly ci * Add more tuning configs * Adding fallbacks * Disable configs which require masking * Add sched hint to fallback configs --------- Co-authored-by: Viacheslav Astrakhantsev <[email protected]>
1 parent d31692c commit 4a7afd2

File tree

2 files changed

+144
-0
lines changed

2 files changed

+144
-0
lines changed
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
- {'M': 4864, 'N': 4096, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
2+
- {'M': 4864, 'N': 4096, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
3+
- {'M': 4864, 'N': 4096, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
4+
- {'M': 4864, 'N': 4096, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 16, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
5+
- {'M': 4864, 'N': 4096, 'K': 4097, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
6+
- {'M': 4864, 'N': 4096, 'K': 4098, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
7+
- {'M': 4864, 'N': 4096, 'K': 4100, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
8+
- {'M': 4864, 'N': 4096, 'K': 4104, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
9+
- {'M': 4864, 'N': 4096, 'K': 4112, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
10+
- {'M': 4864, 'N': 8192, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
11+
- {'M': 4864, 'N': 8192, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
12+
- {'M': 4864, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 16, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
13+
- {'M': 4864, 'N': 8192, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
14+
- {'M': 1024, 'N': 1024, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
15+
- {'M': 1024, 'N': 1024, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
16+
- {'M': 8192, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
17+
- {'M': 8192, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
18+
- {'M': 4864, 'N': 4096, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
19+
- {'M': 4864, 'N': 4096, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
20+
- {'M': 1024, 'N': 8192, 'K': 28672, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
21+
- {'M': 1024, 'N': 8192, 'K': 28672, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 2, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 32, 'instruction_sched_variant': 'none'}
22+
- {'M': 1024, 'N': 28672, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
23+
- {'M': 1024, 'N': 28672, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
24+
- {'M': 1024, 'N': 14336, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
25+
- {'M': 1024, 'N': 14336, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 2, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
26+
- {'M': 1, 'N': 8192, 'K': 28672, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
27+
- {'M': 1, 'N': 8192, 'K': 28672, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 1, 'SPLIT_K': 4, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
28+
- {'M': 1, 'N': 14336, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 2, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
29+
- {'M': 1, 'N': 14336, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
30+
- {'M': 1024, 'N': 16384, 'K': 53248, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
31+
- {'M': 1024, 'N': 16384, 'K': 53248, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
32+
- {'M': 1024, 'N': 53248, 'K': 16384, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
33+
- {'M': 1024, 'N': 53248, 'K': 16384, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
34+
- {'M': 32, 'N': 16384, 'K': 53248, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
35+
- {'M': 32, 'N': 16384, 'K': 53248, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
36+
- {'M': 32, 'N': 53248, 'K': 16384, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
37+
- {'M': 32, 'N': 53248, 'K': 16384, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 32, 'instruction_sched_variant': 'none'}
38+
- {'M': 2, 'N': 3584, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
39+
- {'M': 2, 'N': 3584, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'SPLIT_K': 8, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
40+
- {'M': 2, 'N': 4096, 'K': 1792, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
41+
- {'M': 2, 'N': 4096, 'K': 1792, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'SPLIT_K': 4, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
42+
- {'M': 4096, 'N': 13312, 'K': 8896, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
43+
- {'M': 4096, 'N': 13312, 'K': 8896, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
44+
- {'M': 2048, 'N': 17792, 'K': 13312, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
45+
- {'M': 2048, 'N': 17792, 'K': 13312, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 32, 'instruction_sched_variant': 'none'}
46+
- {'M': 1024, 'N': 13312, 'K': 1664, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
47+
- {'M': 1024, 'N': 13312, 'K': 1664, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
48+
- {'M': 8192, 'N': 1536, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
49+
- {'M': 8192, 'N': 1536, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
50+
- {'M': 8192, 'N': 5120, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
51+
- {'M': 8192, 'N': 5120, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 16, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
52+
- {'M': 8192, 'N': 1024, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
53+
- {'M': 8192, 'N': 1024, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
54+
- {'M': 32768, 'N': 5120, 'K': 512, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
55+
- {'M': 32768, 'N': 5120, 'K': 512, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 16, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
56+
- {'M': 512, 'N': 1536, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
57+
- {'M': 512, 'N': 1536, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 32, 'instruction_sched_variant': 'none'}
58+
- {'M': 512, 'N': 5120, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
59+
- {'M': 512, 'N': 5120, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
60+
- {'M': 512, 'N': 1024, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
61+
- {'M': 512, 'N': 1024, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
62+
- {'M': 2048, 'N': 5120, 'K': 512, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}
63+
- {'M': 2048, 'N': 5120, 'K': 512, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'}

0 commit comments

Comments
 (0)